Created
November 22, 2014 02:14
-
-
Save clojens/8fc00c0357e06f0ec00f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use English; # things like MATCH | |
# -------------------------------------------------------------- | |
# txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho | |
# the generated phoneme file is suitable for use with MBROLA, | |
# but you have to use the -e option in MBROLA to skip over | |
# spurious unpronounceable phoneme pairs (e.g. caused by English | |
# words in your Dutch text file!). | |
# Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play" | |
# This is GPLed software (open source freeware) by | |
# Eric Auer <[email protected]>, the license is the GNU GPL | |
# version 2 or later, also available as copying.txt in this | |
# directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002) | |
# Please give me some feedback: As I am no native speaker | |
# of Dutch, this txt2phoNL definitely need some improvement! | |
# -------------------------------------------------------------- | |
# new version 14 feb 2002: | |
# - sanitize away illegal phone pairs in a last step, | |
# includes devoicing of consonants before a break. | |
# - intermediate repn uses one char per phoneme. | |
# - simpler rewrite mechanism eats all matched chars | |
# and produces only phones - so the text string is constant. | |
# BUT: restart from " " if the rule input ended in " " ! | |
# - steps: 1. digit/... names | |
# 2. sound pattern rules (preferring long matches, | |
# walking the string and trying all rules per char) | |
# 3. sanitize and get final repn from intermediate one | |
# new version 2003-04-05 by Marc Spoorendonk [email protected] (native Dutch speaker) | |
# - changed to much to mention. Very acceptable translation now. | |
my $XLATEDEBUG = 4; # show all translation rule applications | |
# of at least this size | |
# special one char repn: | |
# _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5, | |
# oi is 6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question | |
# , is comma | |
open(STRING,">/dev/stderr") || die "cannot open debug log\n"; | |
# open(STRING,">nl2pho.log") || die "cannot open debug log\n"; | |
my $foo; | |
$OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every | |
# write/print, do not buffer output | |
$/ = undef; # do not split on line breaks | |
# $/ is $RS, record separator in use English | |
my $text0 = <STDIN>; # read stdin | |
my $text = " "; # other stage (start with a space) | |
my $phones = " "; # phoneme one-char-per-phoneme repn | |
# by the way: a "^>*" remover would be nice for mails... | |
# g vs G vs x: regen [reG@n] goal [goL] gage [xaZe] | |
# where the G (voiced "ch") is a dialect alternative to x ("ch") | |
# and the g only occurs in foreign words. | |
# e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver] pet [pEt] | |
# this is the len: e is long, is ee or e-at-end-of-syll. | |
# E is short, is default, kind of. | |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# first step: reduce the alphabet by spelling out specials | |
# result: a plain [a-z.? ]* string | |
my %special = ("0","null", "=","is", | |
"1","een", "!","!", | |
"2","twee", '"',"aanhaalingsteken", | |
"3","drie", | |
"4","vier", "\$","dollar", | |
"5","vijf", "%","procent", | |
"6","zes", "&","en", | |
"7","zeven", "/","slesh", #phonetically | |
"8","acht", "(","haakje openen", | |
"9","negen", ")","haakje sluiten,", | |
"*","ster", "\\","beckslesh", #phonetically | |
"+","plus", "?","?", | |
"#","hekje", "|","paip", #phonetically | |
".",".", "_","underscoor", #phonetically | |
",",",", "-","", | |
">","groter", ";",";", | |
"<","kleiner",":",":", | |
"^","dakje", "@","aapestaartje", | |
"°","grad", "{","accolade openen", | |
"[","hoekje", "]","hoekje sluiten,", | |
"~","tilde", "}","accolade sluiten," | |
); | |
# use this: punt. koma, vraagteken? | |
# or that: . , ? | |
# the latter has the problem that a . or , or ? | |
# surrounded by spaces just sounds like a space... | |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
#Marc> Prefix with space for easyer matching. | |
$text0 =~ s/^/ /g; | |
$text0 =~ s/$/ /g; | |
$text0 =~ s/^[>]*//g; # un-mailify the text :-) | |
$text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff | |
$text0 =~ s{:-[)]}{,lachend gezicht}g; # smiley | |
$text0 =~ s{:[)]}{,lachend gezicht}g; # smiley | |
$text0 =~ s{:-[(]}{,treurig gezicht}g; # smiley | |
$text0 =~ s{:[(]}{,treurig gezicht}g; # smiley | |
$text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley | |
$text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley | |
$text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie | |
#Marc> betaal -> betaal | |
#Marc> betalen -> betaalen | |
#Marc> It keeps metten, marren, matten as they are. | |
#Marc> betaling -> betaaling | |
# b e t a l i ng bet a a li ng | |
$text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g; | |
#Marc> meten -> meeten maren -> maaren | |
# m e t e n m e e ten | |
$text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g; | |
#Marc> k.n.m.i. -> k n m i | |
$text0 =~ s/([^a-z])([a-z])\./$1$2 /g; | |
$text0 =~ s/([^a-z])([a-z])\./$1$2 /g; | |
#Marc> remove lines from input: "-----------------------------" -> "" | |
$text0 =~ s/[-_=+]{3,}//g; | |
#Marc> www.bla.com -> www punt bla punt com | |
$text0 =~ s/\.([^ \n\t])/punt $1/g; | |
#Marc> translate some numbers. (write a function for this once) | |
$text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g; | |
$text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g; | |
$text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g; | |
$text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g; | |
$text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g; | |
$text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g; | |
$text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g; | |
$text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g; | |
$text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g; | |
$text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g; | |
$text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g; | |
$text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g; | |
$text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g; | |
$text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g; | |
$text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g; | |
$text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g; | |
$text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g; | |
$text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g; | |
$text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g; | |
$text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g; | |
$text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g; | |
$text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g; | |
$text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g; | |
$text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g; | |
$text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g; | |
$text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g; | |
$text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g; | |
$text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g; | |
$text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g; | |
$text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g; | |
print STDERR "Text: $text0"; | |
for my $char (split(//,$text0)) { | |
$char = lc($char); | |
$char = "eu" if ($char =~ /öÖ/); # approximately :-) | |
$char = "ae" if ($char =~ /äÄ/); # could be better | |
$char = "uu" if ($char =~ /Üü/); # should also be for ë | |
if (defined $special{$char}) { | |
$text .= " " unless ($text =~ / $/); | |
$text .= $special{$char} . " "; | |
} elsif ($char =~ /[a-z]/) { | |
$text .= $char; | |
} else { | |
$text .= " " unless ($text =~ / $/); | |
} # simplify all whitespace/linebreak stretches | |
# and other special chars to single spaces | |
} | |
$text .= " " x 5; # end with spaces! | |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# second step: apply phoneme pattern rules (prefer longest | |
# match, eat up all left side apart from trailing space, | |
# produce pure phoneme list) | |
my %five = ( " lijk"," l1k", | |
"lijk ",'l@k ', | |
"elijk","El1k", | |
" bent", " bEnt", | |
"atie ","atsi", #informatie | |
"+size+",5 | |
); | |
my %four = ( "http","ha te te pe ", | |
"html","ha te Em El ", | |
"agen","axEn", | |
"ooie","oi@", # mooie | |
"ooit","oIt", | |
" er "," Er ", | |
" en "," En ", | |
" nl "," EnEl ", | |
" he "," hE ", | |
" ok "," oke ", | |
"hou ","h4w ", | |
"ouch","uS", # douche | |
"oush","uS", # kianoush | |
" pc "," pe se ", | |
"even",'ev@n', | |
"tie ","tsi", #vakantie | |
" chi"," Si", # china | |
"+size+",4 | |
); | |
my %three = ( | |
"aai","5" , | |
"ooi","oi" , | |
"oei","7", | |
"cee","se", | |
"ai ","8" , | |
"oi ","9", | |
"age","aZe", | |
"ch ","x" , | |
"ftp","ef te pe ", | |
"www","we we we ", | |
"htm","ha te em ", | |
"tp:", "te pe ", | |
"mp ", "Em pe ", # mp3 | |
"mb ", "Embe ", # mp3 | |
"eeu", "e2", | |
"en ",'@n', | |
"he ","he", | |
"eij","1", | |
#pronounciation of E before double dissonant | |
"ett","Et", #letter | |
"epp","Ep", | |
"ett","Et", | |
"err","Er", | |
"ekk","Ek", #lekker | |
"emm","Em", | |
"ess","Es", | |
"eff","Ef", | |
"ell","El", | |
"ebb","Eb", | |
"enn","En", | |
#Marc> distinct letters: k.n.m.i a.u.b. | |
" a ", "a", | |
" b ", "be", | |
" c ", "se", | |
" d ", "de", | |
" e ", "e", | |
" f ", "Ef", | |
" g ", "xe", | |
" h ", "ha", | |
" i ", "i", | |
" j ", "ie", | |
" k ", "ka", | |
" l ", "El", | |
" m ", "Em", | |
" n ", "En", | |
" o ", "o", | |
" p ", "pe", | |
" q ", "ky", | |
" r ", "Er", | |
" s ", "Es", | |
" t ", "te", | |
" u ", "y", | |
" v ", "ve", | |
" w ", "we", | |
" x ", "Iks", | |
" y ", "1", | |
" z ", "zEt", | |
"+size+",3 | |
); | |
my %two = ("ie","i" , "oe","u" , "uu","y", | |
"aa","a" , "ee","e" , "oo","o", | |
"eu","2" , "ei","1", | |
"ui","3" , "ou","4" , "ij","1", | |
"sj","S" , "g ","x" , "nj","J", | |
"ce","sE", | |
"l ","l" , "ng","N" , | |
"dt","t" , "ch","x" , "iu","ju", | |
"dl",'d@l', "lf",'l@f', | |
"bb","b" , "dd","d" , "e ",'@', | |
"d ","t" , "hr","r" , "hl","l", | |
"o ","o" , "a ", "a", | |
"yl","1l", | |
"zl","z l", | |
"mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n. | |
"pp", "p", | |
"nn", "n", | |
"rr", "r", | |
"kk", "k", | |
"tt", "t", | |
"+size+",2 | |
); # hr/hl/yl/zl: sane processing | |
# of foreign words | |
my %one = ("a","A", "b","b", "c","k", | |
"d","d", "e",'E', "f","f", | |
"g","x", "h","h", "i","I", | |
"j","j", "k","k", "l","l", | |
"m","m", "n","n", "o","O", | |
"p","p", "q","k", "r","r", | |
"s","s", "t","t", "u","Y", | |
"v","v", "w","w", "x","ks", | |
"y","j", "z","z", " "," ", | |
".",".", "?","?", ",",",", | |
"+size+",1 | |
); # prosody with [ ?.,] is a later step | |
my @todo = (\%five, \%four, \%three, \%two, \%one); | |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
$phones = " "; | |
my $x = 0; # string index | |
while ($x < (length($text)-5)) { | |
my $y = 0; | |
for my $hashref (@todo) { # do l-longest rules first... | |
if ($y != 0) { next; } | |
my $check = substr($text,$x,$hashref->{"+size+"}); | |
if (defined $hashref->{$check}) { | |
$phones .= $hashref->{$check}; | |
$x += $hashref->{"+size+"}; | |
$x-- if (($check =~ / $/) && ($check ne " ")); | |
# skip over matched part, but rewind on " " suffix | |
$y++; | |
print STDERR "Translate: <$check> to /" | |
. $hashref->{$check} . "/\n" | |
if (length($check) >= $XLATEDEBUG); | |
} | |
} | |
if ($y == 0) { | |
print STDERR "Had to translate first char to NIL:\n"; | |
print STDERR "<" . substr($text,$x,10) . "...>\n"; | |
$phones .= " "; | |
$x++; | |
} | |
} | |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# third step: convert to SAMPA alphabet and apply constraints | |
# on phoneme pairings (input: $phones string) | |
my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai", | |
"6","oi", "7","ui", "8","Ai", "9","Oi", | |
" ","_", "?","_", ".","_", ",","_" | |
); | |
my $Pvowel = "aeiouAEIOy2Y13456789"; | |
my $Pdipht = "56789"; | |
my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience | |
my $Pconson = "ptkbdgcfvszSZxGhmn"; | |
my $Pvoice2 = "czZGhJj"; | |
my $Psemi = "GNJL"; | |
my $Pspace = ".?,_ "; | |
# rules: | |
# handled above: no "EY" or "IY" (replace by eY and iY) | |
# handled above: no d before l (add schwa) | |
# handled above: common case of bb and dd (replace by b and d) | |
# handled above: commod case of d_ (devoice to t_) | |
# no voiced/semi doubled (replace by single occurance) | |
# no schwa before OR AFTER dipht (remove schwa) | |
# no voice2 before l, r or j (add schwa) | |
# no voiced before conson (add schwa ; duplication rule first) | |
# no conson before semi (add schwa) | |
# special case of next rule: j-E (replace by j-@) | |
# no dipht before or after vowel/j (insert " ", see above) | |
# no voiced at the end of a word (devoice) | |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
$text = ""; | |
my $adder = ""; # buffer before we really add the phone! | |
my $freq = 200; # freq in Hz, only used at " " for now | |
my $dur = 100; # duration in msec | |
my $ph; # current phoneme | |
my $ph0 = " "; # previous phoneme | |
# the prosody and rhythm are still extremely simple | |
foreach $ph (split(//,$phones)) { | |
if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) { | |
print STDERR "${ph}-$ph: removeone $ph\n"; | |
$adder = ""; # ignore first copy of double phoneme | |
} elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) { | |
# remove the previous schwa | |
# (or just insert a short "h") | |
$adder = ""; | |
print STDERR "\@-$ph: remove \@\n"; | |
} elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) { | |
# remove the current schwa | |
$ph = ""; | |
print STDERR "${ph0}-\@: remove \@\n"; | |
} elsif ( (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/)) | |
|| (($ph0 =~ /[${Pvoiced}]/) && | |
($ph =~ /[${Pconson}]/)) | |
|| (($ph0 =~ /[${Pconson}]/) && | |
($ph =~ /[${Psemi}]/)) | |
) { | |
$adder .= "\@ 50\n"; | |
print STDERR "${ph0}-$ph: insert schwa\n"; | |
} elsif (($ph0 eq "j") && ($ph eq "E")) { | |
print STDERR "j-E: changeto j-\@\n"; | |
$ph = "@"; # modify this part this time... | |
} elsif ( (($ph0 =~ /[${Pdipht}j]/) && | |
($ph =~ /[${Pvowel}j]/)) | |
|| (($ph0 =~ /[${Pvowel}j]/) && | |
($ph =~ /[${Pdipht}j]/)) | |
) { | |
if ($ph0 eq "j") { | |
$adder = "i 100\n"; | |
print STDERR "${ph0}-$ph: changeto i-$ph\n"; | |
} | |
if ($ph eq "j") { | |
print STDERR "${ph0}-$ph: changeto ${ph0}-i\n"; | |
$ph = "i"; | |
} | |
if (($ph0 ne "j") && ($ph ne "j")) { | |
$adder .= "_ 50\n"; | |
print STDERR "${ph0}-$ph: insert break\n"; | |
} | |
} elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) { | |
my $de = $ph0; | |
$de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/; | |
# ptxfsSx IIk | |
print STDERR "${ph0}-_: changeto ${de}-_\n"; | |
$adder = "$de 100\n"; | |
} | |
if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) { | |
print STDERR "j-_: insert \@\n"; | |
$adder .= "\@ 100\n"; | |
} | |
if ($adder) { | |
$text .= $adder; # add possibly corrected recent phoneme | |
$adder =~ s{^([^ ]*).*$} | |
{$1}gm; # reduce to phonemes, multiline | |
die "<$adder> ?\n" if ($adder =~ / /); | |
$adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b- | |
print STRING "${adder}-"; | |
} | |
if ($ph) { | |
$dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100; | |
# longer for long vowels/rmnNJ | |
$freq = 200 if ($ph eq " "); # default freq | |
$freq = 252 if ($ph eq "?"); # go up for questions | |
$freq = 159 if ($ph eq "."); # go down for boundaries | |
$freq = 178 if ($ph eq ","); # go down a bit for commas | |
if ($ph =~ /[${Pspace}]/) { # various breaks | |
$adder = "_ 100 (50 , $freq)\n"; | |
} else { | |
$adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph ); | |
# use 1..2 char phone names | |
$adder .= " $dur\n"; | |
} | |
} else { | |
print STDERR "Skip\n"; | |
$adder = ""; | |
} | |
$ph0 = $ph; | |
} | |
print "$text\n"; | |
print STRING "\n"; | |
close STRING; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment