clojens · November 22, 2014 02:14
diff --git a/nl2pho2.pl b/nl2pho2.pl
 #!/usr/bin/perl -w
 use English; # things like MATCH

 # --------------------------------------------------------------

 # txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho
 # the generated phoneme file is suitable for use with MBROLA,
 # but you have to use the -e option in MBROLA to skip over
 # spurious unpronounceable phoneme pairs (e.g. caused by English
 # words in your Dutch text file!).
 # Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play"

 # This is GPLed software (open source freeware) by
 # Eric Auer <[email protected]>, the license is the GNU GPL
 # version 2 or later, also available as copying.txt in this
 # directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002)

 # Please give me some feedback: As I am no native speaker
 # of Dutch, this txt2phoNL definitely need some improvement!

 # --------------------------------------------------------------

 # new version 14 feb 2002:
 # - sanitize away illegal phone pairs in a last step,
 #   includes devoicing of consonants before a break.
 # - intermediate repn uses one char per phoneme.
 # - simpler rewrite mechanism eats all matched chars
 #   and produces only phones - so the text string is constant.
 #   BUT: restart from " " if the rule input ended in " " !
 # - steps: 1. digit/... names
 #          2. sound pattern rules (preferring long matches,
 #             walking the string and trying all rules per char)
 #          3. sanitize and get final repn from intermediate one

 # new version 2003-04-05 by Marc Spoorendonk [email protected] (native Dutch speaker)
 # - changed to much to mention. Very acceptable translation now.

 my $XLATEDEBUG = 4; # show all translation rule applications
                    # of at least this size

 # special one char repn:
 # _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5,
 # oi is  6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question
 # , is comma
 
 open(STRING,">/dev/stderr") || die "cannot open debug log\n";
 # open(STRING,">nl2pho.log") || die "cannot open debug log\n";
 my $foo;

 $OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every
                       # write/print, do not buffer output
 $/ = undef;       # do not split on line breaks
                  # $/ is $RS, record separator in use English
 my $text0 = <STDIN>;   # read stdin
 my $text = " ";  # other stage (start with a space)
 my $phones = " "; # phoneme one-char-per-phoneme repn

 # by the way: a "^>*" remover would be nice for mails...


 # g vs G vs x: regen [reG@n] goal [goL]  gage [xaZe]
 # where the G (voiced "ch") is a dialect alternative to x ("ch")
 # and the g only occurs in foreign words.

 # e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver]  pet [pEt]
 # this is the len: e is long, is ee or e-at-end-of-syll.
 # E is short, is default, kind of.


 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # first step: reduce the alphabet by spelling out specials
 # result: a plain [a-z.? ]* string

 my %special = ("0","null",   "=","is",
               "1","een",    "!","!",
               "2","twee",   '"',"aanhaalingsteken",
               "3","drie",
               "4","vier",   "\$","dollar",
               "5","vijf",   "%","procent",
               "6","zes",    "&","en",
               "7","zeven",  "/","slesh", #phonetically
               "8","acht",   "(","haakje openen",
               "9","negen",  ")","haakje sluiten,",
               "*","ster",   "\\","beckslesh", #phonetically
               "+","plus",   "?","?",
               "#","hekje",  "|","paip", #phonetically
               ".",".",      "_","underscoor", #phonetically
               ",",",",      "-","",
               ">","groter", ";",";",
               "<","kleiner",":",":",
               "^","dakje",  "@","aapestaartje",
               "°","grad",   "{","accolade openen",
               "[","hoekje", "]","hoekje sluiten,",
               "~","tilde",  "}","accolade sluiten,"
              );

 # use this: punt. koma, vraagteken?
 #  or that: .     ,     ?
 # the latter has the problem that a . or , or ?
 # surrounded by spaces just sounds like a space...

 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 #Marc> Prefix with space for easyer matching.
 $text0 =~ s/^/ /g; 
 $text0 =~ s/$/ /g; 

 $text0 =~ s/^[>]*//g; # un-mailify the text :-)
 $text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff
 $text0 =~ s{:-[)]}{,lachend gezicht}g;    # smiley
 $text0 =~ s{:[)]}{,lachend gezicht}g;    # smiley
 $text0 =~ s{:-[(]}{,treurig gezicht}g;    # smiley
 $text0 =~ s{:[(]}{,treurig gezicht}g;    # smiley
 $text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley
 $text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley

 $text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie

 #Marc>  betaal -> betaal
 #Marc>  betalen -> betaalen
 #Marc> It keeps metten, marren, matten as they are.
 #Marc> betaling -> betaaling 
 #                b      e           t             a          l             i ng bet a a li ng
 $text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g;

 #Marc>  meten -> meeten    maren -> maaren
 #                m       e             t           e        n     m e e ten
 $text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g;

 #Marc> k.n.m.i. -> k n m i
 $text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
 $text0 =~ s/([^a-z])([a-z])\./$1$2 /g;

 #Marc> remove lines from input: "-----------------------------" -> "" 
 $text0 =~ s/[-_=+]{3,}//g;

 #Marc> www.bla.com -> www punt bla punt com
 $text0 =~ s/\.([^ \n\t])/punt $1/g;

 #Marc> translate some numbers. (write a function for this once)
 $text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g;
 $text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g;
 $text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g;
 $text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g;
 $text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g;
 $text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g;
 $text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g;
 $text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g;
 $text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g;
 $text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g;
 $text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g;
 $text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g;
 $text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g;
 $text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g;
 $text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g;
 $text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g;
 $text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g;
 $text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g;
 $text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g;
 $text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g;
 $text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g;
 $text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g;
 $text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g;
 $text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g;
 $text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g;
 $text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g;
 $text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g;
 $text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g;
 $text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g;
 $text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g;

 print STDERR "Text: $text0";


 for my $char (split(//,$text0)) {
  $char = lc($char);
  $char = "eu" if ($char =~ /öÖ/); # approximately :-)
  $char = "ae" if ($char =~ /äÄ/); # could be better
  $char = "uu" if ($char =~ /Üü/); # should also be for &euml;
  if (defined $special{$char}) {
    $text .= " " unless ($text =~ / $/);
    $text .= $special{$char} . " ";
  } elsif ($char =~ /[a-z]/) {
    $text .= $char;
  } else {
    $text .= " " unless ($text =~ / $/);
  }   # simplify all whitespace/linebreak stretches
      # and other special chars to single spaces
 }
 $text .= " " x 5; # end with spaces!

 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # second step: apply phoneme pattern rules (prefer longest
 # match, eat up all left side apart from trailing space,
 # produce pure phoneme list)

 my %five  = (	" lijk"," l1k", 
 		"lijk ",'l@k ',
             	"elijk","El1k",
 		" bent", " bEnt",
 		"atie ","atsi",		#informatie
             	"+size+",5
            );

 my %four  = (	"http","ha te te pe ",
             	"html","ha te Em El ",
 		"agen","axEn",
             	"ooie","oi@",	# mooie
 		"ooit","oIt",
 		" er "," Er ",
 		" en "," En ",
 		" nl "," EnEl ",
 		" he "," hE ",
 		" ok "," oke ",
             	"hou ","h4w ",
 		"ouch","uS",	# douche
 		"oush","uS",	# kianoush
 		" pc "," pe se ",
             	"even",'ev@n',
 		"tie ","tsi",	#vakantie
 		" chi"," Si",	# china
             	"+size+",4
            );

 my %three = (	

 		"aai","5" ,
 		"ooi","oi" ,
 		"oei","7",
 		"cee","se",
             	"ai ","8" ,
 		"oi ","9",
 		"age","aZe",
             	"ch ","x" ,
 		"ftp","ef te pe ",
             	"www","we we we ",
             	"htm","ha te em ",
             	"tp:", "te pe ",
 		"mp ", "Em pe ", # mp3
 		"mb ", "Embe ", # mp3
             	"eeu", "e2",
 		"en ",'@n',
 		"he ","he",
 		"eij","1",
 #pronounciation of E before double dissonant
 		"ett","Et", #letter
 		"epp","Ep",
 		"ett","Et",
 		"err","Er",
 		"ekk","Ek", #lekker
 		"emm","Em",
 		"ess","Es",
 		"eff","Ef",
 		"ell","El",
 		"ebb","Eb",
 		"enn","En",
 #Marc> distinct letters:  k.n.m.i   a.u.b.
 		" a ", "a",
 		" b ", "be",
 		" c ", "se",
 		" d ", "de",
 		" e ", "e",
 		" f ", "Ef",
 		" g ", "xe",
 		" h ", "ha",
 		" i ", "i",
 		" j ", "ie",
 		" k ", "ka",
 		" l ", "El",
 		" m ", "Em",
 		" n ", "En",
 		" o ", "o",
 		" p ", "pe",
 		" q ", "ky",
 		" r ", "Er",
 		" s ", "Es",
 		" t ", "te",
 		" u ", "y",
 		" v ", "ve",
 		" w ", "we",
 		" x ", "Iks",
 		" y ", "1",
 		" z ", "zEt",
             	"+size+",3
            ); 

 my %two   = ("ie","i" , "oe","u" , "uu","y",
             "aa","a" , "ee","e" , "oo","o",
             "eu","2" ,            "ei","1",
             "ui","3" , "ou","4" , "ij","1",
             "sj","S" , "g ","x" , "nj","J",
 	     "ce","sE",
             "l ","l" , "ng","N" ,
             "dt","t" , "ch","x" , "iu","ju",
             "dl",'d@l',           "lf",'l@f',
             "bb","b" , "dd","d" , "e ",'@',
             "d ","t" , "hr","r" , "hl","l",
             "o ","o" , "a ", "a",
 	     "yl","1l",
             "zl","z l",
 	     "mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n.
 	     "pp", "p",
 	     "nn", "n",
 	     "rr", "r",
 	     "kk", "k",
 	     "tt", "t",
             "+size+",2
            ); # hr/hl/yl/zl: sane processing
               # of foreign words

 my %one   = ("a","A", "b","b", "c","k",
             "d","d", "e",'E', "f","f",
             "g","x", "h","h", "i","I",
             "j","j", "k","k", "l","l",
             "m","m", "n","n", "o","O",
             "p","p", "q","k", "r","r",
             "s","s", "t","t", "u","Y",
             "v","v", "w","w", "x","ks",
             "y","j", "z","z", " "," ",
             ".",".", "?","?", ",",",",
             "+size+",1
            ); # prosody with [ ?.,] is a later step

 my @todo = (\%five, \%four, \%three, \%two, \%one);

 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 $phones = " ";
 my $x = 0;     # string index
 while ($x < (length($text)-5)) {
  my $y = 0;
  for my $hashref (@todo) {    # do l-longest rules first...
    if ($y != 0) { next; }
    my $check = substr($text,$x,$hashref->{"+size+"});
    if (defined $hashref->{$check}) {
      $phones .= $hashref->{$check};
      $x += $hashref->{"+size+"};
      $x-- if (($check =~ / $/) && ($check ne " "));
      # skip over matched part, but rewind on " " suffix
      $y++;
      print STDERR "Translate: <$check> to /"
        .   $hashref->{$check} . "/\n"
        if  (length($check) >= $XLATEDEBUG);
    }
  }
  if ($y == 0) {
    print STDERR "Had to translate first char to NIL:\n";
    print STDERR "<" . substr($text,$x,10) . "...>\n";
    $phones .= " ";
    $x++;
  }
 }

 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # third step: convert to SAMPA alphabet and apply constraints
 # on phoneme pairings (input: $phones string)

 my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai",
             "6","oi", "7","ui", "8","Ai", "9","Oi",
             " ","_",  "?","_",  ".","_",  ",","_"
            );

 my $Pvowel  = "aeiouAEIOy2Y13456789";
 my $Pdipht  = "56789";
 my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience
 my $Pconson = "ptkbdgcfvszSZxGhmn";
 my $Pvoice2 = "czZGhJj";
 my $Psemi   = "GNJL";
 my $Pspace  = ".?,_ ";

 # rules:

 # handled above: no "EY" or "IY" (replace by eY and iY)
 # handled above: no d before l (add schwa)
 # handled above: common case of bb and dd (replace by b and d)
 # handled above: commod case of d_ (devoice to t_)

 # no voiced/semi doubled (replace by single occurance)
 # no schwa   before OR AFTER dipht (remove schwa)
 # no voice2  before l, r or j (add schwa)
 # no voiced  before conson (add schwa ; duplication rule first)
 # no conson  before semi (add schwa)
 # special case of next rule: j-E (replace by j-@)
 # no dipht before or after vowel/j (insert " ", see above)
 # no voiced  at the end of a word (devoice)

 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 $text = "";
 my $adder = ""; # buffer before we really add the phone!
 my $freq = 200; # freq in Hz, only used at " " for now
 my $dur = 100;  # duration in msec
 my $ph;         # current phoneme
 my $ph0 = " ";  # previous phoneme
 # the prosody and rhythm are still extremely simple

 foreach $ph (split(//,$phones)) {

  if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) {
    print STDERR "${ph}-$ph: removeone $ph\n";
    $adder = ""; # ignore first copy of double phoneme
  } elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) {
    # remove the previous schwa
    # (or just insert a short "h")
    $adder = "";
    print STDERR "\@-$ph: remove \@\n";
  } elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) {
    # remove the current schwa
    $ph = "";
    print STDERR "${ph0}-\@: remove \@\n";
  } elsif (  (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/))
          || (($ph0 =~ /[${Pvoiced}]/) && 
              ($ph  =~ /[${Pconson}]/))
          || (($ph0 =~ /[${Pconson}]/) &&
              ($ph  =~ /[${Psemi}]/))
          ) {
    $adder .= "\@ 50\n";
    print STDERR "${ph0}-$ph: insert schwa\n";
  } elsif (($ph0 eq "j") && ($ph eq "E")) {
    print STDERR "j-E: changeto j-\@\n";
    $ph = "@"; # modify this part this time...
  } elsif (  (($ph0 =~ /[${Pdipht}j]/) && 
              ($ph  =~ /[${Pvowel}j]/))
          || (($ph0 =~ /[${Pvowel}j]/) && 
              ($ph  =~ /[${Pdipht}j]/))
          ) {
    if ($ph0 eq "j") {
      $adder = "i 100\n";
      print STDERR "${ph0}-$ph: changeto i-$ph\n";
    }
    if ($ph eq "j") {
      print STDERR "${ph0}-$ph: changeto ${ph0}-i\n";
      $ph = "i";
    }
    if (($ph0 ne "j") && ($ph ne "j")) {
      $adder .= "_ 50\n";
      print STDERR "${ph0}-$ph: insert break\n";
    }
  } elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) {
    my $de = $ph0;
    $de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/;
          #  ptxfsSx IIk
    print STDERR "${ph0}-_: changeto ${de}-_\n";
    $adder = "$de 100\n";
  }

  if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) {
    print STDERR "j-_: insert \@\n";
    $adder .= "\@ 100\n";
  }

  if ($adder) {
    $text .= $adder; # add possibly corrected recent phoneme
    $adder =~ s{^([^ ]*).*$}
               {$1}gm;   # reduce to phonemes, multiline
    die "<$adder> ?\n" if ($adder =~ / /);
    $adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b-
    print STRING "${adder}-";
  }

  if ($ph) {
    $dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100;
                                  # longer for long vowels/rmnNJ
    $freq = 200 if ($ph eq " ");  # default freq
    $freq = 252 if ($ph eq "?");  # go up for questions
    $freq = 159 if ($ph eq ".");  # go down for boundaries
    $freq = 178 if ($ph eq ",");  # go down a bit for commas
    if ($ph =~ /[${Pspace}]/) {   # various breaks
      $adder = "_ 100 (50 , $freq)\n";
    } else {
      $adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph );
                                  # use 1..2 char phone names
      $adder .= " $dur\n";
    }
  } else {
    print STDERR "Skip\n";
    $adder = "";
  }

  $ph0 = $ph;

 }

 print "$text\n";

 print STRING "\n";
 close STRING;
	#!/usr/bin/perl -w
	use English; # things like MATCH

	# --------------------------------------------------------------

	# txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho
	# the generated phoneme file is suitable for use with MBROLA,
	# but you have to use the -e option in MBROLA to skip over
	# spurious unpronounceable phoneme pairs (e.g. caused by English
	# words in your Dutch text file!).
	# Hint: Use pipes, e.g. "ls \| txt2phoNL \| mbrola -e - - \| play"

	# This is GPLed software (open source freeware) by
	# Eric Auer <[email protected]>, the license is the GNU GPL
	# version 2 or later, also available as copying.txt in this
	# directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002)

	# Please give me some feedback: As I am no native speaker
	# of Dutch, this txt2phoNL definitely need some improvement!

	# --------------------------------------------------------------

	# new version 14 feb 2002:
	# - sanitize away illegal phone pairs in a last step,
	# includes devoicing of consonants before a break.
	# - intermediate repn uses one char per phoneme.
	# - simpler rewrite mechanism eats all matched chars
	# and produces only phones - so the text string is constant.
	# BUT: restart from " " if the rule input ended in " " !
	# - steps: 1. digit/... names
	# 2. sound pattern rules (preferring long matches,
	# walking the string and trying all rules per char)
	# 3. sanitize and get final repn from intermediate one

	# new version 2003-04-05 by Marc Spoorendonk [email protected] (native Dutch speaker)
	# - changed to much to mention. Very acceptable translation now.

	my $XLATEDEBUG = 4; # show all translation rule applications
	# of at least this size

	# special one char repn:
	# _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5,
	# oi is 6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question
	# , is comma

	open(STRING,">/dev/stderr") \|\| die "cannot open debug log\n";
	# open(STRING,">nl2pho.log") \|\| die "cannot open debug log\n";
	my $foo;

	$OUTPUT_AUTOFLUSH = 1; # (also known as $\|): flush after every
	# write/print, do not buffer output
	$/ = undef; # do not split on line breaks
	# $/ is $RS, record separator in use English
	my $text0 = <STDIN>; # read stdin
	my $text = " "; # other stage (start with a space)
	my $phones = " "; # phoneme one-char-per-phoneme repn

	# by the way: a "^>*" remover would be nice for mails...


	# g vs G vs x: regen [reG@n] goal [goL] gage [xaZe]
	# where the G (voiced "ch") is a dialect alternative to x ("ch")
	# and the g only occurs in foreign words.

	# e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver] pet [pEt]
	# this is the len: e is long, is ee or e-at-end-of-syll.
	# E is short, is default, kind of.


	# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# first step: reduce the alphabet by spelling out specials
	# result: a plain [a-z.? ]* string

	my %special = ("0","null", "=","is",
	"1","een", "!","!",
	"2","twee", '"',"aanhaalingsteken",
	"3","drie",
	"4","vier", "\$","dollar",
	"5","vijf", "%","procent",
	"6","zes", "&","en",
	"7","zeven", "/","slesh", #phonetically
	"8","acht", "(","haakje openen",
	"9","negen", ")","haakje sluiten,",
	"*","ster", "\\","beckslesh", #phonetically
	"+","plus", "?","?",
	"#","hekje", "\|","paip", #phonetically
	".",".", "_","underscoor", #phonetically
	",",",", "-","",
	">","groter", ";",";",
	"<","kleiner",":",":",
	"^","dakje", "@","aapestaartje",
	"°","grad", "{","accolade openen",
	"[","hoekje", "]","hoekje sluiten,",
	"~","tilde", "}","accolade sluiten,"
	);

	# use this: punt. koma, vraagteken?
	# or that: . , ?
	# the latter has the problem that a . or , or ?
	# surrounded by spaces just sounds like a space...

	# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	#Marc> Prefix with space for easyer matching.
	$text0 =~ s/^/ /g;
	$text0 =~ s/$/ /g;

	$text0 =~ s/^[>]*//g; # un-mailify the text :-)
	$text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff
	$text0 =~ s{:-[)]}{,lachend gezicht}g; # smiley
	$text0 =~ s{:[)]}{,lachend gezicht}g; # smiley
	$text0 =~ s{:-[(]}{,treurig gezicht}g; # smiley
	$text0 =~ s{:[(]}{,treurig gezicht}g; # smiley
	$text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley
	$text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley

	$text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie

	#Marc> betaal -> betaal
	#Marc> betalen -> betaalen
	#Marc> It keeps metten, marren, matten as they are.
	#Marc> betaling -> betaaling
	# b e t a l i ng bet a a li ng
	$text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g;

	#Marc> meten -> meeten maren -> maaren
	# m e t e n m e e ten
	$text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g;

	#Marc> k.n.m.i. -> k n m i
	$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
	$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;

	#Marc> remove lines from input: "-----------------------------" -> ""
	$text0 =~ s/[-_=+]{3,}//g;

	#Marc> www.bla.com -> www punt bla punt com
	$text0 =~ s/\.([^ \n\t])/punt $1/g;

	#Marc> translate some numbers. (write a function for this once)
	$text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g;
	$text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g;
	$text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g;
	$text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g;
	$text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g;
	$text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g;
	$text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g;
	$text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g;
	$text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g;
	$text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g;
	$text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g;
	$text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g;
	$text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g;
	$text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g;
	$text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g;
	$text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g;
	$text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g;
	$text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g;
	$text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g;
	$text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g;
	$text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g;
	$text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g;
	$text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g;
	$text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g;
	$text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g;
	$text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g;
	$text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g;
	$text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g;
	$text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g;
	$text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g;

	print STDERR "Text: $text0";


	for my $char (split(//,$text0)) {
	$char = lc($char);
	$char = "eu" if ($char =~ /öÖ/); # approximately :-)
	$char = "ae" if ($char =~ /äÄ/); # could be better
	$char = "uu" if ($char =~ /Üü/); # should also be for ë
	if (defined $special{$char}) {
	$text .= " " unless ($text =~ / $/);
	$text .= $special{$char} . " ";
	} elsif ($char =~ /[a-z]/) {
	$text .= $char;
	} else {
	$text .= " " unless ($text =~ / $/);
	} # simplify all whitespace/linebreak stretches
	# and other special chars to single spaces
	}
	$text .= " " x 5; # end with spaces!

	# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# second step: apply phoneme pattern rules (prefer longest
	# match, eat up all left side apart from trailing space,
	# produce pure phoneme list)

	my %five = ( " lijk"," l1k",
	"lijk ",'l@k ',
	"elijk","El1k",
	" bent", " bEnt",
	"atie ","atsi", #informatie
	"+size+",5
	);

	my %four = ( "http","ha te te pe ",
	"html","ha te Em El ",
	"agen","axEn",
	"ooie","oi@", # mooie
	"ooit","oIt",
	" er "," Er ",
	" en "," En ",
	" nl "," EnEl ",
	" he "," hE ",
	" ok "," oke ",
	"hou ","h4w ",
	"ouch","uS", # douche
	"oush","uS", # kianoush
	" pc "," pe se ",
	"even",'ev@n',
	"tie ","tsi", #vakantie
	" chi"," Si", # china
	"+size+",4
	);

	my %three = (

	"aai","5" ,
	"ooi","oi" ,
	"oei","7",
	"cee","se",
	"ai ","8" ,
	"oi ","9",
	"age","aZe",
	"ch ","x" ,
	"ftp","ef te pe ",
	"www","we we we ",
	"htm","ha te em ",
	"tp:", "te pe ",
	"mp ", "Em pe ", # mp3
	"mb ", "Embe ", # mp3
	"eeu", "e2",
	"en ",'@n',
	"he ","he",
	"eij","1",
	#pronounciation of E before double dissonant
	"ett","Et", #letter
	"epp","Ep",
	"ett","Et",
	"err","Er",
	"ekk","Ek", #lekker
	"emm","Em",
	"ess","Es",
	"eff","Ef",
	"ell","El",
	"ebb","Eb",
	"enn","En",
	#Marc> distinct letters: k.n.m.i a.u.b.
	" a ", "a",
	" b ", "be",
	" c ", "se",
	" d ", "de",
	" e ", "e",
	" f ", "Ef",
	" g ", "xe",
	" h ", "ha",
	" i ", "i",
	" j ", "ie",
	" k ", "ka",
	" l ", "El",
	" m ", "Em",
	" n ", "En",
	" o ", "o",
	" p ", "pe",
	" q ", "ky",
	" r ", "Er",
	" s ", "Es",
	" t ", "te",
	" u ", "y",
	" v ", "ve",
	" w ", "we",
	" x ", "Iks",
	" y ", "1",
	" z ", "zEt",
	"+size+",3
	);

	my %two = ("ie","i" , "oe","u" , "uu","y",
	"aa","a" , "ee","e" , "oo","o",
	"eu","2" , "ei","1",
	"ui","3" , "ou","4" , "ij","1",
	"sj","S" , "g ","x" , "nj","J",
	"ce","sE",
	"l ","l" , "ng","N" ,
	"dt","t" , "ch","x" , "iu","ju",
	"dl",'d@l', "lf",'l@f',
	"bb","b" , "dd","d" , "e ",'@',
	"d ","t" , "hr","r" , "hl","l",
	"o ","o" , "a ", "a",
	"yl","1l",
	"zl","z l",
	"mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n.
	"pp", "p",
	"nn", "n",
	"rr", "r",
	"kk", "k",
	"tt", "t",
	"+size+",2
	); # hr/hl/yl/zl: sane processing
	# of foreign words

	my %one = ("a","A", "b","b", "c","k",
	"d","d", "e",'E', "f","f",
	"g","x", "h","h", "i","I",
	"j","j", "k","k", "l","l",
	"m","m", "n","n", "o","O",
	"p","p", "q","k", "r","r",
	"s","s", "t","t", "u","Y",
	"v","v", "w","w", "x","ks",
	"y","j", "z","z", " "," ",
	".",".", "?","?", ",",",",
	"+size+",1
	); # prosody with [ ?.,] is a later step

	my @todo = (\%five, \%four, \%three, \%two, \%one);

	# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	$phones = " ";
	my $x = 0; # string index
	while ($x < (length($text)-5)) {
	my $y = 0;
	for my $hashref (@todo) { # do l-longest rules first...
	if ($y != 0) { next; }
	my $check = substr($text,$x,$hashref->{"+size+"});
	if (defined $hashref->{$check}) {
	$phones .= $hashref->{$check};
	$x += $hashref->{"+size+"};
	$x-- if (($check =~ / $/) && ($check ne " "));
	# skip over matched part, but rewind on " " suffix
	$y++;
	print STDERR "Translate: <$check> to /"
	. $hashref->{$check} . "/\n"
	if (length($check) >= $XLATEDEBUG);
	}
	}
	if ($y == 0) {
	print STDERR "Had to translate first char to NIL:\n";
	print STDERR "<" . substr($text,$x,10) . "...>\n";
	$phones .= " ";
	$x++;
	}
	}

	# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# third step: convert to SAMPA alphabet and apply constraints
	# on phoneme pairings (input: $phones string)

	my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai",
	"6","oi", "7","ui", "8","Ai", "9","Oi",
	" ","_", "?","_", ".","_", ",","_"
	);

	my $Pvowel = "aeiouAEIOy2Y13456789";
	my $Pdipht = "56789";
	my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience
	my $Pconson = "ptkbdgcfvszSZxGhmn";
	my $Pvoice2 = "czZGhJj";
	my $Psemi = "GNJL";
	my $Pspace = ".?,_ ";

	# rules:

	# handled above: no "EY" or "IY" (replace by eY and iY)
	# handled above: no d before l (add schwa)
	# handled above: common case of bb and dd (replace by b and d)
	# handled above: commod case of d_ (devoice to t_)

	# no voiced/semi doubled (replace by single occurance)
	# no schwa before OR AFTER dipht (remove schwa)
	# no voice2 before l, r or j (add schwa)
	# no voiced before conson (add schwa ; duplication rule first)
	# no conson before semi (add schwa)
	# special case of next rule: j-E (replace by j-@)
	# no dipht before or after vowel/j (insert " ", see above)
	# no voiced at the end of a word (devoice)

	# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	$text = "";
	my $adder = ""; # buffer before we really add the phone!
	my $freq = 200; # freq in Hz, only used at " " for now
	my $dur = 100; # duration in msec
	my $ph; # current phoneme
	my $ph0 = " "; # previous phoneme
	# the prosody and rhythm are still extremely simple

	foreach $ph (split(//,$phones)) {

	if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) {
	print STDERR "${ph}-$ph: removeone $ph\n";
	$adder = ""; # ignore first copy of double phoneme
	} elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) {
	# remove the previous schwa
	# (or just insert a short "h")
	$adder = "";
	print STDERR "\@-$ph: remove \@\n";
	} elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) {
	# remove the current schwa
	$ph = "";
	print STDERR "${ph0}-\@: remove \@\n";
	} elsif ( (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/))
	\|\| (($ph0 =~ /[${Pvoiced}]/) &&
	($ph =~ /[${Pconson}]/))
	\|\| (($ph0 =~ /[${Pconson}]/) &&
	($ph =~ /[${Psemi}]/))
	) {
	$adder .= "\@ 50\n";
	print STDERR "${ph0}-$ph: insert schwa\n";
	} elsif (($ph0 eq "j") && ($ph eq "E")) {
	print STDERR "j-E: changeto j-\@\n";
	$ph = "@"; # modify this part this time...
	} elsif ( (($ph0 =~ /[${Pdipht}j]/) &&
	($ph =~ /[${Pvowel}j]/))
	\|\| (($ph0 =~ /[${Pvowel}j]/) &&
	($ph =~ /[${Pdipht}j]/))
	) {
	if ($ph0 eq "j") {
	$adder = "i 100\n";
	print STDERR "${ph0}-$ph: changeto i-$ph\n";
	}
	if ($ph eq "j") {
	print STDERR "${ph0}-$ph: changeto ${ph0}-i\n";
	$ph = "i";
	}
	if (($ph0 ne "j") && ($ph ne "j")) {
	$adder .= "_ 50\n";
	print STDERR "${ph0}-$ph: insert break\n";
	}
	} elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) {
	my $de = $ph0;
	$de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/;
	# ptxfsSx IIk
	print STDERR "${ph0}-_: changeto ${de}-_\n";
	$adder = "$de 100\n";
	}

	if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) {
	print STDERR "j-_: insert \@\n";
	$adder .= "\@ 100\n";
	}

	if ($adder) {
	$text .= $adder; # add possibly corrected recent phoneme
	$adder =~ s{^([^ ]).$}
	{$1}gm; # reduce to phonemes, multiline
	die "<$adder> ?\n" if ($adder =~ / /);
	$adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b-
	print STRING "${adder}-";
	}

	if ($ph) {
	$dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100;
	# longer for long vowels/rmnNJ
	$freq = 200 if ($ph eq " "); # default freq
	$freq = 252 if ($ph eq "?"); # go up for questions
	$freq = 159 if ($ph eq "."); # go down for boundaries
	$freq = 178 if ($ph eq ","); # go down a bit for commas
	if ($ph =~ /[${Pspace}]/) { # various breaks
	$adder = "_ 100 (50 , $freq)\n";
	} else {
	$adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph );
	# use 1..2 char phone names
	$adder .= " $dur\n";
	}
	} else {
	print STDERR "Skip\n";
	$adder = "";
	}

	$ph0 = $ph;

	}

	print "$text\n";

	print STRING "\n";
	close STRING;
No results found