Last active
May 22, 2018 16:07
-
-
Save nagadomi/2b8131ed5f50e375f306b146f8840d11 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import subprocess | |
import re | |
class Sentence():
    """Ordered list of aligned phone segments.

    Phones are appended with `add` and can be iterated directly.
    `str(sentence)` renders a consonant+vowel grouping of the phones, and
    `simplify` produces a vowel-only copy.
    """

    class Phone():
        """A single aligned phone: a frame range plus its label."""

        def __init__(self, begin_frame, end_frame, phone):
            self.begin_frame = begin_frame
            self.end_frame = end_frame
            self.phone = phone

        def is_silence(self):
            """Return True for the labels this class treats as silence.

            Note that "N" is included in this set, so it is skipped wherever
            silences are skipped.
            """
            return self.phone in ("silE", "silB", "sp", "N")

        def is_vowel(self):
            """Return True when the label is one of a, i, u, e, o."""
            return self.phone in ("a", "i", "u", "e", "o")

        def __str__(self):
            return "{}-{}: {}".format(self.begin_frame, self.end_frame, self.phone)

        @property
        def begin_time(self):
            """Start of the segment, derived from `begin_frame`.

            NOTE(review): presumably seconds, with a 0.01 step per frame and a
            0.0125 offset for every frame except frame 0 -- confirm against
            the front-end settings of the aligner.
            """
            t = 0
            if self.begin_frame != 0:
                t += 0.0125
            return round(t + self.begin_frame * 0.01, 4)

        @property
        def end_time(self):
            """End of the segment, derived from `end_frame` (inclusive).

            Uses `end_frame + 1` so that it equals the `begin_time` of a
            segment starting on the next frame.
            """
            return round(0.0125 + (self.end_frame + 1) * 0.01, 4)

    def __init__(self):
        self.phones = []

    def __str__(self):
        """Render the phones as space-separated consonant(s)+vowel groups.

        Silences are skipped. Non-vowel labels are accumulated and prefixed to
        the next vowel; non-vowels left over after the last vowel are dropped.
        """
        groups = []
        pending = ""  # labels of non-vowels seen since the last vowel
        for v in self:
            if v.is_silence():
                continue
            if v.is_vowel():
                groups.append(pending + v.phone)
                pending = ""
            else:
                pending += v.phone
        return "{}phoneme: {}".format(len(groups), " ".join(groups))

    def __iter__(self):
        return iter(self.phones)

    def add(self, begin_frame, end_frame, phone):
        """Append a new `Phone` built from the given frame range and label."""
        self.phones.append(self.Phone(begin_frame, end_frame, phone))

    def dump(self):
        """Print every non-silent phone, one per line."""
        for v in self:
            if not v.is_silence():
                print(str(v))

    def simplify(self):
        """Return a new `Sentence` that contains only vowel segments.

        Every run of non-vowel phones (silences included) is folded into the
        vowel that follows it: the resulting segment starts at the first
        frame of the run and ends at the vowel's last frame. Non-vowels after
        the final vowel are dropped. `self` is not modified.
        """
        vowel_sentence = Sentence()
        lead = None  # first non-vowel phone of the run preceding a vowel
        for v in self.phones:
            if v.is_vowel():
                begin_frame = v.begin_frame if lead is None else lead.begin_frame
                vowel_sentence.add(begin_frame, v.end_frame, v.phone)
                lead = None
            elif lead is None:
                lead = v
        return vowel_sentence
def wav2sentence(wav_file, julius_path="julius", dictation_kit_path="../dictation-kit/"):
    """Run julius with phone alignment on `wav_file` and collect the phones.

    Args:
        wav_file: path of the audio file; it is passed to julius on stdin.
        julius_path: julius executable to launch.
        dictation_kit_path: directory holding `main.jconf` and
            `model/phone_m/jnas-mono-16mix-gid.binhmm`.

    Returns:
        A `Sentence` with one entry per phone line found between the
        "begin forced alignment" and "end forced alignment" markers.

    Raises:
        RuntimeError: if julius exits with a non-zero status.
    """
    command = [
        julius_path,
        "-C", os.path.join(dictation_kit_path, "main.jconf"),
        "-h", os.path.join(dictation_kit_path, "model", "phone_m", "jnas-mono-16mix-gid.binhmm"),
        "-palign", "-fallback1pass", "-input", "rawfile"]
    # run julius
    # communicate() drains stdout and stderr together, so the child cannot
    # block on a full stderr pipe, and the context manager closes the pipes.
    with subprocess.Popen(command, stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                          stderr=subprocess.PIPE) as p:
        stdout_data, _ = p.communicate((wav_file + "\n").encode("UTF-8"))
    # parse
    # e.g: b'[ 143 149] -24.958252 d\n'
    # note that : is removed, e.g: "o:"->"o"
    pt = re.compile(r"\[\s*(\d+)\s+(\d+)\s*\]\s*([\d\+\-\.]+)\s*([a-zA-Z]+)")
    sentence = Sentence()
    in_alignment = False
    for line in stdout_data.splitlines():
        if b"=== begin forced alignment ===" in line:
            in_alignment = True
        if b"=== end forced alignment ===" in line:
            in_alignment = False
        if not in_alignment:
            continue
        # Undecodable bytes are replaced rather than raising; such a line
        # simply fails to match unless its leading part is a valid phone row.
        matched = pt.match(line.decode("utf-8", errors="replace"))
        if matched:
            begin_frame, end_frame, _score, phone = matched.groups()
            sentence.add(int(begin_frame), int(end_frame), phone)
    if p.returncode != 0:
        raise RuntimeError("julius: an error occurred. please check julius_path and dictation_kit_path.")
    return sentence
if __name__ == "__main__":
    # Manual smoke test: align one recording, then print the grouped phones,
    # the vowel-only rendering, and the timing of every vowel segment.
    INPUT_FILE = "tts.wav"
    JULIUS_PATH = "julius"
    DICTATION_KIT_PATH = "../dictation-kit"

    sentence = wav2sentence(
        INPUT_FILE,
        julius_path=JULIUS_PATH,
        dictation_kit_path=DICTATION_KIT_PATH,
    )
    vowel_sentence = sentence.simplify()

    for rendered in (sentence, vowel_sentence):
        print(str(rendered))

    for segment in vowel_sentence:
        timing_line = "{}-{} {}".format(segment.begin_time, segment.end_time, segment.phone)
        print(timing_line)
""" | |
result: | |
43phoneme: o ha yo mi ke da yo te i tsu mo to ko e ga chi ga u kyo wa ma i ku ro so fu to ti di e su sa ga ka wa ri ni sha be qte i ma su | |
43phoneme: o a o i e a o e i u o o o e a i a u o a a i u o o u o i i e u a a a a i i a e e i a u | |
0.0-0.1425 o | |
0.1425-0.3125 a | |
0.3125-0.6725 o | |
0.6725-0.9425 i | |
0.9425-1.0625 e | |
1.0625-1.2125 a | |
1.2125-1.3825 o | |
1.3825-1.6825 e | |
1.6825-1.9225 i | |
1.9225-2.1025 u | |
2.1025-2.2425 o | |
2.2425-2.4225 o | |
2.4225-2.5825 o | |
2.5825-2.7025 e | |
2.7025-2.8425 a | |
2.8425-3.0225 i | |
3.0225-3.1925 a | |
3.1925-3.3525 u | |
3.3525-3.7725 o | |
3.7725-3.9825 a | |
3.9825-4.3125 a | |
4.3125-4.4425 i | |
4.4425-4.5425 u | |
4.5425-4.6825 o | |
4.6825-4.8825 o | |
4.8825-5.0025 u | |
5.0025-5.1325 o | |
5.1325-5.4425 i | |
5.4425-5.6825 i | |
5.6825-5.8125 e | |
5.8125-5.9525 u | |
5.9525-6.1425 a | |
6.1425-6.4225 a | |
6.4225-6.5925 a | |
6.5925-6.7025 a | |
6.7025-6.8425 i | |
6.8425-6.9725 i | |
6.9725-7.2125 a | |
7.2125-7.4225 e | |
7.4225-7.6125 e | |
7.6125-7.6725 i | |
7.6725-7.8325 a | |
7.8325-8.1125 u | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment