Created
June 12, 2024 08:43
-
-
Save huangruizhe/dd75cf44bde12751500b8c43c73f3f22 to your computer and use it in GitHub Desktop.
A simple rule-based English text perturbator that preserves pronunciation similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TextPerturbator:
    """Rule-based English word perturbator that keeps spellings sound-alike.

    Each rule is a pair of letter sequences that are treated as
    interchangeable (e.g. ``f`` <-> ``ph``). Rules are applied in both
    directions. A ``_`` inside a rule marks a word boundary, so ``_n _kn``
    only applies at the start of a word and ``que_ k_`` only at the end.

    The rule table is upper-cased (LibriSpeech transcripts are upper case),
    so only upper-case input words can match a rule.
    """

    def __init__(self, rng=None):
        """Build the bidirectional rule lookup table.

        Args:
            rng: Optional source of randomness exposing ``choice`` and
                ``random`` (for example a seeded ``random.Random``). When
                omitted, the global ``random`` module is used.
        """
        rules = """
            a aa
            aa ar
            a e
            a o
            a ei
            a ay
            a ey
            ay ey
            ai ay
            ai ei
            an en
            an in
            an on
            ar er
            ar er
            ar or
            ar our
            ar ur
            au oo
            au u
            au o
            au aw
            as es
            at ad
            al el
            b p
            b bh
            c ck
            c k
            c s
            ce se
            ch ck
            ch j
            ch sh
            ch tch
            con com
            d dh
            t dh
            d t
            d tt
            d dd
            di dee
            dis this
            ear eer
            em en
            en an
            e io
            e eo
            e eu
            ew eu
            aw au
            ee i
            ee ea
            ee y
            er ar
            er ir
            er ir
            er ur
            er or
            es is
            ew oo
            ew io
            ew iu
            el ol
            f ff
            f ph
            f v
            f p
            g gg
            h hh
            i e
            i ea
            i ie
            i ey
            ie ee
            ie y
            io eo
            igh ie
            igh y
            ii i
            in an
            in een
            in en
            in en
            in ing
            ir ur
            j g
            j dj
            k ck
            la le
            ll l
            l r
            mm m
            _n _kn
            nn n
            o oe
            o oe
            o ow
            o ol
            o ar
            al ol
            el ol
            al el
            oi oy
            oo ew
            oo ou
            oo u
            oo ue
            or ar
            or er
            or ow
            or ur
            ou ow
            ough uff
            ow oa
            ow ou
            p b
            p pp
            ph f
            q k
            q ck
            que_ k_
            que_ ck_
            r rh
            r hr
            re ri
            s th
            s ts
            s tz
            s z
            s ss
            s dz
            sc xg
            sa se
            sh ch
            sh j
            stle_ so_
            t d
            t tt
            th d
            th f
            th s
            th v
            th z
            tch ch
            tch sh
            tion shion
            tion sion
            tr dr
            ue oo
            ur ir
            u iu
            v w
            w wh
            x ks
            x s
            z ts
            z tz
        """
        if rng is None:
            # Imported here so the class does not rely on a module-level import.
            import random

            rng = random
        self._rng = rng

        rules_dict = {}
        for line in rules.upper().splitlines():  # upper case for librispeech
            fields = line.split()
            if not fields:
                continue  # blank first/last line of the literal
            left, right = fields[0], fields[1]
            # Every rule is usable in both directions.
            rules_dict.setdefault(left, set()).add(right)
            rules_dict.setdefault(right, set()).add(left)
        # Sorted so that a seeded rng gives the same result in every process;
        # raw set order depends on string hash randomization.
        self.rules_dict = {k: sorted(v) for k, v in rules_dict.items()}

    def perturb_one_word(self, s):
        """Return ``s`` with one randomly chosen rule applied.

        A rule whose pattern occurs in the word is picked at random, then one
        of its replacements is picked at random and substituted for the first
        occurrence of the pattern. If no rule matches, ``s`` is returned
        unchanged.
        """
        # Wrap the word in boundary markers so "_"-anchored rules can match.
        marked = f"_{s}_"
        appearances = [sub for sub in self.rules_dict if sub in marked]
        if not appearances:
            return s
        pattern = self._rng.choice(appearances)
        target = self._rng.choice(self.rules_dict[pattern])
        # Boundary-anchored rules keep their "_" on both sides of the rule,
        # so the markers are still the first and last characters here.
        return marked.replace(pattern, target, 1)[1:-1]

    def perturb_texts(self, texts, common_words=(), prob=0.6) -> "tuple[list[str], list[list[str]]]":
        """Perturb randomly selected rare words across a batch of texts.

        Args:
            texts: Iterable of whitespace-separated strings.
            common_words: Collection of words that are never perturbed.
            prob: Chance that each occurrence of a rare word is selected.

        Returns:
            A ``(new_texts, new_rare_words)`` tuple. ``new_texts`` holds the
            input texts with every occurrence of a selected word replaced by
            its perturbed form, in any text of the batch. ``new_rare_words``
            holds, per text, the perturbed forms in order of appearance.
        """
        common = frozenset(common_words)
        # Tokenize once so that ``texts`` may be a one-shot iterable.
        tokenized = [text.split() for text in texts]

        selected = [
            word
            for words in tokenized
            for word in words
            if word not in common and self._rng.random() < prob
        ]
        # Mapping of old => new; a word selected more than once keeps the
        # perturbation produced for its last selected occurrence.
        proxies = {word: self.perturb_one_word(word) for word in selected}

        new_texts = []
        new_rare_words = []
        for words in tokenized:
            new_texts.append(" ".join(proxies.get(w, w) for w in words))
            new_rare_words.append([proxies[w] for w in words if w in proxies])
        return new_texts, new_rare_words
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment