Skip to content

Instantly share code, notes, and snippets.

@huangruizhe
Created June 12, 2024 08:43
Show Gist options
  • Save huangruizhe/dd75cf44bde12751500b8c43c73f3f22 to your computer and use it in GitHub Desktop.
Save huangruizhe/dd75cf44bde12751500b8c43c73f3f22 to your computer and use it in GitHub Desktop.
A simple rule-based English text perturbator that preserves pronunciation similarity
import random
from collections import defaultdict
from typing import Dict, Iterable, List, Optional, Tuple


class TextPerturbator:
    """Rule-based English text perturbator that preserves pronunciation similarity.

    Each rule is a pair of letter sequences that may be swapped for one
    another. A word is perturbed by picking one rule whose pattern occurs in
    the word and replacing the first occurrence of that pattern.

    An underscore in a rule marks a word boundary: ``_n`` only matches a
    word-initial ``N`` and ``que_`` only matches a word-final ``QUE``.

    The rule table is upper-cased (for LibriSpeech), so only upper-case text
    is ever modified; anything without a matching pattern is returned as is.
    """

    # One rule per line: two whitespace-separated letter sequences that are
    # interchangeable in both directions. Duplicate lines are harmless.
    _RULES = """
        a aa
        aa ar
        a e
        a o
        a ei
        a ay
        a ey
        ay ey
        ai ay
        ai ei
        an en
        an in
        an on
        ar er
        ar er
        ar or
        ar our
        ar ur
        au oo
        au u
        au o
        au aw
        as es
        at ad
        al el
        b p
        b bh
        c ck
        c k
        c s
        ce se
        ch ck
        ch j
        ch sh
        ch tch
        con com
        d dh
        t dh
        d t
        d tt
        d dd
        di dee
        dis this
        ear eer
        em en
        en an
        e io
        e eo
        e eu
        ew eu
        aw au
        ee i
        ee ea
        ee y
        er ar
        er ir
        er ir
        er ur
        er or
        es is
        ew oo
        ew io
        ew iu
        el ol
        f ff
        f ph
        f v
        f p
        g gg
        h hh
        i e
        i ea
        i ie
        i ey
        ie ee
        ie y
        io eo
        igh ie
        igh y
        ii i
        in an
        in een
        in en
        in en
        in ing
        ir ur
        j g
        j dj
        k ck
        la le
        ll l
        l r
        mm m
        _n _kn
        nn n
        o oe
        o oe
        o ow
        o ol
        o ar
        al ol
        el ol
        al el
        oi oy
        oo ew
        oo ou
        oo u
        oo ue
        or ar
        or er
        or ow
        or ur
        ou ow
        ough uff
        ow oa
        ow ou
        p b
        p pp
        ph f
        q k
        q ck
        que_ k_
        que_ ck_
        r rh
        r hr
        re ri
        s th
        s ts
        s tz
        s z
        s ss
        s dz
        sc xg
        sa se
        sh ch
        sh j
        stle_ so_
        t d
        t tt
        th d
        th f
        th s
        th v
        th z
        tch ch
        tch sh
        tion shion
        tion sion
        tr dr
        ue oo
        ur ir
        u iu
        v w
        w wh
        x ks
        x s
        z ts
        z tz
    """

    def __init__(self):
        """Build ``self.rules_dict``: pattern -> list of interchangeable patterns."""
        swaps = defaultdict(set)
        # Upper case for LibriSpeech transcripts.
        for line in self._RULES.upper().splitlines():
            fields = line.split()
            if not fields:
                continue  # leading/trailing blank lines of the literal
            left, right = fields
            # Every rule applies in both directions.
            swaps[left].add(right)
            swaps[right].add(left)
        # Sets of strings iterate in a hash-randomised order; sorting keeps
        # the candidate lists stable so that seeding ``random`` gives
        # reproducible perturbations across interpreter runs.
        self.rules_dict: Dict[str, List[str]] = {
            pattern: sorted(targets) for pattern, targets in swaps.items()
        }

    def perturb_one_word(self, s: str) -> str:
        """Return ``s`` with one randomly chosen rule applied.

        A pattern present in the word is picked uniformly at random, then one
        of its replacements is picked uniformly at random, and the first
        occurrence of the pattern is replaced.

        Args:
            s: A single word (no whitespace).

        Returns:
            The perturbed word, or ``s`` unchanged when no rule matches
            (e.g. lower-case words, digits, or the empty string).
        """
        # Sentinels let the boundary rules ("_N", "QUE_", ...) match only at
        # the start or end of the word.
        padded = f"_{s}_"
        candidates = [pattern for pattern in self.rules_dict if pattern in padded]
        if not candidates:
            # Return the original word, not the padded one, so the sentinels
            # never leak into the output.
            return s
        pattern = random.choice(candidates)
        target = random.choice(self.rules_dict[pattern])
        # Boundary rules carry their own underscore on both sides of the
        # swap, so the sentinels are still in place and can be sliced off.
        return padded.replace(pattern, target, 1)[1:-1]

    def perturb_texts(
        self,
        texts: Iterable[str],
        common_words: Optional[Iterable[str]] = None,
        prob: float = 0.6,
    ) -> Tuple[List[str], List[List[str]]]:
        """Perturb the rare words of a batch of texts.

        Every whitespace-separated word that is not in ``common_words`` is
        selected with probability ``prob`` per occurrence. Selected words are
        perturbed and the resulting old -> new mapping is applied to every
        occurrence of that word in every text, so a repeated word is always
        replaced consistently (the last perturbation drawn for it wins).

        Args:
            texts: Texts to perturb; each is split on whitespace.
            common_words: Words that must never be perturbed. ``None`` means
                no word is protected.
            prob: Selection probability for each rare-word occurrence.

        Returns:
            A pair ``(new_texts, new_rare_words)``: the texts re-joined with
            single spaces after replacement, and for each text the list of
            replacement words in order of appearance.
        """
        if common_words is None:
            protected = frozenset()
        elif isinstance(common_words, (set, frozenset)):
            protected = common_words  # already O(1) lookups, avoid a copy
        else:
            protected = frozenset(common_words)

        # Tokenise once so ``texts`` may be any iterable, including a generator.
        tokenized = [text.split() for text in texts]

        # Draw all selections first, then perturb, so the order in which
        # random numbers are consumed is: selections, then perturbations.
        selected = [
            word
            for words in tokenized
            for word in words
            if word not in protected and random.random() < prob
        ]
        # Mapping of old => new; a word selected several times keeps the
        # perturbation drawn last.
        replacements = {word: self.perturb_one_word(word) for word in selected}

        new_texts = [
            " ".join(replacements.get(word, word) for word in words)
            for words in tokenized
        ]
        new_rare_words = [
            [replacements[word] for word in words if word in replacements]
            for words in tokenized
        ]
        return new_texts, new_rare_words
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment