huangruizhe · June 12, 2024 08:43
diff --git a/english.py b/english.py
 class TextPerturbator:
     """Create misspelled "proxy" variants of words by substring substitution.

     The substitution table is built from ``_RULES``; every rule is usable in
     both directions.  The rule text is upper-cased before use, so only
     upper-case words can match a rule.
     """

     # One rule per line: two whitespace-separated substrings that may replace
     # each other.  ``_`` stands for a word boundary (``perturb_one_word`` wraps
     # the word in underscores before matching).  Repeated lines are harmless
     # because the alternatives are collected into sets.
     _RULES = """
     a aa
     aa ar
     a e
     a o
     a ei
     a ay
     a ey
     ay ey
     ai ay
     ai ei
     an en
     an in
     an on
     ar er
     ar er
     ar or
     ar our
     ar ur
     au oo
     au u
     au o
     au aw
     as es
     at ad
     al el
     b p
     b bh
     c ck
     c k
     c s
     ce se
     ch ck
     ch j
     ch sh
     ch tch
     con com
     d dh
     t dh
     d t
     d tt
     d dd
     di dee
     dis this
     ear eer
     em en
     en an
     e io
     e eo
     e eu
     ew eu
     aw au
     ee i
     ee ea
     ee y
     er ar
     er ir
     er ir
     er ur
     er or
     es is
     ew oo
     ew io
     ew iu
     el ol
     f ff
     f ph
     f v
     f p
     g gg
     h hh
     i e
     i ea
     i ie
     i ey
     ie ee
     ie y
     io eo
     igh ie
     igh y
     ii i
     in an
     in een
     in en
     in en
     in ing
     ir ur
     j g
     j dj
     k ck
     la le
     ll l
     l r
     mm m
     _n _kn
     nn n
     o oe
     o oe
     o ow
     o ol
     o ar
     al ol
     el ol
     al el
     oi oy
     oo ew
     oo ou
     oo u
     oo ue
     or ar
     or er
     or ow
     or ur
     ou ow
     ough uff
     ow oa
     ow ou
     p b
     p pp
     ph f
     q k
     q ck
     que_ k_
     que_ ck_
     r rh
     r hr
     re ri
     s th
     s ts
     s tz
     s z
     s ss
     s dz
     sc xg
     sa se
     sh ch
     sh j
     stle_ so_
     t d
     t tt
     th d
     th f
     th s
     th v
     th z
     tch ch
     tch sh
     tion shion
     tion sion
     tr dr
     ue oo
     ur ir
     u iu
     v w
     w wh
     x ks
     x s
     z ts
     z tz
     """

     def __init__(self):
         """Build ``self.rules_dict``: substring -> sorted list of alternatives."""
         rules_dict = defaultdict(set)
         # upper case for librispeech
         for line in self._RULES.upper().splitlines():
             if not line.strip():
                 continue
             # A malformed rule (not exactly two fields) raises ValueError here.
             left, right = line.split()
             # Each rule is registered in both directions.
             rules_dict[left].add(right)
             rules_dict[right].add(left)
         # Sort the alternatives: the iteration order of a set of strings varies
         # with hash randomization, which would make ``random.choice`` pick
         # different targets from run to run even under a fixed seed.
         self.rules_dict = {k: sorted(v) for k, v in rules_dict.items()}

     def perturb_one_word(self, s):
         """Return ``s`` with one randomly chosen substitution applied.

         The word is wrapped in ``_`` markers so that boundary rules such as
         ``_N`` / ``QUE_`` can match; the markers are removed again before
         returning.  A pattern is drawn uniformly from the rule keys that occur
         in the word, then a target uniformly from that pattern's alternatives,
         and only the first occurrence of the pattern is replaced.

         Args:
             s: A single word.

         Returns:
             The perturbed word, or ``s`` unchanged when no rule key occurs in it.
         """
         marked = f"_{s}_"
         candidates = [sub for sub in self.rules_dict if sub in marked]
         if not candidates:
             # Hand back the word itself, not the underscore-wrapped working copy.
             return s
         pattern = random.choice(candidates)
         target = random.choice(self.rules_dict[pattern])
         return marked.replace(pattern, target, 1)[1:-1]

     def perturb_texts(self, texts, common_words=(), prob=0.6) -> "tuple[list[str], list[list[str]]]":
         """Perturb a random subset of the rare words in ``texts``.

         A word is "rare" when it is not in ``common_words``.  Every rare-word
         occurrence is selected independently with probability ``prob``; a word
         selected at least once is replaced by one proxy everywhere it occurs,
         in all texts (if it is selected several times, the proxy generated
         last is used).

         Args:
             texts: Iterable of whitespace-separated strings.
             common_words: Container of words that must never be perturbed.
             prob: Selection probability for each rare-word occurrence.

         Returns:
             ``(new_texts, new_rare_words)``: the texts re-joined with single
             spaces after substitution, and, per text, the list of proxies that
             were substituted into it, in word order.
         """
         texts = list(texts)  # iterated twice below
         if isinstance(common_words, (list, tuple)):
             # Constant-time membership tests instead of a linear scan per word.
             common_words = frozenset(common_words)

         selected = [
             word
             for text in texts
             for word in text.split()
             if word not in common_words and random.random() < prob
         ]
         # Mapping of old => new.
         proxies = {word: self.perturb_one_word(word) for word in selected}

         new_texts = []
         new_rare_words = []
         for text in texts:
             words = text.split()
             new_texts.append(" ".join(proxies.get(w, w) for w in words))
             new_rare_words.append([proxies[w] for w in words if w in proxies])

         return new_texts, new_rare_words
	class TextPerturbator:
	    """Create misspelled "proxy" variants of words by substring substitution.

	    The substitution table is built from ``_RULES``; every rule is usable in
	    both directions.  The rule text is upper-cased before use, so only
	    upper-case words can match a rule.
	    """

	    # One rule per line: two whitespace-separated substrings that may replace
	    # each other.  ``_`` stands for a word boundary (``perturb_one_word`` wraps
	    # the word in underscores before matching).  Repeated lines are harmless
	    # because the alternatives are collected into sets.
	    _RULES = """
	    a aa
	    aa ar
	    a e
	    a o
	    a ei
	    a ay
	    a ey
	    ay ey
	    ai ay
	    ai ei
	    an en
	    an in
	    an on
	    ar er
	    ar er
	    ar or
	    ar our
	    ar ur
	    au oo
	    au u
	    au o
	    au aw
	    as es
	    at ad
	    al el
	    b p
	    b bh
	    c ck
	    c k
	    c s
	    ce se
	    ch ck
	    ch j
	    ch sh
	    ch tch
	    con com
	    d dh
	    t dh
	    d t
	    d tt
	    d dd
	    di dee
	    dis this
	    ear eer
	    em en
	    en an
	    e io
	    e eo
	    e eu
	    ew eu
	    aw au
	    ee i
	    ee ea
	    ee y
	    er ar
	    er ir
	    er ir
	    er ur
	    er or
	    es is
	    ew oo
	    ew io
	    ew iu
	    el ol
	    f ff
	    f ph
	    f v
	    f p
	    g gg
	    h hh
	    i e
	    i ea
	    i ie
	    i ey
	    ie ee
	    ie y
	    io eo
	    igh ie
	    igh y
	    ii i
	    in an
	    in een
	    in en
	    in en
	    in ing
	    ir ur
	    j g
	    j dj
	    k ck
	    la le
	    ll l
	    l r
	    mm m
	    _n _kn
	    nn n
	    o oe
	    o oe
	    o ow
	    o ol
	    o ar
	    al ol
	    el ol
	    al el
	    oi oy
	    oo ew
	    oo ou
	    oo u
	    oo ue
	    or ar
	    or er
	    or ow
	    or ur
	    ou ow
	    ough uff
	    ow oa
	    ow ou
	    p b
	    p pp
	    ph f
	    q k
	    q ck
	    que_ k_
	    que_ ck_
	    r rh
	    r hr
	    re ri
	    s th
	    s ts
	    s tz
	    s z
	    s ss
	    s dz
	    sc xg
	    sa se
	    sh ch
	    sh j
	    stle_ so_
	    t d
	    t tt
	    th d
	    th f
	    th s
	    th v
	    th z
	    tch ch
	    tch sh
	    tion shion
	    tion sion
	    tr dr
	    ue oo
	    ur ir
	    u iu
	    v w
	    w wh
	    x ks
	    x s
	    z ts
	    z tz
	    """

	    def __init__(self):
	        """Build ``self.rules_dict``: substring -> sorted list of alternatives."""
	        rules_dict = defaultdict(set)
	        # upper case for librispeech
	        for line in self._RULES.upper().splitlines():
	            if not line.strip():
	                continue
	            # A malformed rule (not exactly two fields) raises ValueError here.
	            left, right = line.split()
	            # Each rule is registered in both directions.
	            rules_dict[left].add(right)
	            rules_dict[right].add(left)
	        # Sort the alternatives: the iteration order of a set of strings varies
	        # with hash randomization, which would make ``random.choice`` pick
	        # different targets from run to run even under a fixed seed.
	        self.rules_dict = {k: sorted(v) for k, v in rules_dict.items()}

	    def perturb_one_word(self, s):
	        """Return ``s`` with one randomly chosen substitution applied.

	        The word is wrapped in ``_`` markers so that boundary rules such as
	        ``_N`` / ``QUE_`` can match; the markers are removed again before
	        returning.  A pattern is drawn uniformly from the rule keys that occur
	        in the word, then a target uniformly from that pattern's alternatives,
	        and only the first occurrence of the pattern is replaced.

	        Args:
	            s: A single word.

	        Returns:
	            The perturbed word, or ``s`` unchanged when no rule key occurs in it.
	        """
	        marked = f"_{s}_"
	        candidates = [sub for sub in self.rules_dict if sub in marked]
	        if not candidates:
	            # Hand back the word itself, not the underscore-wrapped working copy.
	            return s
	        pattern = random.choice(candidates)
	        target = random.choice(self.rules_dict[pattern])
	        return marked.replace(pattern, target, 1)[1:-1]

	    def perturb_texts(self, texts, common_words=(), prob=0.6) -> "tuple[list[str], list[list[str]]]":
	        """Perturb a random subset of the rare words in ``texts``.

	        A word is "rare" when it is not in ``common_words``.  Every rare-word
	        occurrence is selected independently with probability ``prob``; a word
	        selected at least once is replaced by one proxy everywhere it occurs,
	        in all texts (if it is selected several times, the proxy generated
	        last is used).

	        Args:
	            texts: Iterable of whitespace-separated strings.
	            common_words: Container of words that must never be perturbed.
	            prob: Selection probability for each rare-word occurrence.

	        Returns:
	            ``(new_texts, new_rare_words)``: the texts re-joined with single
	            spaces after substitution, and, per text, the list of proxies that
	            were substituted into it, in word order.
	        """
	        texts = list(texts)  # iterated twice below
	        if isinstance(common_words, (list, tuple)):
	            # Constant-time membership tests instead of a linear scan per word.
	            common_words = frozenset(common_words)

	        selected = [
	            word
	            for text in texts
	            for word in text.split()
	            if word not in common_words and random.random() < prob
	        ]
	        # Mapping of old => new.
	        proxies = {word: self.perturb_one_word(word) for word in selected}

	        new_texts = []
	        new_rare_words = []
	        for text in texts:
	            words = text.split()
	            new_texts.append(" ".join(proxies.get(w, w) for w in words))
	            new_rare_words.append([proxies[w] for w in words if w in proxies])

	        return new_texts, new_rare_words