Last active
January 25, 2017 13:21
-
-
Save yamanahlawat/49e9496f6abd55c853fdb845e46a0dc0 to your computer and use it in GitHub Desktop.
Remove repeated characters from a word using regular expressions and NLTK's WordNet corpus
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from nltk.corpus import wordnet | |
class RepeatReplacer(object):
    """Collapse repeated characters in a word until a known word is reached.

    Each pass of ``replace`` removes one character from a doubled pair
    (``"oo"`` -> ``"o"``) in every run of word characters, and stops as soon
    as the current candidate is recognised as a real word or no doubled
    character is left.  For example ``"looooove"`` is reduced step by step to
    ``"love"``, while ``"book"`` is returned unchanged because it is already
    a known word.

    Args:
        is_known_word: Optional callable taking a string and returning a
            truthy value when that string should be accepted as-is.  When
            omitted, a word is considered known if ``wordnet.synsets(word)``
            returns a non-empty result.
    """

    def __init__(self, is_known_word=None):
        # A greedy leading group means the right-most doubled character in a
        # run of word characters is the one matched by ``(\w)\2``.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Re-emit the match with only one copy of the doubled character.
        self.repl = r'\1\2\3'
        self.is_known_word = is_known_word

    def _is_known(self, word):
        """Return True when *word* should be accepted without further edits."""
        if self.is_known_word is not None:
            return bool(self.is_known_word(word))
        # Default vocabulary check: the module-level WordNet corpus reader.
        return bool(wordnet.synsets(word))

    def replace(self, word):
        """Return *word* with superfluous repeated characters removed.

        Args:
            word: The string to normalise.

        Returns:
            The first candidate that is a known word, or, if none is, the
            string left once no doubled character remains.
        """
        # Iterative rather than recursive: one character is dropped per pass,
        # so recursion depth would grow with the length of the repeated run
        # and could exceed the interpreter's recursion limit.
        while True:
            if self._is_known(word):
                return word
            reduced = self.repeat_regexp.sub(self.repl, word)
            if reduced == word:
                # Nothing left to collapse.
                return reduced
            word = reduced
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment