Skip to content

Instantly share code, notes, and snippets.

@tungpun
Last active September 3, 2015 15:05
Show Gist options
  • Save tungpun/29660f3e9450be3d1b4f to your computer and use it in GitHub Desktop.
Save tungpun/29660f3e9450be3d1b4f to your computer and use it in GitHub Desktop.
NLP - first assignment
import re
def standalize(paragraph):
    """Normalize a paragraph so its tokens can be separated on single spaces.

    Surrounding whitespace is stripped, every character in the set
    ``,/;!?.(){}:=@#$%^&*[]<>-+`` is padded with one space on each side,
    and runs of spaces are then collapsed to a single space.

    Stripping happens before padding, so the result keeps one leading or
    trailing space when the text starts or ends with one of those characters.

    Args:
        paragraph: The raw text to normalize.

    Returns:
        The normalized string.
    """
    paragraph = paragraph.strip()
    paragraph = re.sub(r'([,/;!?.(){}:=@#$%^&*\[\]<>\-\+])', r' \1 ', paragraph)
    # Match runs of two or more spaces: a find/replace loop keyed on a single
    # space would replace a space with itself and never terminate.
    return re.sub(r' {2,}', ' ', paragraph)
def remove_space(slist):
    """Strip every string in *slist* and drop the ones that end up empty.

    Args:
        slist: An iterable of strings.

    Returns:
        A new list with the stripped, non-empty strings in their original
        order.
    """
    # Strip each item once in the inner generator, then filter out blanks.
    stripped = (s.strip() for s in slist)
    return [s for s in stripped if s != '']
if __name__ == '__main__':
    # Demo run: normalize a sample paragraph, then split it into sentences
    # (on '.', '!' and '?') and into words (on those same marks or a space).
    paragraph = "mot doan van testing mau! Chi, (de) thu/nghiem.Have you ever met AP ? yes:) =)) !@#$%^&*(<>-+"
    paragraph = standalize(paragraph)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(paragraph)
    # Raw strings keep the regex escapes from being read as string escapes.
    sentences = remove_space(re.split(r'[\.\!\?]', paragraph))
    words = remove_space(re.split(r'[\.\!\?\ ]', paragraph))
    print(sentences)
    print(words)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment