Skip to content

Instantly share code, notes, and snippets.

@balachandrapai
Created March 9, 2018 06:21
Show Gist options
  • Save balachandrapai/7ecd80c5f088e5230dcda7d144858d1e to your computer and use it in GitHub Desktop.
Save balachandrapai/7ecd80c5f088e5230dcda7d144858d1e to your computer and use it in GitHub Desktop.
POS tags, Chunking and Chinking
## POS (part-of-speech) tagging labels each word in a sentence as a noun, adjective, verb, etc.
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
## PunktSentenceTokenizer is a trainable sentence tokenizer.
## It is capable of unsupervised machine learning,
## so it can be trained on any body of text you supply.
## Create the training text and the sample text to tokenize.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
## Train the Punkt tokenizer on the training text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
## Split the sample text into sentences; process_content() iterates over these.
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    """POS-tag every sentence in ``tokenized`` and print the chunks found.

    Each sentence from the module-level ``tokenized`` sequence is split into
    words and tagged with ``nltk.pos_tag``. A regular-expression chunk
    grammar is applied to the tagged words, and every subtree labelled
    ``Chunk`` is printed to stdout.

    Returns:
        None.

    Errors are handled on a best-effort basis: the first exception raised
    stops the loop and its message is printed instead of being propagated.
    """
    ## Chunking groups tagged words into meaningful phrases.
    ## This grammar matches, in order: zero or more adverb tags (<RB.?>),
    ## zero or more verb tags (<VB.?>), one or more proper nouns (<NNP>),
    ## and an optional singular noun (<NN>).
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    try:
        ## The grammar never changes, so build the parser once rather than
        ## once per sentence.
        chunkParser = nltk.RegexpParser(chunkGram)
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            ## "chunked" is an NLTK tree; each "chunk" and "non chunk" is a
            ## subtree of it. Call chunked.draw() to view it graphically.
            chunked = chunkParser.parse(tagged)
            ## Print only the subtrees carrying the "Chunk" label assigned
            ## in the grammar above.
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
    except Exception as e:
        ## Deliberate best-effort for a demo script: report and stop.
        print(str(e))
## Run the chunking demo: prints every subtree labelled "Chunk" for the tokenized sentences.
process_content()
## Chinking is a lot like chunking: it is a way to remove a sequence of
## tokens from inside a chunk.
## The sequence that you remove from your chunk is called the chink.
##chunkGram = r"""Chunk: {<.*>+}
## }<VB.?|IN|DT|TO>+{"""
## This means we first chunk everything, then remove (chink) from each chunk
## one or more verbs, prepositions, determiners, or the word 'to'.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment