balachandrapai · March 9, 2018 06:21
diff --git a/NLPBasics.py b/NLPBasics.py
 ##POS tagging is labeling words in a sentence as nouns, adjectives, verbs...etc
 import nltk
 from nltk.corpus import state_union
 from nltk.tokenize import PunktSentenceTokenizer

 ##PunktSentenceTokenizer is a new sentence tokenizer.
 ##This tokenizer is capable of unsupervised machine learning,
 ##so you can actually train it on any body of text that you use.

 ##Creating training and testing data:
 ##train_text is used to train the sentence tokenizer below,
 ##sample_text is the text that actually gets tokenized and chunked.
 train_text = state_union.raw("2005-GWBush.txt")
 sample_text = state_union.raw("2006-GWBush.txt")

 ##Train the Punkt tokenizer on the training text
 custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

 ##Tokenize the sample text with the trained tokenizer;
 ##process_content() iterates over the resulting items.
 tokenized = custom_sent_tokenizer.tokenize(sample_text)

 def process_content():
     """Print every 'Chunk' subtree found in the tokenized sample text.

     Each item of the module-level ``tokenized`` sequence is split into
     words and POS-tagged, then parsed with a regular-expression chunk
     grammar. Every subtree labelled 'Chunk' is printed.

     Any exception raised along the way is caught and its message is
     printed, so no error propagates to the caller.
     """
     try:
         ##Chunking is done to extract meaningful groups of words.
         ##This grammar chunks on: any number of adverbs (RB.?), then any
         ##number of verbs (VB.?), then one or more proper nouns (NNP),
         ##optionally followed by a singular noun (NN).
         chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
         ##The grammar never changes between sentences, so the parser is
         ##built once here instead of once per loop iteration.
         chunkParser = nltk.RegexpParser(chunkGram)

         for sentence in tokenized:
             words = nltk.word_tokenize(sentence)
             tagged = nltk.pos_tag(words)
             ##"chunked" is an NLTK tree; each "chunk" and "non chunk" is a
             ##"subtree" of it. To inspect the whole tree, use
             ##print(chunked) or chunked.draw().
             chunked = chunkParser.parse(tagged)
             ##Print only the subtrees with the label Chunk assigned above
             for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                 print(subtree)

     except Exception as e:
         print(str(e))

 ##Run the chunking demo over the tokenized sample text
 process_content()


 ##Chinking is a lot like chunking: it is basically a way for you to remove
 ##part of a chunk.
 ##The part that you remove from your chunk is your chink.
 ##chunkGram = r"""Chunk: {<.*>+}
 ##                       }<VB.?|IN|DT|TO>+{"""
 ##This means we're removing from the chunk one or more
 ##verbs, prepositions, determiners, or the word 'to'.
	##POS tagging is labeling words in a sentence as nouns, adjectives, verbs...etc
	import nltk
	from nltk.corpus import state_union
	from nltk.tokenize import PunktSentenceTokenizer

	##PunktSentenceTokenizer is a new sentence tokenizer.
	##This tokenizer is capable of unsupervised machine learning,
	##so you can actually train it on any body of text that you use.

	##Creating training and testing data:
	##train_text is used to train the sentence tokenizer below,
	##sample_text is the text that actually gets tokenized and chunked.
	train_text = state_union.raw("2005-GWBush.txt")
	sample_text = state_union.raw("2006-GWBush.txt")

	##Train the Punkt tokenizer on the training text
	custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

	##Tokenize the sample text with the trained tokenizer;
	##process_content() iterates over the resulting items.
	tokenized = custom_sent_tokenizer.tokenize(sample_text)

	def process_content():
	    """Print every 'Chunk' subtree found in the tokenized sample text.

	    Each item of the module-level ``tokenized`` sequence is split into
	    words and POS-tagged, then parsed with a regular-expression chunk
	    grammar. Every subtree labelled 'Chunk' is printed.

	    Any exception raised along the way is caught and its message is
	    printed, so no error propagates to the caller.
	    """
	    try:
	        ##Chunking is done to extract meaningful groups of words.
	        ##This grammar chunks on: any number of adverbs (RB.?), then any
	        ##number of verbs (VB.?), then one or more proper nouns (NNP),
	        ##optionally followed by a singular noun (NN).
	        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
	        ##The grammar never changes between sentences, so the parser is
	        ##built once here instead of once per loop iteration.
	        chunkParser = nltk.RegexpParser(chunkGram)

	        for sentence in tokenized:
	            words = nltk.word_tokenize(sentence)
	            tagged = nltk.pos_tag(words)
	            ##"chunked" is an NLTK tree; each "chunk" and "non chunk" is a
	            ##"subtree" of it. To inspect the whole tree, use
	            ##print(chunked) or chunked.draw().
	            chunked = chunkParser.parse(tagged)
	            ##Print only the subtrees with the label Chunk assigned above
	            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
	                print(subtree)

	    except Exception as e:
	        print(str(e))

	##Run the chunking demo over the tokenized sample text
	process_content()


	##Chinking is a lot like chunking: it is basically a way for you to remove
	##part of a chunk.
	##The part that you remove from your chunk is your chink.
	##chunkGram = r"""Chunk: {<.*>+}
	##                       }<VB.?|IN|DT|TO>+{"""
	##This means we're removing from the chunk one or more
	##verbs, prepositions, determiners, or the word 'to'.