Created
January 31, 2018 15:33
-
-
Save inishchith/ad4bc0da200110de638f5408c64bb14c to your computer and use it in GitHub Desktop.
This list is composed from the 100 most frequently occurring words in the classical_hindi corpus <https://github.com/cltk/hindi_text_ltrc> in CLTK.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Stopword-list generator for the classical_hindi corpora at CLTK.

Reads every corpus file under ./hindi_corpora, strips punctuation,
tokenizes with CLTK's Indian-language tokenizer, counts token
frequencies, and writes the 200 most frequent tokens (one per line)
to ./stop_words.txt as stopword candidates.
"""
import os
import re
import string

import nltk
from cltk.tokenize.indian_tokenizer import indian_punctuation_tokenize_regex as i_word
from nltk.probability import FreqDist

# Every .txt file from the classical_hindi corpus is first moved into this dir.
CORPUS_DIR = "./hindi_corpora"
# NOTE: the original wrote to './stops_words.txt' but cleaned up
# './stop_words.txt' — a typo; 'stop_words.txt' is used consistently here.
STOPWORDS_FILE = "./stop_words.txt"

# Characters stripped before tokenization ('||' covers the ASCII rendering of
# the Devanagari verse delimiter).
PUNCTUATION = '''''!()-[]{};:'"\,<>./?@#$%^&*_~''' + '||'


def _collect_words(corpus_dir):
    """Return the list of tokens from every file in *corpus_dir*.

    Each file is read as UTF-8 (undecodable bytes ignored), cleaned of
    PUNCTUATION characters, and tokenized with CLTK's Indian tokenizer.
    Tokens from different files are kept separate (the original fused the
    last word of one file with the first word of the next because it
    joined with '\\n' but never wrote a trailing newline between files).
    """
    # One C-level pass per file instead of a quadratic char-by-char rebuild.
    strip_table = str.maketrans('', '', PUNCTUATION)
    words = []
    for file_name in os.listdir(corpus_dir):
        full_path = os.path.join(corpus_dir, file_name)
        print(full_path)
        with open(full_path, encoding="utf-8", errors="ignore") as fh:
            cleaned = fh.read().translate(strip_table)
        words.extend(i_word(cleaned))
    return words


def main():
    # Start from a clean slate so repeated runs do not append duplicates.
    if os.path.isfile(STOPWORDS_FILE):
        os.remove(STOPWORDS_FILE)

    words = _collect_words(CORPUS_DIR)
    print("TOTAL WORDS : ", len(words))

    # The 200 most frequent tokens become the stopword candidates.
    # most_common() already yields unique tokens, so the original's
    # list(set(...)) was a no-op that only scrambled the frequency order;
    # the output is now ordered most-frequent first.
    fdist = FreqDist(words)
    common_words = [word for word, _count in fdist.most_common(200)]

    with open(STOPWORDS_FILE, 'w', encoding="utf-8") as out:
        out.write('\n'.join(common_words))
    print(common_words)


# NOTE: the datasets in hindi_corpora were not fully cleaned, so some
# redundant/misclassified words had to be removed manually afterwards.
# This could be avoided by improving the punctuation handling above.

if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment