Created
January 31, 2018 15:33
-
-
Save inishchith/ad4bc0da200110de638f5408c64bb14c to your computer and use it in GitHub Desktop.
This list is composed from the 100 most frequently occurring words in the classical_hindi corpus <https://github.com/cltk/hindi_text_ltrc> in CLTK.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Stopword-list generator for the classical_hindi corpora at CLTK.

Reads every corpus file under ./hindi_corpora, strips punctuation,
tokenizes with CLTK's Indian-language tokenizer, counts token
frequencies, and writes the 200 most frequent tokens (one per line)
to ./stop_words.txt as stopword candidates.
"""
import os
import re
import string

import nltk
from cltk.tokenize.indian_tokenizer import indian_punctuation_tokenize_regex as i_word
from nltk.probability import FreqDist

# Every .txt file from the classical_hindi corpus is first moved into this dir.
CORPUS_DIR = "./hindi_corpora"
# NOTE: the original wrote to './stops_words.txt' but cleaned up
# './stop_words.txt' — a typo; 'stop_words.txt' is used consistently here.
STOPWORDS_FILE = "./stop_words.txt"

# Characters stripped before tokenization ('||' covers the ASCII rendering of
# the Devanagari verse delimiter).
PUNCTUATION = '''''!()-[]{};:'"\,<>./?@#$%^&*_~''' + '||'


def _collect_words(corpus_dir):
    """Return the list of tokens from every file in *corpus_dir*.

    Each file is read as UTF-8 (undecodable bytes ignored), cleaned of
    PUNCTUATION characters, and tokenized with CLTK's Indian tokenizer.
    Tokens from different files are kept separate (the original fused the
    last word of one file with the first word of the next because it
    joined with '\\n' but never wrote a trailing newline between files).
    """
    # One C-level pass per file instead of a quadratic char-by-char rebuild.
    strip_table = str.maketrans('', '', PUNCTUATION)
    words = []
    for file_name in os.listdir(corpus_dir):
        full_path = os.path.join(corpus_dir, file_name)
        print(full_path)
        with open(full_path, encoding="utf-8", errors="ignore") as fh:
            cleaned = fh.read().translate(strip_table)
        words.extend(i_word(cleaned))
    return words


def main():
    # Start from a clean slate so repeated runs do not append duplicates.
    if os.path.isfile(STOPWORDS_FILE):
        os.remove(STOPWORDS_FILE)

    words = _collect_words(CORPUS_DIR)
    print("TOTAL WORDS : ", len(words))

    # The 200 most frequent tokens become the stopword candidates.
    # most_common() already yields unique tokens, so the original's
    # list(set(...)) was a no-op that only scrambled the frequency order;
    # the output is now ordered most-frequent first.
    fdist = FreqDist(words)
    common_words = [word for word, _count in fdist.most_common(200)]

    with open(STOPWORDS_FILE, 'w', encoding="utf-8") as out:
        out.write('\n'.join(common_words))
    print(common_words)


# NOTE: the datasets in hindi_corpora were not fully cleaned, so some
# redundant/misclassified words had to be removed manually afterwards.
# This could be avoided by improving the punctuation handling above.

if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment