NLP Python code
@codingdudecom · Created November 1, 2023 06:23
# Runs under Pyodide: `fetch` here is the browser's fetch API, exposed via the js bridge
from js import fetch
import nltk
from nltk.util import ngrams
from pathlib import Path
import os, zipfile
stopwords = "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now"
stopwords = stopwords.split(",")
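# e.g. filtering "this is the quick brown fox" against this list
# leaves ["quick", "brown", "fox"]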
punkt_downloaded = False

async def download_punkt():
    """Download and unzip the NLTK punkt tokenizer data (once per session)."""
    global punkt_downloaded
    if not punkt_downloaded:
        response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
        js_buffer = await response.arrayBuffer()
        py_buffer = js_buffer.to_py()  # this is a memoryview
        stream = py_buffer.tobytes()   # now we have a bytes object
        d = Path("/nltk_data/tokenizers")
        d.mkdir(parents=True, exist_ok=True)
        Path('/nltk_data/tokenizers/punkt.zip').write_bytes(stream)
        # extract punkt.zip
        zipfile.ZipFile('/nltk_data/tokenizers/punkt.zip').extractall(
            path='/nltk_data/tokenizers/'
        )
        # defensive: make sure NLTK searches /nltk_data when resolving the tokenizer
        if "/nltk_data" not in nltk.data.path:
            nltk.data.path.append("/nltk_data")
        punkt_downloaded = True
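
# Note (assumption, untested): newer NLTK releases (3.9+) resolve word_tokenize
# via the "punkt_tab" package rather than "punkt"; if tokenization fails there,
# the same fetch-and-unzip approach should apply to the punkt_tab.zip archive
# from the same nltk_data repository.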

async def extract_keywords(text):
    # Ensure the punkt tokenizer data is available (no-op after the first call)
    await download_punkt()
    # check file contents in /nltk_data/tokenizers/
    # print(os.listdir("/nltk_data/tokenizers/punkt"))
    # return nltk.word_tokenize(text)
    words = nltk.word_tokenize(text)
    # keep alphanumeric tokens only (drops punctuation)
    words = [word for word in words if word.isalnum()]
    filtered_words = [word for word in words if word.lower() not in stopwords]
    # Create bi-grams, tri-grams and quad-grams
    bigrams = list(ngrams(filtered_words, 2))
    trigrams = list(ngrams(filtered_words, 3))
    quadgrams = list(ngrams(filtered_words, 4))
    # Calculate frequency distributions for each n-gram size
    bigram_freq_dist = nltk.FreqDist(bigrams)
    trigram_freq_dist = nltk.FreqDist(trigrams)
    quadgram_freq_dist = nltk.FreqDist(quadgrams)
    # Take the 10 most common n-grams of each size
    data = bigram_freq_dist.most_common(10) + trigram_freq_dist.most_common(10) + quadgram_freq_dist.most_common(10)
    # Format each n-gram as ["joined phrase", count]
    formatted_data = [[" ".join(keyword), count] for keyword, count in data]
    return formatted_data
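
# For reference, a minimal usage sketch. It assumes the code above is already
# loaded in a Pyodide session (where `from js import fetch` works); the `demo`
# helper and the sample text are illustrative only, not part of the original.

async def demo():
    sample = (
        "Keyword extraction with Python and NLTK runs in the browser. "
        "Python keyword extraction in the browser needs no server at all."
    )
    # extract_keywords returns [phrase, count] pairs
    for phrase, count in await extract_keywords(sample):
        print(count, phrase)

# From JavaScript, something like pyodide.runPythonAsync("await demo()")
# would await the coroutine (Pyodide supports top-level await there).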