Skip to content

Instantly share code, notes, and snippets.

@islem-esi
Created April 23, 2022 11:18
Show Gist options
  • Save islem-esi/9ff5b567f17398ad2b02d132346ecc44 to your computer and use it in GitHub Desktop.
Save islem-esi/9ff5b567f17398ad2b02d132346ecc44 to your computer and use it in GitHub Desktop.
from nltk.tokenize import word_tokenize
from tokenize import tokenize
from io import BytesIO
# Substring -> replacement table. tokenize_python() applies every entry (via
# str.replace) to tokens that contain a space before re-splitting them.
# It starts out empty here.
rename_globals = {}
def tokenize_python(code):
    """Split Python source text into a flat list of token strings.

    The text is lexed with the standard-library ``tokenize`` function. Empty
    token strings and bare newline tokens are discarded, and the first
    remaining token (the encoding marker that ``tokenize`` emits first) is
    dropped. If lexing fails, the whole text is handed to
    ``tokenize_if_block`` instead.

    Tokens that contain a space (for example string literals or comments)
    are post-processed: every ``rename_globals`` substitution is applied,
    single and double quote characters are removed, and the result is
    re-split with ``tokenize_if_block``. All other tokens are kept as-is.

    Args:
        code: Python source code as a ``str``.

    Returns:
        A list of token strings in source order.
    """
    token_stream = tokenize(BytesIO(code.encode('utf-8')).readline)
    try:
        # tokenize() is a lazy generator, so lexing errors only surface while
        # the list is being built; that is why the comprehension is guarded.
        tokens = [tok[1] for tok in token_stream if tok[1] not in ('', '\n')][1:]
    except Exception:
        # Best-effort fallback for source the lexer rejects. Catching
        # Exception (rather than a bare except) keeps the fallback but no
        # longer swallows KeyboardInterrupt / SystemExit.
        tokens = tokenize_if_block(code)

    clean_tokens = []
    for t in tokens:
        if ' ' not in t:
            clean_tokens.append(t)
            continue
        # Apply the global renames before the token is broken up further.
        for old, new in rename_globals.items():
            t = t.replace(old, new)
        unquoted = t.replace('"', '').replace("'", '')
        clean_tokens += tokenize_if_block(unquoted)
    return clean_tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment