Last active
July 27, 2024 14:47
-
-
Save jiahao87/d57a2535c2ed7315390920ea9296d79f to your computer and use it in GitHub Desktop.
Full code for preprocessing text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import spacy | |
import unidecode | |
from word2number import w2n | |
import contractions | |
# Load spaCy's medium English pipeline once at import time.
nlp = spacy.load('en_core_web_md')

# Negation words carry meaning for downstream tasks (e.g. sentiment),
# so take them off spaCy's default stop-word list.
deselect_stop_words = ['no', 'not']
for word in deselect_stop_words:
    nlp.vocab[word].is_stop = False
def strip_html_tags(text):
    """Strip all HTML markup from ``text`` and return the plain text.

    Tags are replaced by a single space so that words separated only
    by markup do not get fused together.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to a single space and trim the ends."""
    words = text.strip().split()
    return " ".join(words)
def remove_accented_chars(text):
    """Transliterate accented/non-ASCII characters to their ASCII form, e.g. café -> cafe."""
    return unidecode.unidecode(text)
def expand_contractions(text):
    """Expand shortened forms in ``text``, e.g. "don't" -> "do not"."""
    return contractions.fix(text)
def text_preprocessing(text, accented_chars=True, contractions=True,
                       convert_num=True, extra_whitespace=True,
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True,
                       stop_words=True):
    """Preprocess ``text`` and return a list of cleaned tokens.

    String-level steps run first (HTML stripping, whitespace collapsing,
    accent removal, contraction expansion, lowercasing), then the text is
    tokenised with spaCy and per-token filters are applied.

    Parameters (all flags default to True):
        text             -- raw input string
        accented_chars   -- transliterate accented characters to ASCII
        contractions     -- expand contractions ("don't" -> "do not")
        convert_num      -- convert number words to numeric values
        extra_whitespace -- collapse repeated whitespace
        lemmatization    -- replace tokens with their lemma
        lowercase        -- lowercase the whole text
        punctuations     -- drop punctuation tokens
        remove_html      -- strip HTML tags
        remove_num       -- drop numeric tokens
        special_chars    -- drop symbol (SYM) tokens
        stop_words       -- drop stop words (numeric stop words are kept)

    Returns:
        list of tokens (str, or int when ``convert_num`` applied).

    NOTE: when ``remove_num`` is True, NUM tokens are removed before the
    ``convert_num`` branch is reached, so ``convert_num`` only has an
    effect with ``remove_num=False``.
    """
    # --- string-level preprocessing ---
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    # --- token-level preprocessing ---
    doc = nlp(text)  # tokenise and POS-tag
    clean_text = []

    for token in doc:
        keep = True
        edit = token.text
        # Drop stop words, except numeric ones (handled by the NUM rules below).
        if stop_words and token.is_stop and token.pos_ != 'NUM':
            keep = False
        # Drop punctuation tokens.
        elif punctuations and token.pos_ == 'PUNCT':
            keep = False
        # Drop symbol tokens (e.g. currency signs tagged SYM).
        elif special_chars and token.pos_ == 'SYM':
            keep = False
        # Drop numbers (POS-tagged NUM or literally numeric text).
        elif remove_num and (token.pos_ == 'NUM' or token.text.isnumeric()):
            keep = False

        if keep:
            if convert_num and token.pos_ == 'NUM':
                # w2n raises ValueError on NUM-tagged tokens it cannot parse
                # (e.g. "2nd", "2019-20"); keep the raw text instead of crashing.
                try:
                    edit = w2n.word_to_num(token.text)
                except ValueError:
                    edit = token.text
            # Lemmatise everything else; "-PRON-" is spaCy v2's pronoun
            # placeholder lemma and is skipped to keep the surface form.
            elif lemmatization and token.lemma_ != "-PRON-":
                edit = token.lemma_

        # Keep only tokens that survived filtering and are non-empty.
        if keep and edit != "":
            clean_text.append(edit)
    return clean_text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment