from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
# Mappings between integer class labels and emotion names.
label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
# Extra emoticon-to-tag mappings that supplement ekphrasis's built-in emoticons dict.
emoticons_additional = {
    '(^・^)': '<happy>', ':‑c': '<sad>', '=‑d': '<happy>', ":'‑)": '<happy>',
    ':‑d': '<laugh>', ':‑(': '<sad>', ';‑)': '<happy>', ':‑)': '<happy>',
    ':\\/': '<sad>', 'd=<': '<annoyed>', ':‑/': '<annoyed>', ';‑]': '<happy>',
    '(^�^)': '<happy>', 'angru': 'angry', "d‑':": '<annoyed>', ":'‑(": '<sad>',
    ":‑[": '<annoyed>', '(�?�)': '<happy>', 'x‑d': '<laugh>',
}
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              "emphasis", "censored"},
    fix_html=True,  # fix HTML tokens
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # list of dictionaries for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons, emoticons_additional]
)
def tokenize(text):
    """Run the ekphrasis pipeline on a raw string and rejoin the tokens with spaces."""
    return " ".join(text_processor.pre_process_doc(text))
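
# Minimal usage sketch (an assumption, not part of the original gist): it assumes
# the ekphrasis "twitter" word statistics are available -- ekphrasis downloads
# them automatically on first run. The expected tokens described in the comments
# are illustrative; exact output depends on the ekphrasis version.
if __name__ == "__main__":
    sample = "I LOOOVE #MachineLearning!!! :-)"
    print(tokenize(sample))
    # Expect lowercased tokens with annotations such as <allcaps>, <hashtag>,
    # <elongated>, and the trailing emoticon replaced by <happy>.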