Created
April 23, 2020 22:40
-
-
Save SandieIJ/4af9e44d9bd9e73ab39d7ec00962bd6b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**LEMMATIZATION**\n", | |
"\n", | |
"Lemmatization is the process of attempting to identify and structure any relationships contained in the given tokenized document to accurately identify the lemma, which is the dictionary form of a word, including nouns, adjectives, verbs, and adverbs." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[['ready', 'play', 'amazing', 'exciting', 'good', 'shooting', 'game', 'fire', 'shoot', 'game', 'war', 'shooting', 'game', 'free', 'unknown', 'battle', 'strike', 'free', 'survival', 'mission', 'free', 'fire', 'unknown', 'shoot', 'action', 'game', 'face', 'dangerous', 'death', 'mission', 'exciting', 'survival', 'free', 'firing_squad', 'free', 'fire', 'shooting', 'game', 'commando', 'shoot', 'survival', 'game', 'army', 'soldier', 'crazy', 'challenging', 'shooting', 'arena', 'where', 'training', 'face', 'crazy', 'dangerous', 'death', 'mission', 'enemy', 'free', 'fire', 'shoot', 'unknown', 'battleground', 'mission', 'best', 'offline', 'shoot', 'game', 'commando', 'training', 'skill', 'squad', 'survival', 'mission', 'battleground', 'survival', 'free', 'fire', 'game', 'depend', 'war', 'shoot', 'squad', 'free', 'fire', 'battleground', 'war', 'battleground', 'game', 'army', 'last', 'player', 'firing_squad', 'face', 'crazy', 'death', 'mission', 'legend', 'fire', 'fire', 'free', 'fire', 'battleground', 'battleground', 'cross', 'fire', 'surgical_strike', 'fill', 'fierce', 'shooting', 'game', 'training', 'skill', 'fire', 'battleground', 'game', 'world', 'war', 'mission', 'where', 'commando', 'mission', 'good', 'shooting', 'survival', 'unknown', 'battle', 'strike', 'control', 'wait', 'sniper', 'shooting', 'skill', 'start', 'survival', 'battleground', 'strike', 'journey', 'modern', 'weapon', 'free', 'fire', 'survival', 'shoot', 'mission', 'sniper', 'gun', 'other', 'shoot', 'battlefield', 'weapon', 'graphic', 'real', 'firing_squad', 'mind_blowing', 'fire', 'squad', 'survival', 'mission', 'survival', 'strike', 'journey', 'legend', 'battle', 'strike', 'game', 'good', 'shooting', 'game', 'lot', 'gun', 'see', 'game', 'feel', 'good', 'gun', 'game', 'show', 'world', 'war', 'commando', 'training', 'skill', 'modern', 'weapon', 'sniper', 'gun', 'unknown', 'enemy', 'squad', 'commando', 'training', 'skill', 'free', 'fire', 'battleground', 'feature', 'variety', 'weapon', 'available', 'free', 'fire', 'shoot', 'missionsdozen', 'mission', 'war', 'shoot', 'squadreal', 'enemy', 'terrorist', 'ai', 'unknown_battleground', 'environment', 'system', 'detect', 'enemy', 'position', 'surgical', 'strikesimple', 'smooth', 'control', 'download', 'play', 'store', 'good', 'legend', 'free', 'fire', 'totally', 'free']]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Initialize spacy\n", | |
"nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n", | |
"\n", | |
"def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): \n", | |
" \"\"\"https://spacy.io/api/annotation\"\"\"\n", | |
" texts_out = []\n", | |
" for sent in texts:\n", | |
" doc = nlp(\" \".join(sent))\n", | |
" texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) \n", | |
" return texts_out\n", | |
"\n", | |
"# Perform lemmatization keeping only nouns, adjectives, verbs and adjectives\n", | |
"data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) \n", | |
"\n", | |
"# preview of lemmatized data taking into account nouns, adverbs, verbs and adjectives\n", | |
"print(data_lemmatized[:1])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment