Created
April 23, 2020 22:38
-
-
Save SandieIJ/3709576e5d7553dd59fb6344002e4dce to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**GENERATING N-GRAMS**\n", | |
"\n", | |
"For our models to infer the correct meaning of words, it is important to identify n-grams (sequences of words that frequently occur together, such as *hyper casual*) in the text data the models are trained on." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Break down sentences into words\n", | |
"def sent_to_words(sentences):\n", | |
"    \"\"\"Yield one list of word tokens per item in `sentences`.\n", | |
"\n", | |
"    Each item is coerced with str() and tokenized with\n", | |
"    gensim.utils.simple_preprocess(..., deacc=True).\n", | |
"    \"\"\"\n", | |
"    for sentence in sentences:\n", | |
"        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)\n", | |
"\n", | |
"data_words = list(sent_to_words(no_quotes))\n", | |
"\n", | |
"# Build the bigram and trigram models\n", | |
"bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)\n", | |
"# Train the trigram model on the bigram-transformed token lists (data_words),\n", | |
"# the same tokenized form the bigram model was trained on, rather than on\n", | |
"# the untokenized no_quotes items.\n", | |
"trigram = gensim.models.Phrases(bigram[data_words], threshold=100)\n", | |
"\n", | |
"# Faster way to get a sentence clubbed as a trigram/bigram\n", | |
"bigram_mod = gensim.models.phrases.Phraser(bigram)\n", | |
"trigram_mod = gensim.models.phrases.Phraser(trigram)\n", | |
"\n", | |
"def make_bigrams(texts):\n", | |
"    \"\"\"Apply bigram_mod to every tokenized document in `texts`; return a list.\"\"\"\n", | |
"    return [bigram_mod[doc] for doc in texts]\n", | |
"\n", | |
"def make_trigrams(texts):\n", | |
"    \"\"\"Apply bigram_mod, then trigram_mod, to every tokenized document in `texts`.\"\"\"\n", | |
"    return [trigram_mod[bigram_mod[doc]] for doc in texts]\n", | |
"\n", | |
"# Form Bigrams\n", | |
"data_words_bigrams = make_bigrams(data_words)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['brick_breaker', 'is', 'single', 'tap', 'hyper_casual', 'game', 'that', 'will', 'keep', 'you', 'hooked', 'for', 'hours', 'hold', 'the', 'screen', 'to', 'aim', 'swipe', 'the', 'ball', 'to', 'the', 'brick', 'and', 'break', 'all', 'the', 'bricks', 'easily', 'the', 'game', 'features', 'unlimited', 'levels', 'and', 'beautiful', 'color', 'balls']\n" | |
] | |
} | |
], | |
"source": [ | |
"# Show one tokenized description with its detected bigrams joined\n", | |
"sample_doc = data_words_bigrams[10]\n", | |
"print(sample_doc)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment