Created
May 31, 2018 03:21
-
-
Save himangSharatun/f96f765da807689a1d99b66f44e45311 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import re

import pandas as pd
from gensim.models import Word2Vec
# Display log: configure root logging at INFO level so log records are printed
# with a timestamp and severity.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load data: the CSV is read without a header row, so columns are addressed by
# integer position (column 0 is the one tokenized further down).
data = pd.read_csv('training-data.csv', header=None)
# Preprocess text
def tokenize(sentence):
    """Split *sentence* into lowercase alphabetic tokens of two or more letters.

    Dots are deleted (not replaced by a space) before matching, so dotted
    abbreviations collapse into one token: ``"U.S.A."`` -> ``"usa"``.
    Digits, other punctuation and single letters are dropped.

    Args:
        sentence: The raw text. Non-string values (e.g. the float NaN that
            pandas produces for an empty CSV cell) yield an empty list
            instead of raising ``AttributeError``.

    Returns:
        A list of lowercase tokens in order of appearance.
    """
    if not isinstance(sentence, str):
        return []
    without_dots = sentence.lower().replace(".", "")
    # The text is already lowercased, so matching a-z alone is sufficient.
    return re.findall(r"[a-z]{2,}", without_dots)
# Tokenize every entry of CSV column 0; the result is one token list per row.
sentences = [tokenize(sentence) for sentence in data[0].values]
# Train and save word2vec model
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim >= 4.0
# renamed them to `vector_size` and `epochs`, so adjust if upgrading.
# NOTE(review): per the gensim docs, `seed` alone does not give a fully
# reproducible run -- that also needs workers=1 and a fixed PYTHONHASHSEED.
model = Word2Vec(sentences, min_count=5, size=100, iter=500, seed=96)
# Despite the .bin extension this is gensim's own save format, to be read back
# with Word2Vec.load() (as done below), not the original word2vec C binary format.
model.save('word2vec.bin')
# Load and test word2vec model
loaded_w2v = Word2Vec.load('word2vec.bin')
# Both lookups raise KeyError if 'pilot' is not in the trained vocabulary
# (e.g. it occurred fewer than `min_count` times in the training data).
print(loaded_w2v.wv.most_similar(positive=['pilot']))
print(loaded_w2v.wv['pilot'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment