himangSharatun · May 31, 2018 03:21
diff --git a/w2v-training.py b/w2v-training.py
 import pandas as pd
 from gensim.models import Word2Vec
 import logging
 import re

 # Display log: show INFO-and-above records as "<time> : <level> : <message>".
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s : %(levelname)s : %(message)s',
 )

 # Load data: the CSV is read without a header row, so columns are numbered
 # from 0 and the text column is addressed as data[0].
 data = pd.read_csv('training-data.csv', header=None)

 # Preprocess text
 def tokenize(sentence):
     """Split a sentence into lowercase alphabetic tokens.

     The text is lowercased, every "." is deleted (so "u.s.a" becomes "usa"),
     and each run of two or more ASCII letters is returned as one token.
     Digits, other punctuation and single letters are dropped.
     """
     lowered = sentence.lower()
     # Delete dots first so dotted abbreviations collapse into a single token.
     without_dots = re.sub("[.]", "", lowered)
     tokens = re.findall("[A-Za-z]{2,}", without_dots)
     return tokens

 # Tokenize every entry of the first CSV column.
 sentences = [tokenize(text) for text in data[0].values]

 # Train and save word2vec model
 # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4.0
 # renamed them to `vector_size` and `epochs` -- confirm the installed version.
 model = Word2Vec(
     sentences,
     size=100,
     iter=500,
     min_count=5,
     seed=96,
 )
 model.save('word2vec.bin')

 # Load and test word2vec model: reload from disk, then print the nearest
 # neighbours of 'pilot' followed by its raw vector.
 loaded_w2v = Word2Vec.load('word2vec.bin')
 keyed_vectors = loaded_w2v.wv
 print(keyed_vectors.most_similar(positive=['pilot']))
 print(keyed_vectors['pilot'])
	import pandas as pd
	from gensim.models import Word2Vec
	import logging
	import re

	# Display log: show INFO-and-above records as "<time> : <level> : <message>".
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s : %(levelname)s : %(message)s',
	)

	# Load data: the CSV is read without a header row, so columns are numbered
	# from 0 and the text column is addressed as data[0].
	data = pd.read_csv('training-data.csv', header=None)

	# Preprocess text
	def tokenize(sentence):
	    """Split a sentence into lowercase alphabetic tokens.

	    The text is lowercased, every "." is deleted (so "u.s.a" becomes "usa"),
	    and each run of two or more ASCII letters is returned as one token.
	    Digits, other punctuation and single letters are dropped.

	    Args:
	        sentence: The text to tokenize; must be a ``str``.

	    Returns:
	        A list of lowercase tokens, possibly empty.
	    """
	    # A literal dot needs no regex; str.replace does the same job.
	    without_dots = sentence.lower().replace(".", "")
	    # The text is already lowercase, so matching a-z alone is sufficient.
	    return re.findall(r"[a-z]{2,}", without_dots)

	# Tokenize every entry of the first CSV column. (The original loop body was
	# not indented under its `for`, which Python rejects.)
	sentences = [tokenize(sentence) for sentence in data[0].values]

	# Train and save word2vec model
	# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4.0
	# renamed them to `vector_size` and `epochs` -- confirm the installed version.
	model = Word2Vec(sentences, min_count=5, size=100, iter=500, seed=96)
	model.save('word2vec.bin')

	# Load and test word2vec model: reload from disk, then print the nearest
	# neighbours of 'pilot' followed by its raw vector.
	loaded_w2v = Word2Vec.load('word2vec.bin')
	print(loaded_w2v.wv.most_similar(positive=['pilot']))
	print(loaded_w2v.wv['pilot'])