Skip to content

Instantly share code, notes, and snippets.

@sameeramin
Created May 2, 2021 14:32
Show Gist options
  • Save sameeramin/8183a17f0f0912a0987a389f095e72df to your computer and use it in GitHub Desktop.
Doc2Vec Based Article Recommender
# Import libraries and modules.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
# Embed an unseen query sentence with the trained Doc2Vec model and rank
# the training documents against it by cosine similarity.
query_tokens = "this is a new sentence".split(" ")
query_vector = model.infer_vector(query_tokens)
# most_similar returns (doc_tag, cosine_similarity) pairs, best first.
similar_sentences = model.docvecs.most_similar(positive=[query_vector])
# Restore the previously trained Doc2Vec model from disk so new queries
# can be embedded without retraining.
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

model = Doc2Vec.load("articles.model")
# Build the stopword set once: stopwords.words() re-reads the corpus file on
# every call, and a set gives O(1) membership tests inside the loop.
stop_words = set(stopwords.words())
# BUG FIX: the original condition `if not sw_removed in stopwords.words()`
# tested whether each *whole abstract* was a stopword (never true), so no
# stopwords were ever removed. The intent is to drop stopword *tokens*
# from each tokenized, lower-cased document.
common_texts = [
    [token for token in word_tokenize(doc.lower()) if token not in stop_words]
    for doc in common_texts
]
common_texts
# Load the article corpus and keep the first 80 abstracts as training texts.
file = pd.read_csv("articles.csv")
common_texts_pre = file["abstract"]
common_texts = common_texts_pre.head(80)
common_texts
import pandas as pd
# Present the similarity ranking as a table: map each (doc_tag, score)
# pair returned by most_similar back to its source abstract.
# doc_tag is the integer tag assigned in TaggedDocument, i.e. the row
# position of the abstract in common_texts_pre.
output = [
    [common_texts_pre[doc_tag], score]
    for doc_tag, score in similar_sentences
]
pd.DataFrame(output, columns=["common_texts", "cosine_similarity"])
# Doc2Vec trains on TaggedDocument objects: (token list, [unique tag]).
# The document's position in common_texts is used as its tag so that
# similarity results can later be mapped back to the source abstract.
tagged_data = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
tagged_data
# Training hyper-parameters.
max_epochs = 100   # total passes over the corpus
vec_size = 20      # dimensionality of the document vectors
alpha = 0.025      # initial learning rate

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,   # keep every token; the corpus is tiny
                dm=1)          # PV-DM (distributed-memory) architecture
model.build_vocab(tagged_data)

# BUG FIX: the original called model.train() inside a Python loop while
# decaying model.alpha by hand. Each train() call itself runs
# `model.epochs` internal epochs, so the corpus was trained roughly
# 100 * model.epochs times, and the manual decay fought gensim's own
# learning-rate schedule. The gensim docs recommend a single train()
# call: gensim then decays alpha from `alpha` down to `min_alpha`
# linearly over exactly max_epochs passes.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=max_epochs)

model.save("articles.model")
print("Model Saved")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment