Skip to content

Instantly share code, notes, and snippets.

@ryanpadilha
Created April 18, 2025 22:22
Show Gist options
  • Save ryanpadilha/955efe2c5438a15604f77d6c746179a1 to your computer and use it in GitHub Desktop.
Save ryanpadilha/955efe2c5438a15604f77d6c746179a1 to your computer and use it in GitHub Desktop.
# 1. Bag of Words
# Build a term-count matrix: each row is a sentence, each column a vocabulary word.
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["gato dormindo", "cachorro latindo", "gato e cachorro brincando"]

bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(corpus)

print("Vocabulário:", bow_vectorizer.get_feature_names_out())
print("Matriz de contagem (Bag of Words):")
print(bow_matrix.toarray())
# 2. TF-IDF
# Re-weight the same corpus so terms common to every sentence score lower.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

print("Vocabulário:", tfidf_vectorizer.get_feature_names_out())
print("Matriz TF-IDF:")
print(tfidf_matrix.toarray())
# 3. Word2Vec (Skip-gram)
# Train dense word embeddings on the tiny corpus and print each word's vector.
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer models required by word_tokenize.
nltk.download('punkt_tab')

# Simple preprocessing: lowercase each sentence and split it into word tokens.
corpus = ["gato dormindo", "cachorro latindo", "gato e cachorro brincando"]
tokenized = [word_tokenize(frase.lower()) for frase in corpus]

# Train a Skip-gram model (sg=1). min_count=1 keeps every word, which is
# necessary on a corpus this small; vector_size=10 keeps the demo readable.
model = Word2Vec(sentences=tokenized, vector_size=10, window=2, sg=1, min_count=1)

# Print the learned vector for each vocabulary word.
# BUG FIX: the print must be indented inside the for loop — the original had it
# at column 0 right after the for header, which raises an IndentationError.
for word in model.wv.index_to_key:
    print(f"Vetor para '{word}': {model.wv[word]}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment