Skip to content

Instantly share code, notes, and snippets.

@yvki
Created May 1, 2024 08:53
Show Gist options
  • Save yvki/cf95cbd5c582bc3ac7b97c48d6239a06 to your computer and use it in GitHub Desktop.
Save yvki/cf95cbd5c582bc3ac7b97c48d6239a06 to your computer and use it in GitHub Desktop.
Naive Bayes Text Classification Pipeline ⚙️ for Sentiment Analysis (or similar) tasks 📩
import string
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Load the labelled corpus; the rest of the script reads its 'text' and
# 'label' columns.
df = pd.read_csv('text_dataset.csv')
# Translation table that deletes every ASCII punctuation character; built
# once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalize one document before it is vectorized.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        text: Raw document. Missing values (``None``/NaN) are treated as an
            empty document; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The surviving tokens joined by single spaces ('' for a missing
        value).
    """
    if not isinstance(text, str):
        # Empty CSV cells reach ``Series.apply`` as NaN, which has no
        # ``.lower()``; an empty document is the sensible stand-in.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower().translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    return ' '.join(
        word for word in word_tokenize(text) if word not in stop_words
    )
# Normalize every document up front so the model only ever sees cleaned text.
df['cleaned_text'] = df['text'].apply(clean_text)
print(df['cleaned_text'].head())

# Keep the features as raw strings. The pipeline's TfidfVectorizer performs
# the vectorization itself, so its vocabulary and IDF weights are learned
# from the training rows only (and per fold during cross-validation).
# Pre-vectorizing with CountVectorizer and then handing the sparse matrix to
# TfidfVectorizer fails, because that step expects text, not counts.
X = df['cleaned_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])
model.fit(X_train, y_train)

# Hold-out evaluation on the 20% test split.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
classification = classification_report(y_test, predictions)
print("Accuracy Score: ", accuracy, "\nClassification Report: ", classification)

# 5-fold cross-validation over the full corpus; the whole pipeline
# (vectorizer included) is refit on each training fold.
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation Mean Accuracy:", cv_scores.mean())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment