This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hold out 30% of the rows for evaluation. No random_state is passed, so the
# split (and therefore the score) differs from run to run.
train, test = train_test_split(data, test_size=0.3)
# Select the feature columns by dropping the label by name rather than
# assuming it is the last column; this stays correct if the column order
# changes or the label column is not positioned last.
cols = train.columns.drop('sentiment')
# NOTE: despite the variable name, this is a multinomial naive Bayes model.
gnb = MultinomialNB()
gnb.fit(train[cols], train['sentiment'])
y_pred = gnb.predict(test[cols])
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%" | |
.format( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the rows by label: 1 is treated as positive, 0 as negative.
pos_reviews = data[data['sentiment'] == 1]
neg_reviews = data[data['sentiment'] == 0]
# Column-wise totals for each class, as plain NumPy arrays.
pnum = np.array(pos_reviews[pos_reviews.columns].sum())
# The original indexed with the undefined name `ntg_reviews` (NameError).
nnum = np.array(neg_reviews[neg_reviews.columns].sum())
# Boolean mask: True where a column's positive total exceeds its negative total.
dif = pnum > nnum
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One row per entry of `lemmatized`, one 0/1 column per entry of `top5000`:
# 1 when the word is contained in that entry, 0 otherwise.
word_matrix = [
    [1 if word in doc else 0 for word in top5000]
    for doc in lemmatized
]
# NOTE(review): a DataFrame is passed as `index`; this is kept as in the
# original, but it looks unintended and may be rejected by pandas -- confirm
# whether a plain default index was meant.
features = pd.DataFrame(word_matrix, columns=top5000, index=pd.DataFrame(filtered_tokens))
# `.values` assigns the labels positionally, ignoring the index of `data`.
# NOTE(review): if 'sentiment' is itself one of the `top5000` words, this
# assignment overwrites that word's column -- verify.
features['sentiment'] = data['sentiment'].values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter
from operator import itemgetter

# Flatten the per-review token lists into a single list of tokens.
flat_list = [token for review_tokens in filtered_tokens for token in review_tokens]
# Count how many times each word appears
count = Counter(flat_list).items()
# Sort ascending by count and then reverse, giving most-frequent-first.
# This is intentionally not `sorted(..., reverse=True)`: that would order
# words with equal counts differently from the original.
sorted_count = sorted(count, key=itemgetter(1))
sorted_count.reverse()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# De-duplicated English stop-word list from NLTK.
en_stopwords = list(set(nltk.corpus.stopwords.words('english')))
# remove punctuation from data
# Iterate over the review texts: iterating the DataFrame itself yields its
# column labels, so the original never cleaned any review.
clean = [re.sub(r'[^\w\s]', '', review).lower() for review in data['text']]
# Tokenise the cleaned text so the punctuation removal above actually takes
# effect (the original tokenised the raw text and left `clean` unused).
tokens = [word_tokenize(review) for review in clean]
filtered_tokens = [] | |
# tokens that are not stopwords collected here | |
for i in tokens: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the labelled corpus: one row per review file, holding the review text
# (its words joined by single spaces) and a binary label, 1 for 'pos' else 0.
#
# Fixes relative to the original loop:
# - `movie_reviews.categories(fileid)` returns a list of categories, so the
#   comparison `== 'pos'` was always False and every review was labelled 0;
#   membership is tested instead.
# - Rows are collected first and the frame is built once; `DataFrame.append`
#   in a loop copies the frame on every iteration and no longer exists in
#   pandas 2.0+.
# - The loop variable no longer shadows the builtin `id`.
# The resulting frame has a default 0..n-1 index rather than every row being
# indexed 0.
data = pd.DataFrame(
    [
        {
            'text': ' '.join(movie_reviews.words(fileid)),
            'sentiment': 1 if 'pos' in movie_reviews.categories(fileid) else 0,
        }
        for fileid in movie_reviews.fileids()
    ],
    columns=['text', 'sentiment'],
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk
import numpy as np
import pandas as pd
import regex as re
from nltk import LancasterStemmer
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle

# Fetch the NLTK data packages used by the corpus readers and tokenizer.
nltk.download('all')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
from matplotlib.animation import FuncAnimation | |
# Ten evenly spaced angles covering [-pi/2, pi/2].
p = np.linspace(-np.pi / 2, np.pi / 2, 10)
x = np.sin(p)
# 20 x 3 table of points on the unit circle: the first ten rows are
# (sin p, cos p), the last ten are (sin p, -cos p). The third column is all
# ones -- presumably a homogeneous coordinate; confirm against how `v` is used.
v = np.column_stack((
    np.concatenate((x, x)),
    np.concatenate((np.cos(p), -np.cos(p))),
    np.ones(2 * len(p)),
))