@wnghdcjfe
Created April 8, 2018 08:47
Study for week 2
import numpy as np
import matplotlib.pyplot as plt
class SentiAnalyzer:
    '''
    Naive Bayes classifier for positive/negative product reviews.
    '''
    # The constructor accepts "sentidata" and "word"
    def __init__(self, sentidata, word):
        self.sentidata = sentidata    # original dataset
        self.numTraining = 150        # number of training cases
        self.wordLimit = 1500         # number of words of interest
        self.dataWord = word          # list of words
        print('This is a senti analyzer')
    def runAnalysis(self, idxReview):
        probLogPositive = 0
        probLogNegative = 0
        idxUsedWords, usedWords = self.findUsedWords(idxReview)
        for i in range(len(idxUsedWords)):
            idxWord = idxUsedWords[i]
            positive, negative = self.calculateProbWord(idxWord)
            probLogPositive = probLogPositive + np.log(positive)
            probLogNegative = probLogNegative + np.log(negative)
        positiveProb1, negativeProb1 = self.calculateProbReview()
        probLogPositive = probLogPositive + np.log(positiveProb1)
        probLogNegative = probLogNegative + np.log(negativeProb1)
        # Return correct as 1 if the predicted sentiment matches the true review label,
        # and 0 otherwise.
        # self.dataReviewTesting stores the true label (1 means a positive review).
        if self.dataReviewTesting[idxReview] == 1:
            if probLogPositive > probLogNegative:  #1
                correct = 1  #2
            else:
                correct = 0  #3
        else:
            if probLogNegative > probLogPositive:  #4
                correct = 1  #5
            else:
                correct = 0  #6
        return probLogPositive, probLogNegative, correct
    def runWholeAnalysis(self):
        cnt = 0
        numCorrect = np.zeros((int(self.numTraining/30) + 1, 1))
        # Loop over training-set sizes 0, 30, 60, 90, 120, 150 and store, for each size,
        #   numCorrect[cnt] = (number of correctly classified test reviews) / (size of the test set)
        for j in range(0, self.numTraining+1, 30):  #7
            self.dataSentimentTraining = self.sentidata[self.shuffle[0:j+1], 0:self.wordLimit]
            self.dataReviewTraining = self.sentidata[self.shuffle[0:j+1], -1]
            numCorrect[cnt] = 0
            for i in range(np.shape(self.dataSentimentTesting)[0]):
                p, n, c = self.runAnalysis(i)
                if c == 1:
                    numCorrect[cnt] += 1
            numCorrect[cnt] = numCorrect[cnt] / np.shape(self.dataSentimentTesting)[0]
            cnt += 1
        return numCorrect
    def runExperiments(self, numReplicate):
        average = np.zeros((int(self.numTraining/30 + 1), 1))
        averageSq = np.zeros((int(self.numTraining/30 + 1), 1))
        # Repeat the whole analysis numReplicate times, each time with a fresh random train/test split
        for i in range(numReplicate):
            self.shuffle = np.arange(np.shape(self.sentidata)[0])
            np.random.shuffle(self.shuffle)
            self.dataSentimentTesting = self.sentidata[self.shuffle[self.numTraining+1:198], 0:self.wordLimit]
            self.dataReviewTesting = self.sentidata[self.shuffle[self.numTraining+1:198], -1]
            # correct rate per training-set size from runWholeAnalysis()
            correct = self.runWholeAnalysis()
            # accumulate the sum per training-set size
            average = average + correct  #8
            # accumulate the sum of squares per training-set size
            averageSq += correct * correct  #9
        # finish the calculation of the average
        average = average / numReplicate  #10
        # finish the calculation of the average of squares
        averageSq = averageSq / numReplicate  #11
        # standard deviation from E[X^2] - (E[X])^2
        std = np.sqrt(averageSq - average*average)  #12
        plt.errorbar(np.arange(0, self.numTraining+1, 30), average, std)
        plt.title('Product Review Classification')
        plt.xlabel('Number of Cases')
        plt.ylabel('Percentage of Correct Classification')
        plt.show()
    def calculateProbWord(self, idxWord):
        # transpose: swap rows and columns so the occurrence column becomes a row vector
        occurrence = [[row[idxWord]] for row in self.dataSentimentTraining]
        positive = np.matmul(np.transpose(occurrence), self.dataReviewTraining)
        dataNegReviewTraining = [[1-row] for row in self.dataReviewTraining]
        negative = np.matmul(np.transpose(occurrence), dataNegReviewTraining)
        # add-one smoothing so a zero count never yields a zero probability
        positiveProb = int(positive+1) / float(positive+negative+1)
        negativeProb = int(negative+1) / float(positive+negative+1)
        return positiveProb, negativeProb
    def calculateProbReview(self):
        numReviews = max(np.shape(self.dataReviewTraining))
        positive = np.sum(self.dataReviewTraining)
        negative = numReviews - positive
        positiveProb = int(positive + 1) / float(numReviews + 1)
        negativeProb = int(negative + 1) / float(numReviews + 1)
        return positiveProb, negativeProb
    def findUsedWords(self, idx):
        idxUsedWords = np.where(self.dataSentimentTesting[idx] == 1)[0]
        usedWords = self.dataWord[idxUsedWords]
        return idxUsedWords, usedWords