(Python) Implement stochastic gradient ascent with L2 penalty. Compare convergence of stochastic gradient ascent with that of batch gradient ascent.
#Training logistic regression via stochastic gradient ascent
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
products = sframe.SFrame.to_dataframe(products)
#load the 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrences of each important word in every review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
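#for example (illustrative, not from the dataset): 'i love love this'.split().count('love')
#evaluates to 2, so each word column holds a per-review occurrence count of that word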
#split the data into a 90-10 split with 90% in the training set
products = sframe.SFrame(products)
train_data, validation_data = products.random_split(0.9, seed=1)
#convert the dataframe to a multi-dimensional array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant']+features
    features_frame = sframe.SFrame.to_dataframe(dataframe[features])
    feature_matrix = features_frame.as_matrix()
    return(feature_matrix, np.array(dataframe[label]))
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
#194 features (including the intercept)
#estimate the conditional probability with the link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute probability using the link (sigmoid) function
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
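#illustrative sanity check (toy matrix, not part of the original gist): with all-zero
#coefficients the score is 0 for every row, so every predicted probability is exactly 0.5
print predict_probability(np.array([[1., 2., -1.], [1., 0., 3.]]), np.zeros(3))   #-> [ 0.5  0.5]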
#compute derivative of log likelihood with respect to a single coefficient
def feature_derivative(errors, feature):
    derivative = np.dot(errors, feature)
    return derivative
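#the gist description mentions an L2 penalty; the stochastic-gradient loop below uses the
#plain (unpenalized) derivative, but a penalized version would subtract 2*l2_penalty*coefficient
#from every non-intercept derivative -- a minimal sketch, with l2_penalty and
#feature_is_constant as hypothetical arguments not used elsewhere in this gist
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    derivative = np.dot(errors, feature)
    if not feature_is_constant:
        derivative -= 2*l2_penalty*coefficient
    return derivative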
#compute average log-likelihood
def compute_avg_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    logexp = np.log(1.0+np.exp(-scores))
    #simple check to prevent overflow
    mask = np.isinf(logexp)
    logexp[mask] = -scores[mask]
    lp = np.sum((indicator-1)*scores-logexp)/float(len(feature_matrix))
    return lp
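#illustrative check (toy values, not from the original gist): with zero coefficients every
#prediction is 0.5, so the average log likelihood equals log(0.5) ~ -0.69314718
#regardless of the labels
print compute_avg_log_likelihood(np.array([[1., 2., -1.], [1., 0., 3.]]),
                                 np.array([+1, -1]), np.zeros(3))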
#compute the gradient for a single data point
j = 1
i = 10
coefficients = np.zeros(194)
predictions = predict_probability(feature_matrix_train[i:i+1,:], coefficients)
indicator = (sentiment_train[i:i+1]==1)
errors = indicator-predictions
gradient_single_data_point = feature_derivative(errors, feature_matrix_train[i:i+1,j])
print 'Gradient single data point: %s' % gradient_single_data_point
#modify the derivative to use a batch of data points
j = 1
i = 10
B = 10
coefficients = np.zeros(194)
predictions = predict_probability(feature_matrix_train[i:i+B,:], coefficients)
indicator = (sentiment_train[i:i+B]==1)
errors = indicator - predictions
gradient_mini_batch = feature_derivative(errors, feature_matrix_train[i:i+B,j])
print 'Gradient mini-batch data points: %s' % gradient_mini_batch
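#note (added for clarity): with B = 1 the slices i:i+B coincide with i:i+1, so the
#mini-batch gradient reduces to gradient_single_data_point computed above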
#implement stochastic gradient ascent
def logistic_regression_SG(feature_matrix, sentiment, initial_coefficients, step_size,
                           batch_size, max_iter):
    log_likelihood_all = []
    coefficients = np.array(initial_coefficients)
    np.random.seed(seed=1)
    #shuffle the data before the first pass
    permutation = np.random.permutation(len(feature_matrix))
    feature_matrix = feature_matrix[permutation,:]
    sentiment = sentiment[permutation]
    i = 0
    for itr in xrange(max_iter):
        #predict P(y=+1|x,w) for the current mini-batch
        predictions = predict_probability(feature_matrix[i:i+batch_size,:], coefficients)
        indicator = (sentiment[i:i+batch_size]==1)
        errors = indicator - predictions
        for j in xrange(len(coefficients)):
            derivative = feature_derivative(errors, feature_matrix[i:i+batch_size,j])
            #scale the step by 1/batch_size so step_size is comparable across batch sizes
            coefficients[j] += derivative*step_size/batch_size
        lp = compute_avg_log_likelihood(feature_matrix[i:i+batch_size,:], sentiment[i:i+batch_size],
                                        coefficients)
        log_likelihood_all.append(lp)
        if itr<=15 or (itr<=1000 and itr%100==0) or (itr<=10000 and itr%1000==0) or \
           itr%10000==0 or itr==max_iter-1:
            data_size = len(feature_matrix)
            print 'Iteration %*d: Average log likelihood (of data points [%0*d:%0*d]) = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr,
                 int(np.ceil(np.log10(data_size))), i,
                 int(np.ceil(np.log10(data_size))), i+batch_size, lp)
        #advance to the next mini-batch; reshuffle once the end of the data is reached
        i += batch_size
        if (i+batch_size)>len(feature_matrix):
            permutation = np.random.permutation(len(feature_matrix))
            feature_matrix = feature_matrix[permutation,:]
            sentiment = sentiment[permutation]
            i = 0
    return coefficients, log_likelihood_all
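#note (added for clarity): each iteration touches only batch_size points, so max_iter
#iterations correspond to roughly max_iter*batch_size/len(feature_matrix) passes over the data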
# stochastic gradient ascent
initial_coefficients = np.zeros(194)
step_size = 5e-1
batch_size = 1
max_iter = 10
coefficients_sg, log_likelihood_sg = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                                            initial_coefficients, step_size, batch_size, max_iter)
print log_likelihood_sg
#batch gradient ascent
initial_coefficients = np.zeros(194)
step_size = 5e-1
batch_size = len(feature_matrix_train)
max_iter = 200
coefficients_batch, log_likelihood_batch = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                                                  initial_coefficients, step_size, batch_size, max_iter)
print log_likelihood_batch
#compare stochastic gradient ascent and batch gradient ascent
step_size = 1e-1
batch_size = 100
initial_coefficients = np.zeros(194)
passes = 10
coefficients_all, log_likelihood_all = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                                              initial_coefficients, step_size, batch_size,
                                                              passes*len(feature_matrix_train)/batch_size)
import matplotlib.pyplot as plt
def make_plot(log_likelihood_all, len_data, batch_size, smoothing_window=1, label=''):
    plt.rcParams.update({'figure.figsize':(9,5)})
    #moving average over smoothing_window iterations to reduce noise in the curve
    log_likelihood_all_ma = np.convolve(np.array(log_likelihood_all),
                                        np.ones((smoothing_window,))/smoothing_window, mode='valid')
    plt.plot(np.array(range(smoothing_window-1, len(log_likelihood_all)))*float(batch_size)/len_data,
             log_likelihood_all_ma, linewidth=2.0, label=label)
    plt.xlabel('# of passes over data')
    plt.ylabel('Average log likelihood per data point')
    plt.show()
make_plot(log_likelihood_all, len(feature_matrix_train), batch_size)
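#optional comparison sketch (not part of the original run): smooth the noisy mini-batch
#curve with a 30-iteration moving average, then plot the batch-ascent curve; with
#batch_size = len(feature_matrix_train) each batch iteration is one full pass, so both
#x-axes are in units of passes over the data; the window of 30 is an arbitrary choice
make_plot(log_likelihood_all, len(feature_matrix_train), batch_size,
          smoothing_window=30, label='stochastic gradient, smoothed')
make_plot(log_likelihood_batch, len(feature_matrix_train), len(feature_matrix_train),
          smoothing_window=1, label='batch gradient ascent')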