#Training logistic regression via stochastic gradient ascent:
#implement stochastic gradient ascent and compare its convergence with that of batch gradient ascent
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
products = sframe.SFrame.to_dataframe(products)
#load 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrences of the important words
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
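#a minimal sanity check of the token counting (illustrative example; assumes
#'baby' is one of the important words) -- note the count is case-sensitive
assert remove_punctuation('my baby loves this baby swing').split().count('baby') == 2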
#split the data into a 90-10 split with 90% in the training set
products=sframe.SFrame(products)
train_data, validation_data = products.random_split(0.9, seed=1)
#convert the dataframe to a multi-dimensional array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant']+features
    features_frame = sframe.SFrame.to_dataframe(dataframe[features])
    feature_matrix = features_frame.as_matrix()
    return(feature_matrix, np.array(dataframe[label]))
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
#194 features (including the intercept)
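#shape sanity check: one row per review, one column per important word plus
#the intercept (the exact row counts depend on the 90-10 split above)
print feature_matrix_train.shape #expected: (~0.9*len(products), 194)
print feature_matrix_valid.shape #expected: (~0.1*len(products), 194)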
#estimate conditional probability with link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute probability using the link function
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
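#a quick sketch of the link function on toy inputs (values chosen only for
#illustration): P(y=+1|x,w) = 1/(1+exp(-w.x))
example_features = np.array([[1., 2., 3.], [1., -1., -1.]])
example_coefficients = np.array([1., 3., -1.])
#scores are [1+6-3, 1-3+1] = [4, -1], so the probabilities should be
#approximately [0.98201379, 0.26894142]
print predict_probability(example_features, example_coefficients)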
#compute derivative of log likelihood with respect to a single coefficient
def feature_derivative(errors, feature):
    derivative = np.dot(errors, feature)
    return derivative
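#for coefficient j the partial derivative of the log likelihood is
#  d(ll)/d(w_j) = sum_i (1[y_i=+1] - P(y_i=+1|x_i,w)) * x_ij
#i.e. the dot product of the error vector with the j-th feature column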
#compute average log-likelihood
def compute_avg_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    logexp = np.log(1.0+np.exp(-scores))
    #simple check to prevent overflow
    mask = np.isinf(logexp)
    logexp[mask] = -scores[mask]
    lp = np.sum((indicator-1)*scores-logexp)/float(len(feature_matrix))
    return lp
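#what this computes: the average log likelihood over the given data,
#  ll_A(w) = (1/N) * sum_i [ (1[y_i=+1]-1)*score_i - log(1+exp(-score_i)) ]
#which equals (1/N) * sum_i log P(y_i|x_i,w); the overflow guard uses the
#approximation log(1+exp(-s)) ~ -s when -s is very large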
#compute the gradient for a single data point
j = 1
i = 10
coefficients = np.zeros(194)
predictions = predict_probability(feature_matrix_train[i:i+1,:], coefficients)
indicator = (sentiment_train[i:i+1]==1)
errors = indicator-predictions
gradient_single_data_point = feature_derivative(errors, feature_matrix_train[i:i+1,j])
print 'Gradient single data point: %s' % gradient_single_data_point
#modifying the derivative for using a batch of data points
j = 1
i = 10
B = 10
coefficients = np.zeros(194)
predictions = predict_probability(feature_matrix_train[i:i+B,:], coefficients)
indicator = (sentiment_train[i:i+B]==1)
errors = indicator - predictions
gradient_mini_batch = feature_derivative(errors, feature_matrix_train[i:i+B,j])
print 'Gradient mini-batch data points: %s' % gradient_mini_batch
#implement stochastic gradient ascent
def logistic_regression_SG(feature_matrix, sentiment, initial_coefficients, step_size,
                           batch_size, max_iter):
    log_likelihood_all = []
    coefficients = np.array(initial_coefficients)
    np.random.seed(seed=1)
    permutation = np.random.permutation(len(feature_matrix))
    feature_matrix = feature_matrix[permutation,:]
    sentiment = sentiment[permutation]
    i = 0
    for itr in xrange(max_iter):
        predictions = predict_probability(feature_matrix[i:i+batch_size,:], coefficients)
        indicator = (sentiment[i:i+batch_size]==1)
        errors = indicator - predictions
        for j in xrange(len(coefficients)):
            derivative = feature_derivative(errors, feature_matrix[i:i+batch_size,j])
            coefficients[j] += derivative*step_size/batch_size
        lp = compute_avg_log_likelihood(feature_matrix[i:i+batch_size,:],
                                        sentiment[i:i+batch_size], coefficients)
        log_likelihood_all.append(lp)
        if itr <= 15 or (itr<=1000 and itr%100==0) or (itr<=10000 and itr%1000==0) or \
           itr%10000==0 or itr==max_iter-1:
            data_size = len(feature_matrix)
            print 'Iteration %*d: Average log likelihood (of data points [%0*d:%0*d]) = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, \
                 int(np.ceil(np.log10(data_size))), i, \
                 int(np.ceil(np.log10(data_size))), i+batch_size, lp)
        i += batch_size
        #once the next batch would run past the end of the data, reshuffle and restart
        if (i+batch_size)>len(feature_matrix):
            permutation = np.random.permutation(len(feature_matrix))
            feature_matrix = feature_matrix[permutation,:]
            sentiment = sentiment[permutation]
            i = 0
    return coefficients, log_likelihood_all
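#note: with batch_size=1 this is plain stochastic gradient ascent, while
#batch_size=len(feature_matrix) makes every iteration use all the data, so the
#same loop doubles as batch gradient ascent; dividing the step by batch_size
#keeps the update magnitude comparable across batch sizes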
# stochastic gradient ascent
initial_coefficients = np.zeros(194)
step_size = 5e-1
batch_size = 1
max_iter = 10
coefficients_sg, log_likelihood_sg = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                        initial_coefficients, step_size, batch_size, max_iter)
print log_likelihood_sg
#batch gradient ascent
initial_coefficients = np.zeros(194)
step_size = 5e-1
batch_size = len(feature_matrix_train)
max_iter = 200
coefficients_batch, log_likelihood_batch = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                              initial_coefficients, step_size, batch_size, max_iter)
print log_likelihood_batch
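#here batch_size equals the full training set, so each of the 200 iterations
#is one complete pass over the data, i.e. classic batch gradient ascent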
#compare stochastic gradient ascent and batch gradient ascent
step_size = 1e-1
batch_size = 100
initial_coefficients = np.zeros(194)
passes = 10
coefficients_all, log_likelihood_all = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                          initial_coefficients, step_size, batch_size,
                                          passes*len(feature_matrix_train)/batch_size)
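#passes*len(feature_matrix_train)/batch_size iterations amount to 10 full
#passes over the training data at 100 points per update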
import matplotlib.pyplot as plt
def make_plot(log_likelihood_all, len_data, batch_size, smoothing_window=1, label=''):
    plt.rcParams.update({'figure.figsize':(9,5)})
    #moving average of the log likelihood to smooth out batch-to-batch noise
    log_likelihood_all_ma = np.convolve(np.array(log_likelihood_all),
                                        np.ones((smoothing_window,))/smoothing_window, mode='valid')
    plt.plot(np.array(range(smoothing_window-1, len(log_likelihood_all)))*float(batch_size)/len_data,
             log_likelihood_all_ma, linewidth=2.0, label=label)
    plt.xlabel('# of passes over data')
    plt.ylabel('Average log likelihood per data point')
    if label:
        plt.legend(loc='lower right')
    plt.show()
make_plot(log_likelihood_all, len(feature_matrix_train), batch_size)
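#the raw curve for batch_size=100 is noisy; a smoothed view (window size
#chosen arbitrarily for illustration) can be produced with the same helper
make_plot(log_likelihood_all, len(feature_matrix_train), batch_size,
          smoothing_window=30, label='stochastic gradient, step_size=1e-1')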