(Python) Implement stochastic gradient ascent with L2 penalty. Compare convergence of stochastic gradient ascent with that of batch gradient ascent.
#Training logistic regression via stochastic gradient ascent
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
products = sframe.SFrame.to_dataframe(products)
#load the 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrences of each important word in every review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
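#for example (illustrative, not from the dataset): 'i love love this'.split().count('love')
#evaluates to 2, so each word column holds a per-review occurrence count of that word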
#split the data into a 90-10 split with 90% in the training set
products = sframe.SFrame(products)
train_data, validation_data = products.random_split(0.9, seed=1)
#convert the dataframe to a multi-dimensional array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant']+features
    features_frame = sframe.SFrame.to_dataframe(dataframe[features])
    feature_matrix = features_frame.as_matrix()
    return(feature_matrix, np.array(dataframe[label]))
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
#194 features (including the intercept)
#estimate the conditional probability with the link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute probability using the link (sigmoid) function
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
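#illustrative sanity check (toy matrix, not part of the original gist): with all-zero
#coefficients the score is 0 for every row, so every predicted probability is exactly 0.5
print predict_probability(np.array([[1., 2., -1.], [1., 0., 3.]]), np.zeros(3))   #-> [ 0.5  0.5]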
#compute derivative of log likelihood with respect to a single coefficient
def feature_derivative(errors, feature):
    derivative = np.dot(errors, feature)
    return derivative
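#the gist description mentions an L2 penalty; the stochastic-gradient loop below uses the
#plain (unpenalized) derivative, but a penalized version would subtract 2*l2_penalty*coefficient
#from every non-intercept derivative -- a minimal sketch, with l2_penalty and
#feature_is_constant as hypothetical arguments not used elsewhere in this gist
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    derivative = np.dot(errors, feature)
    if not feature_is_constant:
        derivative -= 2*l2_penalty*coefficient
    return derivative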
#compute average log-likelihood
def compute_avg_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    logexp = np.log(1.0+np.exp(-scores))
    #simple check to prevent overflow
    mask = np.isinf(logexp)
    logexp[mask] = -scores[mask]
    lp = np.sum((indicator-1)*scores-logexp)/float(len(feature_matrix))
    return lp
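#illustrative check (toy values, not from the original gist): with zero coefficients every
#prediction is 0.5, so the average log likelihood equals log(0.5) ~ -0.69314718
#regardless of the labels
print compute_avg_log_likelihood(np.array([[1., 2., -1.], [1., 0., 3.]]),
                                 np.array([+1, -1]), np.zeros(3))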
#compute the gradient for a single data point
j = 1
i = 10
coefficients = np.zeros(194)
predictions = predict_probability(feature_matrix_train[i:i+1,:], coefficients)
indicator = (sentiment_train[i:i+1]==1)
errors = indicator-predictions
gradient_single_data_point = feature_derivative(errors, feature_matrix_train[i:i+1,j])
print 'Gradient single data point: %s' % gradient_single_data_point
#modify the derivative to use a batch of data points
j = 1
i = 10
B = 10
coefficients = np.zeros(194)
predictions = predict_probability(feature_matrix_train[i:i+B,:], coefficients)
indicator = (sentiment_train[i:i+B]==1)
errors = indicator - predictions
gradient_mini_batch = feature_derivative(errors, feature_matrix_train[i:i+B,j])
print 'Gradient mini-batch data points: %s' % gradient_mini_batch
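#note (added for clarity): with B = 1 the slices i:i+B coincide with i:i+1, so the
#mini-batch gradient reduces to gradient_single_data_point computed above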
#implement stochastic gradient ascent
def logistic_regression_SG(feature_matrix, sentiment, initial_coefficients, step_size,
                           batch_size, max_iter):
    log_likelihood_all = []
    coefficients = np.array(initial_coefficients)
    np.random.seed(seed=1)
    #shuffle the data before the first pass
    permutation = np.random.permutation(len(feature_matrix))
    feature_matrix = feature_matrix[permutation,:]
    sentiment = sentiment[permutation]
    i = 0
    for itr in xrange(max_iter):
        #predict P(y=+1|x,w) for the current mini-batch
        predictions = predict_probability(feature_matrix[i:i+batch_size,:], coefficients)
        indicator = (sentiment[i:i+batch_size]==1)
        errors = indicator - predictions
        for j in xrange(len(coefficients)):
            derivative = feature_derivative(errors, feature_matrix[i:i+batch_size,j])
            #scale the step by 1/batch_size so step_size is comparable across batch sizes
            coefficients[j] += derivative*step_size/batch_size
        lp = compute_avg_log_likelihood(feature_matrix[i:i+batch_size,:], sentiment[i:i+batch_size],
                                        coefficients)
        log_likelihood_all.append(lp)
        if itr<=15 or (itr<=1000 and itr%100==0) or (itr<=10000 and itr%1000==0) or \
           itr%10000==0 or itr==max_iter-1:
            data_size = len(feature_matrix)
            print 'Iteration %*d: Average log likelihood (of data points [%0*d:%0*d]) = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr,
                 int(np.ceil(np.log10(data_size))), i,
                 int(np.ceil(np.log10(data_size))), i+batch_size, lp)
        #advance to the next mini-batch; reshuffle once the end of the data is reached
        i += batch_size
        if (i+batch_size)>len(feature_matrix):
            permutation = np.random.permutation(len(feature_matrix))
            feature_matrix = feature_matrix[permutation,:]
            sentiment = sentiment[permutation]
            i = 0
    return coefficients, log_likelihood_all
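#note (added for clarity): each iteration touches only batch_size points, so max_iter
#iterations correspond to roughly max_iter*batch_size/len(feature_matrix) passes over the data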
# stochastic gradient ascent
initial_coefficients = np.zeros(194)
step_size = 5e-1
batch_size = 1
max_iter = 10
coefficients_sg, log_likelihood_sg = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                                            initial_coefficients, step_size, batch_size, max_iter)
print log_likelihood_sg
#batch gradient ascent
initial_coefficients = np.zeros(194)
step_size = 5e-1
batch_size = len(feature_matrix_train)
max_iter = 200
coefficients_batch, log_likelihood_batch = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                                                  initial_coefficients, step_size, batch_size, max_iter)
print log_likelihood_batch
#compare stochastic gradient ascent and batch gradient ascent
step_size = 1e-1
batch_size = 100
initial_coefficients = np.zeros(194)
passes = 10
coefficients_all, log_likelihood_all = logistic_regression_SG(feature_matrix_train, sentiment_train,
                                                              initial_coefficients, step_size, batch_size,
                                                              passes*len(feature_matrix_train)/batch_size)
import matplotlib.pyplot as plt
def make_plot(log_likelihood_all, len_data, batch_size, smoothing_window=1, label=''):
    plt.rcParams.update({'figure.figsize':(9,5)})
    #moving average over smoothing_window iterations to reduce noise in the curve
    log_likelihood_all_ma = np.convolve(np.array(log_likelihood_all),
                                        np.ones((smoothing_window,))/smoothing_window, mode='valid')
    plt.plot(np.array(range(smoothing_window-1, len(log_likelihood_all)))*float(batch_size)/len_data,
             log_likelihood_all_ma, linewidth=2.0, label=label)
    plt.xlabel('# of passes over data')
    plt.ylabel('Average log likelihood per data point')
    plt.show()
make_plot(log_likelihood_all, len(feature_matrix_train), batch_size)
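#optional comparison sketch (not part of the original run): smooth the noisy mini-batch
#curve with a 30-iteration moving average, then plot the batch-ascent curve; with
#batch_size = len(feature_matrix_train) each batch iteration is one full pass, so both
#x-axes are in units of passes over the data; the window of 30 is an arbitrary choice
make_plot(log_likelihood_all, len(feature_matrix_train), batch_size,
          smoothing_window=30, label='stochastic gradient, smoothed')
make_plot(log_likelihood_batch, len(feature_matrix_train), len(feature_matrix_train),
          smoothing_window=1, label='batch gradient ascent')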