@shengch02
Created January 2, 2017 01:29
(Python) Train a boosted ensemble of decision-trees (gradient boosted trees) on the lending club dataset. Predict whether a loan will default along with prediction probabilities (on a validation set). Find the most positive and negative loans using the learned model. Explore how the number of trees influences classification performance.
#use the pre-implemented gradient boosted trees (scikit-learn's GradientBoostingClassifier)
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub, used to predict whether a loan will be paid off in full or
#will be charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a subset of features (categorical and numeric)
features = ['grade', 'sub_grade_num', 'short_emp', 'emp_length_num', 'home_ownership', 'dti',
            'purpose', 'payment_inc_ratio', 'delinq_2yrs', 'delinq_2yrs_zero', 'inq_last_6mths',
            'last_delinq_none', 'last_major_derog_none', 'open_acc', 'pub_rec', 'pub_rec_zero',
            'revol_util', 'total_rec_late_fee', 'int_rate', 'total_rec_int', 'annual_inc',
            'funded_amnt', 'funded_amnt_inv', 'installment']
target = 'safe_loans'
loans, loans_with_na = loans[features+[target]].dropna_split()
#Count the number of rows with missing data
num_rows_with_na = loans_with_na.num_rows()
num_rows = loans.num_rows()
print 'Dropping %s observations; keeping %s' % (num_rows_with_na, num_rows)
#undersample the larger class in order to balance our dataset
safe_loans_raw = loans[loans[target]==1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)
print 'Percentage of safe loans :', len(safe_loans)/float(len(loans_data))
print 'Percentage of risky loans :', len(risky_loans)/float(len(loans_data))
print 'Total number of loans :', len(loans_data)
#One-hot encoding
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
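#(added sketch) quick sanity check that each categorical feature was replaced by
#one-hot columns; unpack prefixes them with the feature name, e.g. 'grade.A'
print loans_data.column_names()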
#split data into training and validation
train_data, validation_data = loans_data.random_split(0.8, seed=1)
#built-in scikit-learn gradient boosting classifier (convert the SFrame to pandas/numpy first)
train_data_pd = sframe.SFrame.to_dataframe(train_data)
train_target = train_data_pd['safe_loans'].as_matrix()
train_features = train_data_pd.drop('safe_loans', 1).as_matrix()
from sklearn.ensemble import GradientBoostingClassifier
grd = GradientBoostingClassifier(n_estimators=5, max_depth=6)
grd.fit(train_features, train_target)
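#(added note) only n_estimators and max_depth are set explicitly above; the other
#GradientBoostingClassifier parameters (e.g. learning_rate=0.1) keep their scikit-learn defaults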
#make predictions
validation_safe_loans = validation_data[validation_data[target]==1]
validation_risky_loans = validation_data[validation_data[target]==-1]
sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_pd = sframe.SFrame.to_dataframe(sample_validation_data)
sample_validation_features = sample_validation_pd.drop('safe_loans', 1).as_matrix()
sample_predict = grd.predict(sample_validation_features) #1, 1, -1, 1 vs. 1, 1, -1, -1 for true results
sample_pos_prob = grd.predict_proba(sample_validation_features)[:,1] #[ 0.58357669, 0.53050311, 0.46192208, 0.60408361]
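#(added sketch) predict_proba orders its columns by grd.classes_ (here [-1, 1]),
#so column 1 is the probability of the +1 (safe) class used above
print grd.classes_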
#evaluate the accuracy on the validation data
validation_data_pd = sframe.SFrame.to_dataframe(validation_data)
validation_features = validation_data_pd.drop(target, 1).as_matrix()
validation_predict = grd.predict(validation_features)
validation_data.add_column(sframe.SArray(validation_predict), name='predict')
print sum(validation_data[target]==validation_data['predict'])/float(len(validation_data)) #0.66146
print sum(((validation_data[target]==-1) + (validation_data['predict']==1))==2) #false positives (risky loans predicted safe): 1652
print sum(((validation_data[target]==1) + (validation_data['predict']==-1))==2) #false negatives (safe loans predicted risky): 1491
prob = grd.predict_proba(validation_features) # column 0: P(risky, -1), column 1: P(safe, +1)
validation_data.add_column(sframe.SArray(prob[:,0]), name='negprob')
validation_data.add_column(sframe.SArray(prob[:,1]), name='posprob')
print validation_data.sort('negprob', ascending=False).head(5) # top 5 most negative (risky) predictions
print validation_data.sort('posprob', ascending=False).head(5) # top 5 most positive (safe) predictions
#effects of adding more trees
model_10 = GradientBoostingClassifier(n_estimators=10, max_depth=6)
model_50 = GradientBoostingClassifier(n_estimators=50, max_depth=6)
model_100 = GradientBoostingClassifier(n_estimators=100, max_depth=6)
model_200 = GradientBoostingClassifier(n_estimators=200, max_depth=6)
model_500 = GradientBoostingClassifier(n_estimators=500, max_depth=6)
model_10.fit(train_features, train_target)
model_50.fit(train_features, train_target)
model_100.fit(train_features, train_target)
model_200.fit(train_features, train_target)
model_500.fit(train_features, train_target)
validation_data.add_column(sframe.SArray(model_10.predict(validation_features)), name='predict_10')
validation_data.add_column(sframe.SArray(model_50.predict(validation_features)), name='predict_50')
validation_data.add_column(sframe.SArray(model_100.predict(validation_features)), name='predict_100')
validation_data.add_column(sframe.SArray(model_200.predict(validation_features)), name='predict_200')
validation_data.add_column(sframe.SArray(model_500.predict(validation_features)), name='predict_500')
print sum(validation_data[target]==validation_data['predict_10'])/float(len(validation_data)) #0.6662
print sum(validation_data[target]==validation_data['predict_50'])/float(len(validation_data)) #0.6845
print sum(validation_data[target]==validation_data['predict_100'])/float(len(validation_data)) #0.6897
print sum(validation_data[target]==validation_data['predict_200'])/float(len(validation_data)) #0.6859
print sum(validation_data[target]==validation_data['predict_500'])/float(len(validation_data)) #0.6874
train_data.add_column(sframe.SArray(model_10.predict(train_features)), name='predict_10')
train_data.add_column(sframe.SArray(model_50.predict(train_features)), name='predict_50')
train_data.add_column(sframe.SArray(model_100.predict(train_features)), name='predict_100')
train_data.add_column(sframe.SArray(model_200.predict(train_features)), name='predict_200')
train_data.add_column(sframe.SArray(model_500.predict(train_features)), name='predict_500')
print sum(train_data[target]==train_data['predict_10'])/float(len(train_data)) #0.6717
print sum(train_data[target]==train_data['predict_50'])/float(len(train_data)) #0.7173
print sum(train_data[target]==train_data['predict_100'])/float(len(train_data)) #0.7466
print sum(train_data[target]==train_data['predict_200'])/float(len(train_data)) #0.7873
print sum(train_data[target]==train_data['predict_500'])/float(len(train_data)) #0.8654
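#(added sketch) plot the training vs. validation accuracies printed above against the
#number of trees; assumes matplotlib is installed. The widening gap at larger
#n_estimators is the usual sign of overfitting.
import matplotlib.pyplot as plt
n_trees = [10, 50, 100, 200, 500]
train_accuracy = [0.6717, 0.7173, 0.7466, 0.7873, 0.8654]
validation_accuracy = [0.6662, 0.6845, 0.6897, 0.6859, 0.6874]
plt.plot(n_trees, train_accuracy, marker='o', label='training accuracy')
plt.plot(n_trees, validation_accuracy, marker='o', label='validation accuracy')
plt.xlabel('number of trees (n_estimators)')
plt.ylabel('classification accuracy')
plt.legend()
plt.show()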