Created January 2, 2017 01:29
(Python) Train a boosted ensemble of decision trees (gradient boosted trees) on the LendingClub dataset. Predict whether a loan will default, along with prediction probabilities, on a validation set. Find the loans the learned model rates as most likely safe and most likely risky. Explore how the number of trees influences classification performance.
#use the pre-implemented gradient boosted trees
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub, used to predict whether a loan will be paid off in full or
#will be charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a subset of features (categorical and numeric)
features = ['grade', 'sub_grade_num', 'short_emp', 'emp_length_num', 'home_ownership', 'dti',
            'purpose', 'payment_inc_ratio', 'delinq_2yrs', 'delinq_2yrs_zero', 'inq_last_6mths',
            'last_delinq_none', 'last_major_derog_none', 'open_acc', 'pub_rec', 'pub_rec_zero',
            'revol_util', 'total_rec_late_fee', 'int_rate', 'total_rec_int', 'annual_inc',
            'funded_amnt', 'funded_amnt_inv', 'installment']
target = 'safe_loans'
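#dropna_split partitions the SFrame into (rows without missing values,
#rows with at least one missing value)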
loans, loans_with_na = loans[features+[target]].dropna_split()
#count the number of rows with missing data
num_rows_with_na = loans_with_na.num_rows()
num_rows = loans.num_rows()
print 'Dropping %s observations; keeping %s' % (num_rows_with_na, num_rows)
#undersample the larger class in order to balance our dataset
safe_loans_raw = loans[loans[target]==1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)
print 'Percentage of safe loans :', len(safe_loans)/float(len(loans_data))
print 'Percentage of risky loans :', len(risky_loans)/float(len(loans_data))
print 'Total number of loans :', len(loans_data)
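#after undersampling, the two classes are roughly balanced, so a trivial
#majority-class classifier would score about 0.5 accuracy -- the baseline
#against which the accuracies below should be judged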
#one-hot encoding of the categorical features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x:1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
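#each categorical column is now replaced by 0/1 indicator columns named
#'<feature>.<category>' (e.g. 'grade.A'), the SFrame unpack naming convention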
#split the data into training and validation sets
train_data, validation_data = loans_data.random_split(0.8, seed=1)
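#fixing the seed makes the 80/20 split reproducible across runs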
#scikit-learn's built-in gradient boosting classifier
train_data_pd = train_data.to_dataframe()
train_target = train_data_pd['safe_loans'].as_matrix()
train_features = train_data_pd.drop('safe_loans', 1).as_matrix()
from sklearn.ensemble import GradientBoostingClassifier
grd = GradientBoostingClassifier(n_estimators=5, max_depth=6)
grd.fit(train_features, train_target)
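#note: 5 trees of depth 6 is a deliberately small model; all other
#hyperparameters (e.g. learning_rate=0.1) are left at scikit-learn's defaults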
#make predictions on a small sample of validation data
validation_safe_loans = validation_data[validation_data[target]==1]
validation_risky_loans = validation_data[validation_data[target]==-1]
sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_pd = sample_validation_data.to_dataframe()
sample_validation_features = sample_validation_pd.drop('safe_loans', 1).as_matrix()
sample_predict = grd.predict(sample_validation_features) #predicted: 1, 1, -1, 1; true labels: 1, 1, -1, -1
sample_pos_prob = grd.predict_proba(sample_validation_features)[:,1] #[0.58357669, 0.53050311, 0.46192208, 0.60408361]
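#predict_proba returns one column per class, ordered as in grd.classes_
#(here [-1, +1]), so column 1 is the estimated probability that a loan is safe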
#evaluate the accuracy on the full validation set
validation_data_pd = validation_data.to_dataframe()
validation_features = validation_data_pd.drop(target, 1).as_matrix()
validation_predict = grd.predict(validation_features)
validation_data.add_column(sframe.SArray(validation_predict), name='predict')
print sum(validation_data[target]==validation_data['predict'])/float(len(validation_data)) #accuracy: 0.66146
print sum(((validation_data[target]==-1) + (validation_data['predict']==1))==2) #false positives: 1652
print sum(((validation_data[target]==1) + (validation_data['predict']==-1))==2) #false negatives: 1491
prob = grd.predict_proba(validation_features) #probabilities of the negative & positive class
validation_data.add_column(sframe.SArray(prob[:,0]), name='negprob')
validation_data.add_column(sframe.SArray(prob[:,1]), name='posprob')
print validation_data.sort('negprob', ascending=False).head(5) #top 5 most confidently negative predictions
print validation_data.sort('posprob', ascending=False).head(5) #top 5 most confidently positive predictions
#effects of adding more trees
model_10 = GradientBoostingClassifier(n_estimators=10, max_depth=6)
model_50 = GradientBoostingClassifier(n_estimators=50, max_depth=6)
model_100 = GradientBoostingClassifier(n_estimators=100, max_depth=6)
model_200 = GradientBoostingClassifier(n_estimators=200, max_depth=6)
model_500 = GradientBoostingClassifier(n_estimators=500, max_depth=6)
model_10.fit(train_features, train_target)
model_50.fit(train_features, train_target)
model_100.fit(train_features, train_target)
model_200.fit(train_features, train_target)
model_500.fit(train_features, train_target)
validation_data.add_column(sframe.SArray(model_10.predict(validation_features)), name='predict_10')
validation_data.add_column(sframe.SArray(model_50.predict(validation_features)), name='predict_50')
validation_data.add_column(sframe.SArray(model_100.predict(validation_features)), name='predict_100')
validation_data.add_column(sframe.SArray(model_200.predict(validation_features)), name='predict_200')
validation_data.add_column(sframe.SArray(model_500.predict(validation_features)), name='predict_500')
print sum(validation_data[target]==validation_data['predict_10'])/float(len(validation_data)) #0.6662
print sum(validation_data[target]==validation_data['predict_50'])/float(len(validation_data)) #0.6845
print sum(validation_data[target]==validation_data['predict_100'])/float(len(validation_data)) #0.6897
print sum(validation_data[target]==validation_data['predict_200'])/float(len(validation_data)) #0.6859
print sum(validation_data[target]==validation_data['predict_500'])/float(len(validation_data)) #0.6874
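#validation accuracy peaks near 100 trees (0.6897) and drifts slightly downward
#with 200 and 500 trees; compare with the training accuracies computed next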
train_data.add_column(sframe.SArray(model_10.predict(train_features)), name='predict_10')
train_data.add_column(sframe.SArray(model_50.predict(train_features)), name='predict_50')
train_data.add_column(sframe.SArray(model_100.predict(train_features)), name='predict_100')
train_data.add_column(sframe.SArray(model_200.predict(train_features)), name='predict_200')
train_data.add_column(sframe.SArray(model_500.predict(train_features)), name='predict_500')
print sum(train_data[target]==train_data['predict_10'])/float(len(train_data)) #0.6717
print sum(train_data[target]==train_data['predict_50'])/float(len(train_data)) #0.7173
print sum(train_data[target]==train_data['predict_100'])/float(len(train_data)) #0.7466
print sum(train_data[target]==train_data['predict_200'])/float(len(train_data)) #0.7873
print sum(train_data[target]==train_data['predict_500'])/float(len(train_data)) #0.8654
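#(optional) visualize the accuracies recorded above -- a minimal sketch,
#assuming matplotlib is installed; the numbers are the ones printed above
import matplotlib.pyplot as plt
n_trees = [10, 50, 100, 200, 500]
train_acc = [0.6717, 0.7173, 0.7466, 0.7873, 0.8654]
validation_acc = [0.6662, 0.6845, 0.6897, 0.6859, 0.6874]
plt.plot(n_trees, train_acc, marker='o', label='training accuracy')
plt.plot(n_trees, validation_acc, marker='o', label='validation accuracy')
plt.xlabel('number of trees')
plt.ylabel('classification accuracy')
plt.legend(loc='lower right')
plt.show()
#training accuracy keeps climbing with more trees while validation accuracy
#plateaus around 100 trees -- the classic signature of overfitting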