
@shengch02
Created December 25, 2016 23:20
(Python) Use SFrames to do some feature engineering. Train a decision tree on the LendingClub dataset. Visualize the tree. Predict whether a loan will default, along with prediction probabilities, on a validation set. Train a complex tree model and compare it to the simple tree model.
#Identifying safe loans with decision trees
import math
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub, used to predict whether a loan will be paid off in full or
#charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#explore what features we have in the dataset
print loans.column_names()
#target column 'safe_loans': +1 means a safe loan, -1 a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#explore the distribution of safe and risky loans
print (sum(loans['safe_loans']==1)+0.0)/len(loans) #0.811
print (sum(loans['safe_loans']==-1)+0.0)/len(loans) #0.189
#use a subset of features (categorical and numeric)
features = ['grade', 'sub_grade', 'short_emp', 'emp_length_num', 'home_ownership', 'dti', 'purpose',
            'term', 'last_delinq_none', 'last_major_derog_none', 'revol_util', 'total_rec_late_fee']
target = 'safe_loans'
loans = loans[features+[target]]
safe_loans_raw = loans[loans[target]==+1]
risky_loans_raw = loans[loans[target]==-1]
print 'Number of safe loans : %s' % len(safe_loans_raw)
print 'Number of risky loans : %s' % len(risky_loans_raw)
#combat class imbalance: undersample the larger (safe) class until the class distribution is roughly half and half
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
#one-hot encoding: turn categorical variables into binary features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
#training-validation split: the validation set is used to select model parameters
train_data, validation_data = loans_data.random_split(0.8, seed=1)
#use the built-in scikit-learn decision tree learner
train_data = sframe.SFrame.to_dataframe(train_data)
train_safe_loans = train_data['safe_loans'].as_matrix()
train_features = train_data.drop('safe_loans', 1).as_matrix()
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model = DecisionTreeClassifier(max_depth=2)
decision_tree_model.fit(train_features, train_safe_loans)
small_model.fit(train_features, train_safe_loans)
#visualize the trained tree: http://graphviz.readthedocs.io/en/latest/
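#a minimal sketch of one way to render the small tree (not part of the original run):
#it assumes scikit-learn >= 0.18 (export_graphviz with out_file=None) and the 'graphviz'
#Python package are installed; the output filename 'small_model_tree' is arbitrary
from sklearn.tree import export_graphviz
import graphviz
feature_names = list(train_data.drop('safe_loans', 1).columns)
dot_data = export_graphviz(small_model, out_file=None, feature_names=feature_names,
                           class_names=['risky', 'safe'], filled=True)
graphviz.Source(dot_data).render('small_model_tree') #writes small_model_tree.pdf by default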
#make predictions
sample_validation_data_risky = validation_data[validation_data['safe_loans']==-1][0:2]
sample_validation_data_safe = validation_data[validation_data['safe_loans']==1][0:2]
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data = sframe.SFrame.to_dataframe(sample_validation_data)
sample_features = sample_validation_data.drop('safe_loans', 1).as_matrix()
sample_predict = decision_tree_model.predict(sample_features) #0.5 accuracy
sample_predict_probs = decision_tree_model.predict_proba(sample_features)
sample_predict_small_model = small_model.predict(sample_features) #0.5 accuracy
sample_predict_probs_small_model = small_model.predict_proba(sample_features)
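#a minimal sketch of the complex-vs-simple comparison described above (names are ad hoc):
#score both trees on the held-out validation set, then train a deeper tree for contrast
#(max_depth=10 is an arbitrary choice here, not a value taken from the original gist)
validation_df = sframe.SFrame.to_dataframe(validation_data)
validation_labels = validation_df['safe_loans'].as_matrix()
validation_features = validation_df.drop('safe_loans', 1).as_matrix()
print 'decision_tree_model validation accuracy: %s' % decision_tree_model.score(validation_features, validation_labels)
print 'small_model validation accuracy: %s' % small_model.score(validation_features, validation_labels)
big_model = DecisionTreeClassifier(max_depth=10)
big_model.fit(train_features, train_safe_loans)
print 'big_model validation accuracy: %s' % big_model.score(validation_features, validation_labels)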