(Python) Use SFrames to do some feature engineering. Train a decision-tree on the LendingClub dataset. Visualize the tree. Predict whether a loan will default along with prediction probabilities (on a validation set). Train a complex tree model and compare it to simple tree model.
#Identifying safe loans with decision trees
import math
import pandas as pd
import numpy as np
#the dataset consists of data from the LendingClub, used to predict whether a loan will be paid off in full
#or whether the loan will be charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#explore what features we have in the dataset
print loans.column_names()
#target column 'safe_loans': +1 means a safe loan, -1 means a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#explore the distribution of safe and risky loans
print (sum(loans['safe_loans']==1)+0.0)/len(loans)  #0.811
print (sum(loans['safe_loans']==-1)+0.0)/len(loans) #0.189
#use a subset of features (categorical and numeric)
features = ['grade', 'sub_grade', 'short_emp', 'emp_length_num', 'home_ownership', 'dti', 'purpose',
            'term', 'last_delinq_none', 'last_major_derog_none', 'revol_util', 'total_rec_late_fee']
target = 'safe_loans'
loans = loans[features+[target]]
safe_loans_raw = loans[loans[target]==+1]
risky_loans_raw = loans[loans[target]==-1]
print 'Number of safe loans : %s' % len(safe_loans_raw)
print 'Number of risky loans : %s' % len(risky_loans_raw)
#combat class imbalance: undersample the larger class until the class distribution is half and half
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
#one-hot encoding: turn categorical variables into binary features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x:1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
#training-validation split: helps to select model parameters
train_data, validation_data = loans_data.random_split(0.8, seed=1)
#use the built-in scikit-learn decision tree learner
train_data = sframe.SFrame.to_dataframe(train_data)
train_safe_loans = train_data['safe_loans'].as_matrix()
train_features = train_data.drop('safe_loans', 1).as_matrix()
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model = DecisionTreeClassifier(max_depth=2)
decision_tree_model.fit(train_features, train_safe_loans)
small_model.fit(train_features, train_safe_loans)
#visualize the trained tree: http://graphviz.readthedocs.io/en/latest/
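#a minimal sketch of one possible visualization step (not in the original gist): export the trained
#tree to Graphviz .dot format with sklearn's export_graphviz, then render it from the command line,
#e.g. `dot -Tpng tree.dot -o tree.png`; the output file name 'tree.dot' is an arbitrary choice here
from sklearn.tree import export_graphviz
feature_names = list(train_data.drop('safe_loans', 1).columns)
export_graphviz(small_model, out_file='tree.dot', feature_names=feature_names,
                class_names=['risky', 'safe'], filled=True)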
#make predictions
sample_validation_data_risky = validation_data[validation_data['safe_loans']==-1][0:2]
sample_validation_data_safe = validation_data[validation_data['safe_loans']==1][0:2]
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data = sframe.SFrame.to_dataframe(sample_validation_data)
sample_features = sample_validation_data.drop('safe_loans', 1).as_matrix()
sample_predict = decision_tree_model.predict(sample_features) #0.5 accuracy
sample_predict_probs = decision_tree_model.predict_proba(sample_features)
sample_predict_small_model = small_model.predict(sample_features) #0.5 accuracy
sample_predict_probs_small_model = small_model.predict_proba(sample_features)
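#a hedged sketch of the "complex vs. simple tree" comparison described in the gist summary
#(not in the original code): score both trees, plus a deeper tree, on the full validation set;
#the max_depth=10 value for the complex model is an assumption chosen only for illustration
validation_df = sframe.SFrame.to_dataframe(validation_data)
validation_labels = validation_df['safe_loans'].as_matrix()
validation_features = validation_df.drop('safe_loans', 1).as_matrix()
big_model = DecisionTreeClassifier(max_depth=10)
big_model.fit(train_features, train_safe_loans)
print 'small_model validation accuracy        : %s' % small_model.score(validation_features, validation_labels)
print 'decision_tree_model validation accuracy: %s' % decision_tree_model.score(validation_features, validation_labels)
print 'big_model validation accuracy          : %s' % big_model.score(validation_features, validation_labels)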