(Python) Use SFrames to do some feature engineering. Train a decision-tree on the LendingClub dataset. Visualize the tree. Predict whether a loan will default along with prediction probabilities (on a validation set). Train a complex tree model and compare it to simple tree model.
#Identifying safe loans with decision trees
import math
import pandas as pd
import numpy as np
#the dataset consists of data from the LendingClub, used to predict whether a loan will be paid off in full
#or whether the loan will be charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#explore what features we have in the dataset
print loans.column_names()
#target column 'safe_loans': +1 means a safe loan, -1 means a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#explore the distribution of safe and risky loans
print (sum(loans['safe_loans']==1)+0.0)/len(loans)  #0.811
print (sum(loans['safe_loans']==-1)+0.0)/len(loans) #0.189
#use a subset of features (categorical and numeric)
features = ['grade', 'sub_grade', 'short_emp', 'emp_length_num', 'home_ownership', 'dti', 'purpose',
            'term', 'last_delinq_none', 'last_major_derog_none', 'revol_util', 'total_rec_late_fee']
target = 'safe_loans'
loans = loans[features+[target]]
safe_loans_raw = loans[loans[target]==+1]
risky_loans_raw = loans[loans[target]==-1]
print 'Number of safe loans : %s' % len(safe_loans_raw)
print 'Number of risky loans : %s' % len(risky_loans_raw)
#combat class imbalance: undersample the larger class until the class distribution is half and half
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
#one-hot encoding: turn categorical variables into binary features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x:1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
#training-validation split: helps to select model parameters
train_data, validation_data = loans_data.random_split(0.8, seed=1)
#use the built-in scikit-learn decision tree learner
train_data = sframe.SFrame.to_dataframe(train_data)
train_safe_loans = train_data['safe_loans'].as_matrix()
train_features = train_data.drop('safe_loans', 1).as_matrix()
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model = DecisionTreeClassifier(max_depth=2)
decision_tree_model.fit(train_features, train_safe_loans)
small_model.fit(train_features, train_safe_loans)
#visualize the trained tree: http://graphviz.readthedocs.io/en/latest/
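#a minimal sketch of one possible visualization step (not in the original gist): export the trained
#tree to Graphviz .dot format with sklearn's export_graphviz, then render it from the command line,
#e.g. `dot -Tpng tree.dot -o tree.png`; the output file name 'tree.dot' is an arbitrary choice here
from sklearn.tree import export_graphviz
feature_names = list(train_data.drop('safe_loans', 1).columns)
export_graphviz(small_model, out_file='tree.dot', feature_names=feature_names,
                class_names=['risky', 'safe'], filled=True)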
#make predictions
sample_validation_data_risky = validation_data[validation_data['safe_loans']==-1][0:2]
sample_validation_data_safe = validation_data[validation_data['safe_loans']==1][0:2]
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data = sframe.SFrame.to_dataframe(sample_validation_data)
sample_features = sample_validation_data.drop('safe_loans', 1).as_matrix()
sample_predict = decision_tree_model.predict(sample_features) #0.5 accuracy
sample_predict_probs = decision_tree_model.predict_proba(sample_features)
sample_predict_small_model = small_model.predict(sample_features) #0.5 accuracy
sample_predict_probs_small_model = small_model.predict_proba(sample_features)
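#a hedged sketch of the "complex vs. simple tree" comparison described in the gist summary
#(not in the original code): score both trees, plus a deeper tree, on the full validation set;
#the max_depth=10 value for the complex model is an assumption chosen only for illustration
validation_df = sframe.SFrame.to_dataframe(validation_data)
validation_labels = validation_df['safe_loans'].as_matrix()
validation_features = validation_df.drop('safe_loans', 1).as_matrix()
big_model = DecisionTreeClassifier(max_depth=10)
big_model.fit(train_features, train_safe_loans)
print 'small_model validation accuracy        : %s' % small_model.score(validation_features, validation_labels)
print 'decision_tree_model validation accuracy: %s' % decision_tree_model.score(validation_features, validation_labels)
print 'big_model validation accuracy          : %s' % big_model.score(validation_features, validation_labels)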