(Python) Implement binary decision trees with different early stopping methods. Compare models with different stopping parameters.
#explore various techniques for preventing overfitting in decision trees
import math
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub; the task is to predict whether a loan will be
#paid off in full or will be charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 means a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a subset of features (categorical and numeric)
features = ['grade', 'emp_length', 'home_ownership', 'term']
target = 'safe_loans'
loans = loans[features+[target]]
#combat class imbalance: undersample the larger class until the class distribution is half and half
safe_loans_raw = loans[loans[target]==+1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
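#optional sanity check (illustrative, values depend on the sampling seed):
#after undersampling, roughly half the loans should be safe and half risky
print 'fraction of safe loans:', len(loans_data[loans_data[target] == 1]) / float(len(loans_data))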
#one-hot encoding: turn categorical variables into binary features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
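#illustrative note: with this encoding a categorical value such as grade == 'A' becomes a
#0/1 column (named like 'grade.A' by unpack), so every remaining feature is binary;
#the exact column names depend on the category values present in the data
print loans_data.column_names()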
#training-test split
train_data, test_data = loans_data.random_split(0.8, seed=1)
#early stopping methods for decision trees
#calculate whether the number of data points is less than or equal
#to the minimum node size
def reached_minimum_node_size(data, min_node_size):
    return len(data) <= min_node_size
#compute the gain in error reduction
def error_reduction(error_before_split, error_after_split):
    return error_before_split - error_after_split
#compute the number of mistakes made when the node predicts its majority class
def intermediate_node_num_mistakes(labels_in_node):
    if len(labels_in_node) == 0:
        return 0
    num_pos = sum(labels_in_node == 1)
    num_neg = sum(labels_in_node == -1)
    return min(num_pos, num_neg)
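#small sanity check of the mistake count (illustrative example labels):
#with three +1s and two -1s the majority prediction is +1, so 2 examples are misclassified
example_labels = sframe.SArray([-1, -1, 1, 1, 1])
print intermediate_node_num_mistakes(example_labels)   #expected: 2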
#pick the best feature to split on (the one with the lowest classification error)
def best_splitting_feature(data, features, target):
    best_feature = None
    best_error = 10     #classification error is always <= 1, so 10 acts as "infinity"
    num_data_points = float(len(data))
    for feature in features:
        #split on this binary feature: 0 goes left, 1 goes right
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
        error = (left_mistakes + right_mistakes) / num_data_points
        if error < best_error:
            best_error = error
            best_feature = feature
    return best_feature
#a function to create a leaf node
def create_leaf(target_values):
    leaf = {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True}
    #the leaf predicts the majority class of the data points that reached it
    num_ones = len(target_values[target_values == 1])
    num_minus_ones = len(target_values[target_values == -1])
    if num_ones > num_minus_ones:
        leaf['prediction'] = 1
    else:
        leaf['prediction'] = -1
    return leaf
#build the tree recursively, with standard stopping conditions and three early stopping conditions
def decision_tree_create(data, features, target, current_depth=0, max_depth=10, min_node_size=1,
                         min_error_reduction=0.0):
    remaining_features = features[:]
    target_values = data[target]
    print '-----------------------------------------------------'
    print 'Subtree, depth = %s (%s data points).' % (current_depth, len(target_values))
    #stopping condition 1: all points are from the same class
    if intermediate_node_num_mistakes(target_values) == 0:
        print 'Stopping condition 1 reached. All data points have the same class.'
        return create_leaf(target_values)
    #stopping condition 2: no more features to split on
    if remaining_features == []:
        print 'Stopping condition 2 reached. No remaining features.'
        return create_leaf(target_values)
    #early stopping condition 1: reached the maximum depth
    if current_depth >= max_depth:
        print 'Early stopping condition 1 reached. Maximum depth.'
        return create_leaf(target_values)
    #early stopping condition 2: reached the minimum node size
    if reached_minimum_node_size(data, min_node_size):
        print 'Early stopping condition 2 reached. Minimum node size.'
        return create_leaf(target_values)
    #find the best splitting feature
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    #early stopping condition 3: minimum error reduction
    error_before_split = intermediate_node_num_mistakes(target_values) / float(len(data))
    left_mistakes = intermediate_node_num_mistakes(left_split[target])
    right_mistakes = intermediate_node_num_mistakes(right_split[target])
    error_after_split = (left_mistakes + right_mistakes) / float(len(data))
    if error_reduction(error_before_split, error_after_split) < min_error_reduction:
        print 'Early stopping condition 3 reached. Minimum error reduction.'
        return create_leaf(target_values)
    remaining_features.remove(splitting_feature)
    print 'Split on feature %s. (%s, %s)' % (splitting_feature,
                                             len(left_split), len(right_split))
    #if the split sends all the data to one side, that side becomes a leaf
    if len(left_split) == len(data):
        print 'Creating leaf node.'
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print 'Creating leaf node.'
        return create_leaf(right_split[target])
    #repeat (recurse) on the left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target,
                                     current_depth + 1, max_depth, min_node_size, min_error_reduction)
    right_tree = decision_tree_create(right_split, remaining_features, target,
                                      current_depth + 1, max_depth, min_node_size, min_error_reduction)
    return {'is_leaf': False,
            'prediction': None,
            'splitting_feature': splitting_feature,
            'left': left_tree,
            'right': right_tree}
features = train_data.column_names()
features.remove(target)
my_decision_tree_new = decision_tree_create(train_data, features, target, current_depth=0, max_depth=6,
min_node_size=100, min_error_reduction=0.0)
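#helper sketch to gauge how much early stopping limits model complexity:
#count the leaves of a learned tree (works on the dict structure built above)
def count_leaves(tree):
    if tree['is_leaf']:
        return 1
    return count_leaves(tree['left']) + count_leaves(tree['right'])
print count_leaves(my_decision_tree_new)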
#make predictions
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print 'At leaf, predicting %s' % tree['prediction']
        return tree['prediction']
    else:
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print 'Split on %s = %s' % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
#classify the first test example and show the decision path
print test_data[0]
print 'Predicted class: %s' % classify(my_decision_tree_new, test_data[0], annotate=True)
#evaluate the decision tree
def evaluate_classification_error(tree, data):
    prediction = data.apply(lambda x: classify(tree, x))
    return float(sum(data['safe_loans'] != prediction)) / len(data)
print evaluate_classification_error(my_decision_tree_new, test_data)
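#sketch of the comparison described in the gist summary: train trees with different early
#stopping settings and compare their test errors (the parameter values below are illustrative;
#min_node_size=0 and min_error_reduction=-1 effectively disable those two early stopping conditions)
for depth in [2, 6, 14]:
    tree = decision_tree_create(train_data, features, target, max_depth=depth,
                                min_node_size=0, min_error_reduction=-1)
    print 'max_depth = %s, test error = %s' % (depth, evaluate_classification_error(tree, test_data))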
#print out a single decision stump
def print_stump(tree, name='root'):
    split_name = tree['splitting_feature']
    if split_name is None:
        print '(leaf, label: %s)' % tree['prediction']
        return None
    split_feature, split_value = split_name.split('.')
    print '    [{0} == 0]            [{0} == 1]    '.format(split_name)
    print '    (%s)                  (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))
print_stump(my_decision_tree_new)