(Python) Implement binary decision trees with different early stopping methods. Compare models with different stopping parameters.
#explore various techniques for preventing overfitting in decision trees
import math
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub; the task is to predict whether a loan will be
#paid off in full or will be charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 means a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a subset of features (categorical and numeric)
features = ['grade', 'emp_length', 'home_ownership', 'term']
target = 'safe_loans'
loans = loans[features+[target]]
#combat class imbalance: undersample the larger class until the class distribution is half and half
safe_loans_raw = loans[loans[target]==+1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
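#optional sanity check (illustrative, values depend on the sampling seed):
#after undersampling, roughly half the loans should be safe and half risky
print 'fraction of safe loans:', len(loans_data[loans_data[target] == 1]) / float(len(loans_data))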
#one-hot encoding: turn categorical variables into binary features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
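#illustrative note: with this encoding a categorical value such as grade == 'A' becomes a
#0/1 column (named like 'grade.A' by unpack), so every remaining feature is binary;
#the exact column names depend on the category values present in the data
print loans_data.column_names()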
#training-test split
train_data, test_data = loans_data.random_split(0.8, seed=1)
#early stopping methods for decision trees
#calculate whether the number of data points is less than or equal
#to the minimum node size
def reached_minimum_node_size(data, min_node_size):
    return len(data) <= min_node_size
#compute the gain in error reduction
def error_reduction(error_before_split, error_after_split):
    return error_before_split - error_after_split
#compute the number of mistakes made when the node predicts its majority class
def intermediate_node_num_mistakes(labels_in_node):
    if len(labels_in_node) == 0:
        return 0
    num_pos = sum(labels_in_node == 1)
    num_neg = sum(labels_in_node == -1)
    return min(num_pos, num_neg)
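#small sanity check of the mistake count (illustrative example labels):
#with three +1s and two -1s the majority prediction is +1, so 2 examples are misclassified
example_labels = sframe.SArray([-1, -1, 1, 1, 1])
print intermediate_node_num_mistakes(example_labels)   #expected: 2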
#pick the best feature to split on (the one with the lowest classification error)
def best_splitting_feature(data, features, target):
    best_feature = None
    best_error = 10     #classification error is always <= 1, so 10 acts as "infinity"
    num_data_points = float(len(data))
    for feature in features:
        #split on this binary feature: 0 goes left, 1 goes right
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
        error = (left_mistakes + right_mistakes) / num_data_points
        if error < best_error:
            best_error = error
            best_feature = feature
    return best_feature
#a function to create a leaf node
def create_leaf(target_values):
    leaf = {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True}
    #the leaf predicts the majority class of the data points that reached it
    num_ones = len(target_values[target_values == 1])
    num_minus_ones = len(target_values[target_values == -1])
    if num_ones > num_minus_ones:
        leaf['prediction'] = 1
    else:
        leaf['prediction'] = -1
    return leaf
#build the tree recursively, with standard stopping conditions and three early stopping conditions
def decision_tree_create(data, features, target, current_depth=0, max_depth=10, min_node_size=1,
                         min_error_reduction=0.0):
    remaining_features = features[:]
    target_values = data[target]
    print '-----------------------------------------------------'
    print 'Subtree, depth = %s (%s data points).' % (current_depth, len(target_values))
    #stopping condition 1: all points are from the same class
    if intermediate_node_num_mistakes(target_values) == 0:
        print 'Stopping condition 1 reached. All data points have the same class.'
        return create_leaf(target_values)
    #stopping condition 2: no more features to split on
    if remaining_features == []:
        print 'Stopping condition 2 reached. No remaining features.'
        return create_leaf(target_values)
    #early stopping condition 1: reached the maximum depth
    if current_depth >= max_depth:
        print 'Early stopping condition 1 reached. Maximum depth.'
        return create_leaf(target_values)
    #early stopping condition 2: reached the minimum node size
    if reached_minimum_node_size(data, min_node_size):
        print 'Early stopping condition 2 reached. Minimum node size.'
        return create_leaf(target_values)
    #find the best splitting feature
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    #early stopping condition 3: minimum error reduction
    error_before_split = intermediate_node_num_mistakes(target_values) / float(len(data))
    left_mistakes = intermediate_node_num_mistakes(left_split[target])
    right_mistakes = intermediate_node_num_mistakes(right_split[target])
    error_after_split = (left_mistakes + right_mistakes) / float(len(data))
    if error_reduction(error_before_split, error_after_split) < min_error_reduction:
        print 'Early stopping condition 3 reached. Minimum error reduction.'
        return create_leaf(target_values)
    remaining_features.remove(splitting_feature)
    print 'Split on feature %s. (%s, %s)' % (splitting_feature,
                                             len(left_split), len(right_split))
    #if the split sends all the data to one side, that side becomes a leaf
    if len(left_split) == len(data):
        print 'Creating leaf node.'
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print 'Creating leaf node.'
        return create_leaf(right_split[target])
    #repeat (recurse) on the left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target,
                                     current_depth + 1, max_depth, min_node_size, min_error_reduction)
    right_tree = decision_tree_create(right_split, remaining_features, target,
                                      current_depth + 1, max_depth, min_node_size, min_error_reduction)
    return {'is_leaf': False,
            'prediction': None,
            'splitting_feature': splitting_feature,
            'left': left_tree,
            'right': right_tree}
features = train_data.column_names()
features.remove(target)
my_decision_tree_new = decision_tree_create(train_data, features, target, current_depth=0, max_depth=6,
min_node_size=100, min_error_reduction=0.0)
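#helper sketch to gauge how much early stopping limits model complexity:
#count the leaves of a learned tree (works on the dict structure built above)
def count_leaves(tree):
    if tree['is_leaf']:
        return 1
    return count_leaves(tree['left']) + count_leaves(tree['right'])
print count_leaves(my_decision_tree_new)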
#make predictions
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print 'At leaf, predicting %s' % tree['prediction']
        return tree['prediction']
    else:
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print 'Split on %s = %s' % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
#classify the first test example and show the decision path
print test_data[0]
print 'Predicted class: %s' % classify(my_decision_tree_new, test_data[0], annotate=True)
#evaluate the decision tree
def evaluate_classification_error(tree, data):
    prediction = data.apply(lambda x: classify(tree, x))
    return float(sum(data['safe_loans'] != prediction)) / len(data)
print evaluate_classification_error(my_decision_tree_new, test_data)
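#sketch of the comparison described in the gist summary: train trees with different early
#stopping settings and compare their test errors (the parameter values below are illustrative;
#min_node_size=0 and min_error_reduction=-1 effectively disable those two early stopping conditions)
for depth in [2, 6, 14]:
    tree = decision_tree_create(train_data, features, target, max_depth=depth,
                                min_node_size=0, min_error_reduction=-1)
    print 'max_depth = %s, test error = %s' % (depth, evaluate_classification_error(tree, test_data))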
#print out a single decision stump
def print_stump(tree, name='root'):
    split_name = tree['splitting_feature']
    if split_name is None:
        print '(leaf, label: %s)' % tree['prediction']
        return None
    split_feature, split_value = split_name.split('.')
    print '    [{0} == 0]            [{0} == 1]    '.format(split_name)
    print '    (%s)                  (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))
print_stump(my_decision_tree_new)