InteractiveDecisionTrees Titanic simple sklearn model
# imports
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.impute import KNNImputer, SimpleImputer
# Dataset from https://www.kaggle.com/c/titanic/data
# Load dataset
titanic_df = pd.read_csv("data/titanic_train.csv")
target = "Survived"
# Some feature engineering and cleaning
titanic_df["Family_Size"] = titanic_df["SibSp"] + titanic_df["Parch"]
titanic_df["Class"] = titanic_df["Pclass"].replace({1: "First", 2: "Second", 3: "Third"})
titanic_df["Age"] = titanic_df["Age"].round(0)
# Drop some features we don't need for this example
titanic_df = titanic_df.drop(
    ["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"],
    axis=1,
)
# Impute missing Embarked values
# Only 2 are missing, so SimpleImputer's most-frequent strategy is enough
embark_imputer = SimpleImputer(strategy="most_frequent")
titanic_df[["Embarked"]] = embark_imputer.fit_transform(titanic_df[["Embarked"]])
# Impute missing Age values
# To do this, first dummy-encode the dataset, as KNNImputer requires numeric values
data_for_ml = pd.get_dummies(titanic_df, drop_first=True)
age_imputer = KNNImputer()
dummies_imputed = age_imputer.fit_transform(data_for_ml)
data_for_ml = pd.DataFrame(
    dummies_imputed,
    columns=data_for_ml.columns
)
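# KNNImputer fills each missing Age with the mean Age of its nearest rows
# (default n_neighbors=5); note that the dummy-encoded columns also take part
# in the distance computation. An optional sanity check (not in the original
# gist) that nothing is missing any more:
assert not data_for_ml.isna().any().any()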
# Build X and y for passing to the classifier
X = data_for_ml.drop(target, axis=1)
y = data_for_ml[target]
# Instantiate the classifier
sklearn_dt = DecisionTreeClassifier(criterion="entropy", max_depth=2)
# Fit the model to our dataset (which builds the decision tree)
sklearn_dt.fit(X, y)
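# With max_depth=2 the tree has a root split, up to two child splits, and at
# most four "grandchild" leaves. An optional check of what was actually built:
print(f"Depth: {sklearn_dt.get_depth()}, leaves: {sklearn_dt.get_n_leaves()}")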
# Calculate the total information gain from the root to the grandchild leaf nodes
# A node is a leaf when its left and right child pointers are equal (both -1)
leaf_nodes = sklearn_dt.tree_.value[sklearn_dt.tree_.children_left == sklearn_dt.tree_.children_right]
leaf_node_entropies = np.apply_along_axis(
    lambda x: entropy(x, base=2),
    2,
    leaf_nodes
)
leaf_node_sizes = np.apply_along_axis(
    np.sum,
    2,
    leaf_nodes
)
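# Note (an assumption about the scikit-learn version in use): this relies on
# tree_.value holding raw class counts per node, as it did when this gist was
# written. Recent scikit-learn releases store class fractions there instead,
# in which case the size weights above would all come out equal; a
# version-proof alternative is to take the weights from tree_.n_node_samples:
# leaf_node_sizes = sklearn_dt.tree_.n_node_samples[
#     sklearn_dt.tree_.children_left == sklearn_dt.tree_.children_right
# ]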
print(
    f"Total Information Gain: "
    f"{(entropy(sklearn_dt.tree_.value[0][0], base=2) - np.average(leaf_node_entropies, weights=leaf_node_sizes)).round(3)}"
)
# Use the model's score function to calculate the accuracy (on the training data)
print(f"Model Accuracy: {sklearn_dt.score(X, y).round(3)}")
# Print the structure of the decision tree
# Uses scikit-learn's export_text() function, then cleans up the output a bit for readability
print("Structure of Decision Tree")
print(
    export_text(
        sklearn_dt,
        feature_names=list(X.columns)
    ).replace(
        "class: 1.0", "Survived"
    ).replace(
        "class: 0.0", "Perished"
    ).replace(
        "<= 0.50", "is False"
    ).replace(
        "> 0.50", "is True"
    )
)
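# A graphical rendering of the same tree (a sketch, not part of the original
# gist; assumes matplotlib is installed, which the script above does not import):
# import matplotlib.pyplot as plt
# from sklearn.tree import plot_tree
# plot_tree(sklearn_dt, feature_names=list(X.columns),
#           class_names=["Perished", "Survived"], filled=True)
# plt.show()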