InteractiveDecisionTrees Titanic simple sklearn model
# imports
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.impute import KNNImputer, SimpleImputer
# Dataset from https://www.kaggle.com/c/titanic/data
# Load dataset
titanic_df = pd.read_csv("data/titanic_train.csv")
target = "Survived"
# Some feature engineering and cleaning
titanic_df["Family_Size"] = titanic_df["SibSp"] + titanic_df["Parch"]
titanic_df["Class"] = titanic_df["Pclass"].replace({1: "First", 2: "Second", 3: "Third"})
titanic_df["Age"] = titanic_df["Age"].round(0)
# Drop some features we don't need for this example
titanic_df = titanic_df.drop(
    ["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"],
    axis=1,
)
# Impute missing Embarked values
# Only 2 are missing, so SimpleImputer's most-frequent strategy is enough
embark_imputer = SimpleImputer(strategy="most_frequent")
titanic_df[["Embarked"]] = embark_imputer.fit_transform(titanic_df[["Embarked"]])
# Impute missing Age values
# To do this, first dummy-encode the dataset, as KNNImputer requires numeric values
data_for_ml = pd.get_dummies(titanic_df, drop_first=True)
age_imputer = KNNImputer()
dummies_imputed = age_imputer.fit_transform(data_for_ml)
data_for_ml = pd.DataFrame(
    dummies_imputed,
    columns=data_for_ml.columns
)
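# KNNImputer fills each missing Age with the mean Age of its nearest rows
# (default n_neighbors=5); note that the dummy-encoded columns also take part
# in the distance computation. An optional sanity check (not in the original
# gist) that nothing is missing any more:
assert not data_for_ml.isna().any().any()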
# Build X and y for passing to the classifier
X = data_for_ml.drop(target, axis=1)
y = data_for_ml[target]
# Instantiate the classifier
sklearn_dt = DecisionTreeClassifier(criterion="entropy", max_depth=2)
# Fit the model to our dataset (which builds the decision tree)
sklearn_dt.fit(X, y)
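# With max_depth=2 the tree has a root split, up to two child splits, and at
# most four "grandchild" leaves. An optional check of what was actually built:
print(f"Depth: {sklearn_dt.get_depth()}, leaves: {sklearn_dt.get_n_leaves()}")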
# Calculate the total information gain from the root to the grandchild leaf nodes
# A node is a leaf when its left and right child pointers are equal (both -1)
leaf_nodes = sklearn_dt.tree_.value[sklearn_dt.tree_.children_left == sklearn_dt.tree_.children_right]
leaf_node_entropies = np.apply_along_axis(
    lambda x: entropy(x, base=2),
    2,
    leaf_nodes
)
leaf_node_sizes = np.apply_along_axis(
    np.sum,
    2,
    leaf_nodes
)
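# Note (an assumption about the scikit-learn version in use): this relies on
# tree_.value holding raw class counts per node, as it did when this gist was
# written. Recent scikit-learn releases store class fractions there instead,
# in which case the size weights above would all come out equal; a
# version-proof alternative is to take the weights from tree_.n_node_samples:
# leaf_node_sizes = sklearn_dt.tree_.n_node_samples[
#     sklearn_dt.tree_.children_left == sklearn_dt.tree_.children_right
# ]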
print(
    f"Total Information Gain: "
    f"{(entropy(sklearn_dt.tree_.value[0][0], base=2) - np.average(leaf_node_entropies, weights=leaf_node_sizes)).round(3)}"
)
# Use the model's score function to calculate the accuracy (on the training data)
print(f"Model Accuracy: {sklearn_dt.score(X, y).round(3)}")
# Print the structure of the decision tree
# Uses scikit-learn's export_text() function, then cleans up the output a bit for readability
print("Structure of Decision Tree")
print(
    export_text(
        sklearn_dt,
        feature_names=list(X.columns)
    ).replace(
        "class: 1.0", "Survived"
    ).replace(
        "class: 0.0", "Perished"
    ).replace(
        "<= 0.50", "is False"
    ).replace(
        "> 0.50", "is True"
    )
)
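# A graphical rendering of the same tree (a sketch, not part of the original
# gist; assumes matplotlib is installed, which the script above does not import):
# import matplotlib.pyplot as plt
# from sklearn.tree import plot_tree
# plot_tree(sklearn_dt, feature_names=list(X.columns),
#           class_names=["Perished", "Survived"], filled=True)
# plt.show()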