Last active
July 24, 2019 09:44
-
-
Save tanveer-sayyed/bf2e75e23ea0a508bbebfeadb0aafabe to your computer and use it in GitHub Desktop.
added n_estimators= 100 in RandomForest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
@author: tanveer | |
""" | |
"""On Spyder editor hit F5. On jupyter-notebook paste in a single cell and press ctrl+Enter. Run atleast 15 times.""" | |
threshold = 0.70 # TRY thresholds -> {0.72, 0.73, 0.74, 0.75} | |
import time | |
start = time.time() | |
valueCounts = {} | |
def CountAll(): | |
global all_columns, nanCounts, valueCounts, nanPercent | |
all_columns = list(df) | |
nanCounts = df.isnull().sum() | |
nanPercent = nanCounts / len(df) * 100 | |
for x in all_columns: | |
valueCounts[x] = df[x].value_counts() | |
"""-------------------------------------------------------------------------""" | |
"""Random but proportional replacement(RBPR) of categoricals.""" | |
def Fill_NaNs_Catigorical(col): | |
"""Calculating probability and expected value.""" | |
proportion = np.array(valueCounts[col].values) / valueCounts[col].sum() * nanCounts[col] | |
proportion = np.around(proportion).astype('int') | |
"""Adjusting proportion.""" | |
diff = int(nanCounts[col] - np.sum(proportion)) | |
if diff > 0: | |
for x in range(diff): | |
idx = random.randint(0, len(proportion) - 1) | |
proportion[idx] = proportion[idx] + 1 | |
else: | |
diff = -diff | |
while(diff != 0): | |
idx = random.randint(0, len(proportion) - 1) | |
if proportion[idx] > 0: | |
proportion[idx] = proportion[idx] - 1 | |
diff = diff - 1 | |
"""Filling NaNs.""" | |
nan_indexes = df[df[col].isnull()].index.tolist() | |
for x in range(len(proportion)): | |
if proportion[x] > 0: | |
random_subset = random.sample(population = nan_indexes, k = proportion[x]) | |
df.loc[random_subset, col] = valueCounts[col].keys()[x] | |
nan_indexes = list(set(nan_indexes) - set(random_subset)) | |
"""-------------------------------------------------------------------------""" | |
"""Random but proportional replacement(RBPR) of numeric""" | |
def Fill_NaNs_Numeric(col): | |
mini = df[col].min() | |
maxi = df[col].max() | |
"""Selecting ONLY non-NaNs.""" | |
temp = df[df[col].notnull()][col] # type --> pd.Series | |
"""Any continuous data is 'always' divided into 45 bins (Hard-Coded).""" | |
bin_size = 45 | |
bins = np.linspace(mini, maxi, bin_size) | |
"""Filling the bins (with non-NaNs) and calculating mean of each bin.""" | |
non_NaNs_per_bin = [] | |
mean_of_bins = [] | |
non_NaNs_per_bin.append(len(temp[(temp <= bins[0])])) | |
mean_of_bins.append(temp[(temp <= bins[0])].mean()) | |
for x in range(1, bin_size): | |
non_NaNs_per_bin.append(len(temp[(temp <= bins[x]) & (temp > bins[x-1])])) | |
mean_of_bins.append(temp[(temp <= bins[x]) & (temp > bins[x-1])].mean()) | |
mean_of_bins = pd.Series(mean_of_bins) | |
# np.around() on list 'proportion' may create trouble and we may get a zero-value imputed, hence, | |
mean_of_bins.fillna(temp.mean(), inplace= True) | |
non_NaNs_per_bin = np.array(non_NaNs_per_bin) | |
"""Followoing part is SAME as Fill_NaNs_Catigorical()""" | |
"""Calculating probability and expected value.""" | |
proportion = np.array(non_NaNs_per_bin) / valueCounts[col].sum() * nanCounts[col] | |
proportion = np.around(proportion).astype('int') | |
"""Adjusting proportion.""" | |
diff = int(nanCounts[col] - np.sum(proportion)) | |
if diff > 0: | |
for x in range(diff): | |
idx = random.randint(0, len(proportion) - 1) | |
proportion[idx] = proportion[idx] + 1 | |
else: | |
diff = -diff | |
while(diff != 0): | |
idx = random.randint(0, len(proportion) - 1) | |
if proportion[idx] > 0: | |
proportion[idx] = proportion[idx] - 1 | |
diff = diff - 1 | |
"""Filling NaNs.""" | |
nan_indexes = df[df[col].isnull()].index.tolist() | |
for x in range(len(proportion)): | |
if proportion[x] > 0: | |
random_subset = random.sample(population= nan_indexes, k= proportion[x]) | |
df.loc[random_subset, col] = mean_of_bins[x] # <--- Replacing with bin mean | |
nan_indexes = list(set(nan_indexes) - set(random_subset)) | |
"""-------------------------------------------------------------------------""" | |
import pandas as pd | |
import numpy as np | |
import random | |
from sklearn.datasets import load_iris | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.model_selection import train_test_split as tts | |
from sklearn.metrics import classification_report | |
from yellowbrick.classifier import PrecisionRecallCurve | |
import matplotlib.pyplot as plt | |
import warnings | |
warnings.filterwarnings('ignore') | |
# Important so that results are reproducible | |
np.random.seed = 0 | |
random.seed = 0 | |
""" STEP-1 """ | |
iris = load_iris() | |
# Already free of impurities so .copy() not required | |
df = pd.DataFrame(iris.data, columns=iris.feature_names) | |
df['target'] = iris.target | |
df['target'].replace(to_replace= [0,1,2], value= iris.target_names, inplace= True) | |
df['target'] = df['target'].astype('object') | |
# Removing 20% values from each column | |
idx_sepal_length = list(random.sample(range(len(df)), 30)) | |
idx_sepal_width = list(random.sample(range(len(df)), 30)) | |
idx_petal_length = list(random.sample(range(len(df)), 30)) | |
idx_petal_width = list(random.sample(range(len(df)), 30)) | |
idx_target = list(random.sample(range(len(df)), 30)) | |
df.loc[idx_sepal_length, 'sepal length (cm)'] = np.NaN | |
df.loc[idx_sepal_width, 'sepal width (cm)'] = np.NaN | |
df.loc[idx_petal_length, 'petal length (cm)'] = np.NaN | |
df.loc[idx_petal_width, 'petal width (cm)'] = np.NaN | |
df.loc[idx_target, 'target'] = np.NaN | |
""" STEP-2 """ | |
CountAll() | |
""" STEP-3 """ | |
numerical = list(df.columns[df.dtypes == np.number]) | |
""" STEP-4 """ | |
categorical = list(df.columns[df.dtypes != np.number]) | |
""" STEP-5 """ | |
original_mean = pd.Series() | |
imputed_mean = pd.Series() | |
for col in numerical: | |
original_mean[col] = df[col].mean() | |
Fill_NaNs_Numeric(col) | |
imputed_mean[col] = df[col].mean() | |
print() | |
print('% change in mean:') | |
print((original_mean - imputed_mean)/original_mean*100) | |
print() | |
del original_mean, imputed_mean | |
""" STEP-6 """ | |
for col in categorical: | |
print('Mode before imputing target: ', df[col].mode()[0]) | |
Fill_NaNs_Catigorical(col) | |
print('Mode after imputing target: ', df[col].mode()[0]) | |
""" STEP-7 """ | |
X = df.drop(columns= 'target') | |
y = df['target'] | |
X_train, X_test, y_train, y_test = tts(X, y, test_size= 0.3) # no random_state specified | |
dtC = DecisionTreeClassifier(max_depth= 3, criterion='entropy') | |
dtC.fit(X_train, y_train) | |
y_pred = dtC.predict(X_test) | |
#print(classification_report(y_test, y_pred, output_dict=False)) | |
classif_report = classification_report(y_test, y_pred, output_dict=True) | |
print('\n::BEFORE tuning (test scores for DecisionTreeClassifier)') | |
print('setosa recall: ', classif_report['setosa']['recall']) | |
print('virginica recall: ', classif_report['virginica']['recall']) | |
print('versicolor recall: ', classif_report['versicolor']['recall']) | |
""" STEP-8 """ | |
condition = 1 | |
no_of_attempts = 0 | |
while(not((classif_report['setosa']['recall'] >= threshold) and | |
(classif_report['virginica']['recall'] >= threshold) and | |
(classif_report['versicolor']['recall'] >= threshold))): | |
df.loc[idx_sepal_length, 'sepal length (cm)'] = np.NaN | |
df.loc[idx_sepal_width, 'sepal width (cm)'] = np.NaN | |
df.loc[idx_petal_length, 'petal length (cm)'] = np.NaN | |
df.loc[idx_petal_width, 'petal width (cm)'] = np.NaN | |
df.loc[idx_target, 'target'] = np.NaN | |
CountAll() | |
for col in numerical: | |
Fill_NaNs_Numeric(col) | |
for col in categorical: | |
Fill_NaNs_Catigorical(col) | |
X = df.drop(columns= 'target') | |
y = df['target'] | |
X_train, X_test, y_train, y_test = tts(X, y, test_size= 0.3) # no random_state specified | |
dtC = DecisionTreeClassifier(max_depth= 3, criterion='entropy') | |
dtC.fit(X_train, y_train) | |
y_pred = dtC.predict(X_test) | |
classif_report = classification_report(y_test, y_pred, output_dict=True) | |
if no_of_attempts == 40: | |
condition = 0 | |
break | |
no_of_attempts = no_of_attempts + 1 | |
if condition: | |
print('\n::AFTER tuning (test scores for DecisionTreeClassifier)') | |
print('setosa recall: ', classif_report['setosa']['recall']) | |
print('virginica recall: ', classif_report['virginica']['recall']) | |
print('versicolor recall: ', classif_report['versicolor']['recall']) | |
print('\nno of attempts in while loop: ', no_of_attempts) | |
fig = plt.figure(figsize=(8,8)) | |
X = df.drop(columns= 'target') | |
y = df['target'] | |
y, uniques = pd.factorize(y) | |
X_train, X_test, y_train, y_test = tts(X, y, test_size= 0.3) # no random_state specified | |
viz = PrecisionRecallCurve(RandomForestClassifier(n_estimators= 100), per_class=True, iso_f1_curves=True, fill_area=False, micro=False) | |
viz.fit(X_train, y_train) | |
viz.score(X_test, y_test) | |
viz.poof() | |
print('(Above curves plotted after one more train test split)') | |
else: | |
print("\nCondition unreachable in 40 loops.") | |
end = time.time() | |
print('\nTime taken: ', str(end-start)) | |
del X, X_test, X_train, all_columns, categorical, classif_report, col, df, end | |
del iris, nanCounts, nanPercent, no_of_attempts, numerical, start, valueCounts | |
del idx_sepal_length, idx_sepal_width, idx_petal_length, idx_petal_width, idx_target, threshold | |
del y, y_pred, y_test, y_train, condition |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment