Created
November 7, 2018 04:36
-
-
Save raven4752/56eab728df8ae53619a4307f38ebecd9 to your computer and use it in GitHub Desktop.
machine learning routine code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.ensemble import AdaBoostClassifier | |
from sklearn.model_selection import KFold | |
from sklearn.metrics import roc_auc_score | |
import pandas as pd | |
import numpy as np | |
import os | |
from matplotlib import pyplot as plt | |
class ModelTester:
    """Load a train/test split from disk and evaluate model callables on it.

    A "model callable" is any function with the signature
    ``func(xtr, ytr, xte, yte, **kwargs)`` that trains on ``(xtr, ytr)``
    and returns one prediction score per row of ``xte``.
    """

    def __init__(self, ds_dir='adult_dataset'):
        """Read the four dataset files found in *ds_dir*.

        Args:
            ds_dir: Directory containing ``xtr.txt``, ``ytr.txt``,
                ``xte.txt`` and ``yte.txt``. Each file is parsed with
                ``np.genfromtxt`` using a single space as the delimiter.
        """
        # Training inputs and targets (used for fitting and cross-validation).
        self.xtr = self._load(ds_dir, 'xtr.txt')
        self.ytr = self._load(ds_dir, 'ytr.txt')
        # Held-out inputs and targets (only handed to predict_test callables).
        self.xte = self._load(ds_dir, 'xte.txt')
        self.yte = self._load(ds_dir, 'yte.txt')

    @staticmethod
    def _load(ds_dir, file_name):
        """Parse one space-delimited text file from *ds_dir* into an array."""
        return np.genfromtxt(os.path.join(ds_dir, file_name), delimiter=' ')

    def cross_valid_model_predictions(self, model_train_predict_func, num_folds=10, seed=1, **kwargs):
        """Collect out-of-fold predictions over a shuffled K-fold split.

        Args:
            model_train_predict_func: Model callable invoked once per fold as
                ``func(x_fit, y_fit, x_val, y_val, **kwargs)``.
            num_folds: Number of folds passed to ``KFold(n_splits=...)``.
            seed: ``random_state`` for the shuffled split.
            **kwargs: Forwarded unchanged to *model_train_predict_func*.

        Returns:
            A ``(predictions, labels)`` tuple. Both arrays are concatenated
            in fold order, so they line up with each other but NOT with the
            original row order of the training set.
        """
        kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
        fold_predictions = []
        fold_labels = []
        for fit_index, val_index in kfold.split(self.xtr):
            x_fit, y_fit = self.xtr[fit_index], self.ytr[fit_index]
            x_val, y_val = self.xtr[val_index], self.ytr[val_index]
            fold_predictions.append(
                model_train_predict_func(x_fit, y_fit, x_val, y_val, **kwargs))
            fold_labels.append(y_val)
        return np.concatenate(fold_predictions), np.concatenate(fold_labels)

    def cross_valid_model(self, model_train_predict_func, num_folds=10, seed=1, score='auc', **kwargs):
        """Cross-validate a model callable and return a single score.

        Args:
            model_train_predict_func: Model callable (see class docstring).
            num_folds: Number of cross-validation folds.
            seed: Random seed for the shuffled split.
            score: Name of the metric. Only ``'auc'`` (ROC AUC computed on
                the pooled out-of-fold predictions) is supported.
            **kwargs: Forwarded unchanged to *model_train_predict_func*.

        Returns:
            The value of ``roc_auc_score(labels, predictions)``.

        Raises:
            ValueError: If *score* is not a supported metric name.
        """
        # Validate before training so a typo in `score` fails immediately
        # instead of silently yielding None after a full cross-validation.
        if score != 'auc':
            raise ValueError(
                "unsupported score {!r}; supported scores: 'auc'".format(score))
        predictions, labels = self.cross_valid_model_predictions(
            model_train_predict_func=model_train_predict_func,
            num_folds=num_folds, seed=seed, **kwargs)
        return roc_auc_score(labels, predictions)

    def predict_test(self, model_train_predict_func, **kwargs):
        """Train on the full training set and predict the held-out set.

        Args:
            model_train_predict_func: Model callable (see class docstring).
            **kwargs: Forwarded unchanged to *model_train_predict_func*,
                mirroring ``cross_valid_model``.

        Returns:
            Whatever *model_train_predict_func* returns.
        """
        return model_train_predict_func(self.xtr, self.ytr, self.xte, self.yte, **kwargs)
def rf_func(xtr, ytr, xte, yte, **kwargs):
    """Fit a random forest on ``(xtr, ytr)`` and score the rows of ``xte``.

    ``yte`` is unused; it is part of the signature because ``ModelTester``
    always passes four positional arrays. Keyword arguments are handed to
    the ``RandomForestClassifier`` constructor.

    Returns:
        Column 1 of ``predict_proba(xte)``.
    """
    forest = RandomForestClassifier(**kwargs)
    forest.fit(xtr, ytr)
    class_probabilities = forest.predict_proba(xte)
    # NOTE(review): column 1 is presumably the positive class of a binary
    # problem -- confirm against the label encoding in ytr.
    return class_probabilities[:, 1]
def adaboost_func(xtr, ytr, xte, yte, **kwargs):
    """Fit an AdaBoost classifier on ``(xtr, ytr)`` and score ``xte``.

    ``yte`` is unused; it is part of the signature because ``ModelTester``
    always passes four positional arrays. Keyword arguments are handed to
    the ``AdaBoostClassifier`` constructor.

    Returns:
        Column 1 of ``predict_proba(xte)``.
    """
    booster = AdaBoostClassifier(**kwargs)
    booster.fit(xtr, ytr)
    class_probabilities = booster.predict_proba(xte)
    # NOTE(review): column 1 is presumably the positive class of a binary
    # problem -- confirm against the label encoding in ytr.
    return class_probabilities[:, 1]
def get_benchmark_score(n_estimators=50):
    """Cross-validate random forest and AdaBoost with the same ensemble size.

    A fresh ``ModelTester`` (default dataset directory) is created on every
    call. Both scores are printed and then returned.

    Args:
        n_estimators: Passed as ``n_estimators`` to both classifiers.

    Returns:
        ``(score_rf, score_ada)`` as returned by ``cross_valid_model``.
    """
    tester = ModelTester()
    score_rf, score_ada = [
        tester.cross_valid_model(model_func, n_estimators=n_estimators)
        for model_func in (rf_func, adaboost_func)
    ]
    # Values recorded by the original author next to these prints:
    # random forest 0.900185583187, AdaBoost 0.909267081712.
    print(score_rf)
    print(score_ada)
    return score_rf, score_ada
def plot_benchmark_scores(start=10, end=110, step=10):
    """Plot both benchmark scores against the number of estimators.

    Runs ``get_benchmark_score`` for every value in
    ``range(start, end, step)`` and shows one line per classifier.

    Args:
        start: First ensemble size (inclusive).
        end: Upper bound of the ensemble sizes (exclusive).
        step: Increment between consecutive ensemble sizes.
    """
    # All benchmarks are computed up front, before any figure is created.
    scores = [get_benchmark_score(n_estimators=size)
              for size in range(start, end, step)]
    rf_scores = [pair[0] for pair in scores]
    ada_scores = [pair[1] for pair in scores]

    plt.figure()
    plt.title('roc-auc score of random forest and adaboost')
    plt.xlabel('num estimator')
    plt.ylabel('roc-auc score')
    estimator_counts = np.arange(start=start, stop=end, step=step)
    plt.xticks(estimator_counts)
    plt.plot(estimator_counts, rf_scores, label='RandomForest')
    plt.plot(estimator_counts, ada_scores, label='AdaBoost')
    plt.legend()
    plt.show()
# Script entry point: run the full benchmark sweep and display the plot.
if __name__ == '__main__':
    plot_benchmark_scores()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment