Created
July 19, 2017 01:44
-
-
Save shengch02/fa533bafc55624dd352fd95ebc47bb6a to your computer and use it in GitHub Desktop.
outliers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Predict outliers using one-hot encoding with random forest and gradient boosting classifiers, each also stacked with logistic regression (xgboost is imported but not used below)
import numpy as np | |
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin | |
from sklearn.preprocessing import LabelEncoder | |
import xgboost as xgb | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
from sklearn.ensemble import GradientBoostingClassifier | |
import time | |
from sklearn.model_selection import KFold | |
from sklearn.ensemble import RandomForestClassifier | |
import matplotlib.pyplot as plt | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.metrics import roc_curve | |
from sklearn.linear_model import LogisticRegression | |
# Read the training set from disk.
train0 = pd.read_csv('./data/train.csv')
# Rows whose target 'y' exceeds 120 are labelled as outliers (class 1);
# every other row gets class 0.
is_outlier = train0['y'] > 120
train0['class'] = is_outlier.astype(int)
# The raw target is no longer needed once the label exists.
train0 = train0.drop('y', axis=1)
# drop=False keeps the previous row index as a regular 'index' column.
train0 = train0.reset_index(drop=False)
# One-hot encode the categorical (object-dtype) columns of train0.
#
# Step 1: replace every categorical column by integer codes and remember
# which columns were categorical.
one_hot_cols = []
for c in train0.columns:
    if train0[c].dtype == 'object':
        lbl = LabelEncoder()
        train0[c] = lbl.fit_transform(list(train0[c].values))
        one_hot_cols.append(c)
# Columns carried over unchanged: everything except 'ID' and the categorical
# columns. A list comprehension (rather than a set difference) keeps the
# original column order, so the feature layout is the same on every run.
excluded_columns = set(one_hot_cols) | {'ID'}
usable_columns = [c for c in train0.columns if c not in excluded_columns]
# Integer-coded categorical columns (not used further in this script).
train_one_hot = train0[one_hot_cols]
# Step 2: expand the integer codes into indicator columns.
enc = OneHotEncoder()
one_hot = enc.fit_transform(train0[one_hot_cols].values).toarray()
# Give the indicator columns string names: the remaining columns have string
# names, and recent scikit-learn versions reject a frame whose column names
# mix int and str.
one_hot = pd.DataFrame(
    one_hot,
    index=range(len(one_hot)),
    columns=['onehot_%d' % i for i in range(one_hot.shape[1])],
).reset_index(drop=False)
# Step 3: join the indicator columns with the remaining columns on the shared
# 'index' column, then drop that join key so it is not used as a feature.
train0 = one_hot.merge(train0[usable_columns], on='index', how='left')
train0 = train0.drop('index', axis=1)
# Put the rows of the training data into a random order ...
shuffled_labels = np.random.permutation(train0.index)
train0 = train0.reindex(shuffled_labels)
# ... and renumber them 0..n-1 so that labels match positions again.
n_rows = len(train0)
train0.index = range(n_rows)
# 5-fold cross validation. For every fold, four models are fitted on the
# training part and their ROC curves on the held-out part are drawn into one
# figure per fold:
#   RF      - random forest
#   RF_LR   - random forest leaf indices, one-hot encoded, fed to logistic regression
#   GRT     - gradient boosted trees
#   GRT+LR  - gradient boosted tree leaf indices, one-hot encoded, fed to logistic regression
kf = KFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(kf.split(train0), start=1):
    # KFold yields positional indices, so rows are selected with .iloc
    # (DataFrame.ix no longer exists in current pandas).
    train, test = train0.iloc[train_index], train0.iloc[test_index]
    X_train, y_train = train.drop('class', axis=1), train['class']
    X_test, y_test = test.drop('class', axis=1), test['class']

    # The stacked models fit the tree ensemble on the first half of the
    # training part and the logistic regression on the second half.
    # Integer division is required: a float is not a valid slice bound.
    half = len(train) // 2
    X_first, y_first = X_train.iloc[:half], y_train.iloc[:half]
    X_second, y_second = X_train.iloc[half:], y_train.iloc[half:]

    # One figure per fold, so the identically labelled curves of different
    # folds do not pile up in a single plot.
    plt.figure()
    plt.title('Fold %d' % fold)

    # RandomForestClassifier
    rf = RandomForestClassifier(max_depth=3, n_estimators=10)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(y_test.values, y_pred_rf)
    plt.plot(fpr_rf, tpr_rf, label='RF')

    # RandomForestClassifier + LogisticRegression
    rf = RandomForestClassifier(max_depth=3, n_estimators=10)
    # The encoder only sees leaves reached by the first half; leaves that
    # first appear in the second half or the test part are encoded as all
    # zeros instead of raising an error.
    rf_enc = OneHotEncoder(handle_unknown='ignore')
    rf_lm = LogisticRegression()
    rf.fit(X_first, y_first)
    rf_enc.fit(rf.apply(X_first))
    rf_lm.fit(rf_enc.transform(rf.apply(X_second)), y_second)
    y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
    fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test.values, y_pred_rf_lm)
    plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF_LR')

    # GradientBoostingClassifier
    grt = GradientBoostingClassifier(n_estimators=10)
    grt.fit(X_train, y_train)
    y_pred_grt = grt.predict_proba(X_test)[:, 1]
    fpr_grt, tpr_grt, _ = roc_curve(y_test.values, y_pred_grt)
    plt.plot(fpr_grt, tpr_grt, label='GRT')

    # GradientBoostingClassifier + LogisticRegression
    grt = GradientBoostingClassifier(n_estimators=10)
    grt.fit(X_first, y_first)
    grt_enc = OneHotEncoder(handle_unknown='ignore')
    grt_lm = LogisticRegression()
    # apply() returns a 3-D array for gradient boosting; [:, :, 0] takes the
    # first slice along the last axis to get one leaf index per sample and tree.
    grt_enc.fit(grt.apply(X_first)[:, :, 0])
    grt_lm.fit(grt_enc.transform(grt.apply(X_second)[:, :, 0]), y_second)
    y_pred_grt_lm = grt_lm.predict_proba(
        grt_enc.transform(grt.apply(X_test)[:, :, 0]))[:, 1]
    fpr_grt_lm, tpr_grt_lm, _ = roc_curve(y_test.values, y_pred_grt_lm)
    plt.plot(fpr_grt_lm, tpr_grt_lm, label='GRT+LR')

    plt.legend(loc='best')

# Display all fold figures at once.
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment