@shengch02
Created July 19, 2017 01:44
outliers
#use one-hot encoding, random forest, and gradient boosting (each also stacked with logistic regression) to classify outliers; an xgboost sketch is appended at the end
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
#input the training data
train0 = pd.read_csv('./data/train.csv')
#label rows with y > 120 as outliers (binary target 'class')
train0['class'] = (train0['y']>120).astype(int)
train0 = train0.drop('y', axis=1).reset_index(drop=False)
#label-encode the categorical (object) columns to integers; one-hot encoding follows below
one_hot_cols = []
for c in train0.columns:
    if train0[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train0[c].values))
        train0[c] = lbl.transform(list(train0[c].values))
        one_hot_cols.append(c)
usable_columns = list(set(train0.columns)-set(['ID'])-set(one_hot_cols))
enc = OneHotEncoder()
enc.fit(train0[one_hot_cols].values)
one_hot = enc.transform(train0[one_hot_cols].values).toarray()
#wrap the dense one-hot array in a DataFrame and expose a join key ('index')
one_hot = pd.DataFrame(one_hot, index=range(len(one_hot)), columns=range(one_hot.shape[1])).reset_index(drop=False)
train0 = one_hot.merge(train0[usable_columns], on='index', how='left')
train0 = train0.drop('index', axis=1)
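#note: a simpler alternative (not the original approach) to the
#LabelEncoder/OneHotEncoder/merge sequence above is pandas' get_dummies,
#which one-hot encodes the listed columns in a single call; a minimal
#sketch, kept commented out so the original pipeline stays in effect:
#  train0 = pd.get_dummies(train0.drop(['ID', 'index'], axis=1), columns=one_hot_cols)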
#shuffle the training data
train0 = train0.reindex(np.random.permutation(train0.index))
train0.index = range(len(train0))
#cross validation
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(train0):
    train, test = train0.iloc[train_index], train0.iloc[test_index]
    # RandomForestClassifier
    rf = RandomForestClassifier(max_depth=3, n_estimators=10)
    rf.fit(train.drop('class', axis=1), train['class'])
    y_pred_rf = rf.predict_proba(test.drop('class', axis=1))[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(test['class'].values, y_pred_rf)
    plt.plot(fpr_rf, tpr_rf, label='RF')
    # RandomForestClassifier + LogisticRegression: fit the forest on the first
    # half of the fold, one-hot encode its leaf indices, then fit a logistic
    # regression on the second half; handle_unknown='ignore' guards against
    # leaves never reached by the encoder-fitting half
    half = len(train) // 2
    rf = RandomForestClassifier(max_depth=3, n_estimators=10)
    rf_enc = OneHotEncoder(handle_unknown='ignore')
    rf_lm = LogisticRegression()
    rf.fit(train.drop('class', axis=1).iloc[:half], train['class'].iloc[:half])
    rf_enc.fit(rf.apply(train.drop('class', axis=1).iloc[:half]))
    rf_lm.fit(rf_enc.transform(rf.apply(train.drop('class', axis=1).iloc[half:])), train['class'].iloc[half:])
    y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(test.drop('class', axis=1))))[:, 1]
    fpr_rf_lm, tpr_rf_lm, _ = roc_curve(test['class'], y_pred_rf_lm)
    plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF+LR')
    # GradientBoostingClassifier
    grt = GradientBoostingClassifier(n_estimators=10)
    grt.fit(train.drop('class', axis=1), train['class'])
    y_pred_grt = grt.predict_proba(test.drop('class', axis=1))[:, 1]
    fpr_grt, tpr_grt, _ = roc_curve(test['class'].values, y_pred_grt)
    plt.plot(fpr_grt, tpr_grt, label='GRT')
    # GradientBoostingClassifier + LogisticRegression: same stacking scheme;
    # grt.apply returns shape (n_samples, n_estimators, n_classes), hence [:, :, 0]
    grt = GradientBoostingClassifier(n_estimators=10)
    grt.fit(train.drop('class', axis=1).iloc[:half], train['class'].iloc[:half])
    grt_enc = OneHotEncoder(handle_unknown='ignore')
    grt_lm = LogisticRegression()
    grt_enc.fit(grt.apply(train.drop('class', axis=1).iloc[:half])[:, :, 0])
    grt_lm.fit(grt_enc.transform(grt.apply(train.drop('class', axis=1).iloc[half:])[:, :, 0]), train['class'].iloc[half:])
    y_pred_grt_lm = grt_lm.predict_proba(grt_enc.transform(grt.apply(test.drop('class', axis=1))[:, :, 0]))[:, 1]
    fpr_grt_lm, tpr_grt_lm, _ = roc_curve(test['class'].values, y_pred_grt_lm)
    plt.plot(fpr_grt_lm, tpr_grt_lm, label='GRT+LR')
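    # (added) per-fold AUC summary for the four models, to complement the ROC
    # curves; roc_auc_score is part of sklearn.metrics, imported here so this
    # addition stays self-contained
    from sklearn.metrics import roc_auc_score
    print('fold AUC -- RF: %.3f  RF+LR: %.3f  GRT: %.3f  GRT+LR: %.3f' % (
        roc_auc_score(test['class'], y_pred_rf),
        roc_auc_score(test['class'], y_pred_rf_lm),
        roc_auc_score(test['class'], y_pred_grt),
        roc_auc_score(test['class'], y_pred_grt_lm)))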
plt.legend(loc='best')
plt.show()
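#the header mentions xgboost, but the code above never calls it; below is a
#minimal sketch of slotting in xgboost's sklearn-style wrapper, evaluated on
#the last fold's split (train/test persist after the loop); .values sidesteps
#feature-name handling, and the hyperparameters are illustrative, not tuned
from sklearn.metrics import roc_auc_score
xgb_clf = xgb.XGBClassifier(max_depth=3, n_estimators=10)
xgb_clf.fit(train.drop('class', axis=1).values, train['class'].values)
y_pred_xgb = xgb_clf.predict_proba(test.drop('class', axis=1).values)[:, 1]
print('xgboost AUC on the last fold: %.3f' % roc_auc_score(test['class'], y_pred_xgb))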