@shengch02
Created July 19, 2017 01:43
XGB + Pipeline
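# Blends an XGBoost regressor with a stacked LassoLarsCV / GradientBoostingRegressor
# pipeline and estimates out-of-sample R^2 over five random train/test splits.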
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import r2_score
# sklearn.cross_validation was deprecated and later removed; use model_selection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.utils import check_array
class StackingEstimator(BaseEstimator, TransformerMixin):
    """Fit an estimator and prepend its predictions to X as synthetic features."""

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as synthetic features (classifiers only)
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))
        # add the prediction itself as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))
        return X_transformed
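# Usage sketch (illustrative only; `X` and `y` are hypothetical arrays, not
# defined in this script):
#   se = StackingEstimator(estimator=LassoLarsCV())
#   se.fit(X, y)             # fits the wrapped estimator
#   X_aug = se.transform(X)  # X with the prediction prepended as column 0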
# expects a CSV with a numeric target column 'y' and an identifier column 'ID'
train0 = pd.read_csv('./data/train.csv')
r2mean = []
r2std = []
for ea in np.arange(0.01, 0.03, 0.005):  # NOTE: `ea` is never used inside the loop body
    r2 = []
    for iij in range(5):
        train, test = train_test_split(train0, train_size=0.7, random_state=iij)
        y_train = train['y']
        y_test = test['y']
        test = test.drop(['y'], axis=1)
        for c in train.columns:
            if train[c].dtype == 'object':
                lbl = LabelEncoder()
                lbl.fit(list(train[c].values) + list(test[c].values))
                train[c] = lbl.transform(list(train[c].values))
                test[c] = lbl.transform(list(test[c].values))
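        # NOTE: fitting the encoder on train + test guarantees every category is
        # seen, at the cost of peeking at test-set categories (a common Kaggle
        # shortcut; avoid it when test data must stay unseen).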
        n_comp = 12
        # tSVD
        tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
        tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
        tsvd_results_test = tsvd.transform(test)
        # PCA
        pca = PCA(n_components=n_comp, random_state=420)
        pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
        pca2_results_test = pca.transform(test)
        # ICA
        ica = FastICA(n_components=n_comp, random_state=420)
        ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
        ica2_results_test = ica.transform(test)
        # GRP
        grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
        grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
        grp_results_test = grp.transform(test)
        # SRP
        srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
        srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
        srp_results_test = srp.transform(test)
        # save the column list before adding the decomposition components
        usable_columns = list(set(train.columns) - set(['y']))
        # append decomposition components to both datasets
        for i in range(1, n_comp + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]
        y_train = train['y'].values
        y_mean = np.mean(y_train)
        id_test = test['ID'].values
        # finaltrainset/finaltestset feed only the stacked pipeline
        # (they do not contain the PCA/SVD/... components)
        finaltrainset = train[usable_columns].values
        finaltestset = test[usable_columns].values
        xgb_params = {
            'n_trees': 520,
            'eta': 0.02,
            'max_depth': 4,
            'subsample': 0.93,
            'objective': 'reg:linear',
            'eval_metric': 'rmse',
            'base_score': y_mean,  # base prediction = mean(target)
            'silent': 1
        }
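        # Portability notes (not from the original gist): 'n_trees' is not a core
        # XGBoost parameter (the tree count comes from num_boost_round below), and
        # newer XGBoost releases rename 'reg:linear' to 'reg:squarederror' and
        # replace 'silent' with 'verbosity'.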
        # train XGBoost on all features (only the target column 'y' is dropped)
        dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
        dtest = xgb.DMatrix(test)
        num_boost_rounds = 1250
        model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
        y_pred = model.predict(dtest)
        # LassoLarsCV(normalize=True) requires an older scikit-learn; newer
        # releases removed the `normalize` keyword
        stacked_pipeline = make_pipeline(
            StackingEstimator(estimator=LassoLarsCV(normalize=True)),
            StackingEstimator(estimator=GradientBoostingRegressor(
                learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55,
                min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
            LassoLarsCV()
        )
        stacked_pipeline.fit(finaltrainset, y_train)
        results = stacked_pipeline.predict(finaltestset)
        # blend: 12.5% XGBoost + 87.5% stacked pipeline
        sub = pd.DataFrame()
        sub['ID'] = id_test
        sub['y'] = y_pred * 0.125 + results * 0.875
        r2.append(r2_score(y_test, sub['y']))
    r2mean.append(np.mean(r2))
    r2std.append(np.std(r2))
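# Report the aggregated scores (added sketch, not in the original gist). Because
# `ea` is never consumed inside the loop, rows should differ only by split noise.
for ea, m, s in zip(np.arange(0.01, 0.03, 0.005), r2mean, r2std):
    print('ea=%.3f  R2 mean=%.4f  std=%.4f' % (ea, m, s))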