Skip to content

Instantly share code, notes, and snippets.

@shengch02
Created July 19, 2017 01:40
Show Gist options
  • Save shengch02/f3e01cb9ab24b89726961f2ca7f5378e to your computer and use it in GitHub Desktop.
Save shengch02/f3e01cb9ab24b89726961f2ca7f5378e to your computer and use it in GitHub Desktop.
Linear Model
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import cross_val_score, train_test_split
# Load the training table and one-hot encode its categorical columns.
#
# The columns are renamed to the integers 0..377 so that every later step can
# address them by position.  header=0 makes pandas discard the first line of
# the file and use `names` instead (a boolean `header` is rejected by current
# pandas).  NOTE(review): this assumes train.csv starts with a header row and
# has exactly 378 columns -- confirm against the data file.
data0 = pd.read_csv('./data/train.csv', sep=',', header=0,
                    names=list(range(378)))
data = data0  # same object as data0: the encoding below modifies it in place

# Columns 2..9 are treated as categorical.  Each one is replaced by integer
# codes (numbered in order of first appearance) and additionally expanded into
# one 0/1 indicator column per category, appended at the end of the frame
# under the next free integer column names.
for col in range(2, 10):
    first_new_col = data.shape[1]
    # factorize() labels missing values with -1, so such rows (if any) end up
    # with all indicator columns equal to 0.
    codes, categories = pd.factorize(data[col])
    data[col] = codes
    for k in range(len(categories)):
        # Vectorized replacement for setting one cell per row in a Python loop.
        data[first_new_col + k] = (codes == k).astype(int)
X = data
# Keep only the candidate feature columns in which at least 1% of the rows
# have the value 1; rarer features are dropped.
# NOTE(review): the candidate range 10..571 is hard-coded -- confirm that 572
# matches the number of columns of X after the one-hot encoding step.
predictors = [
    i for i in range(10, 572)
    if (X[i] == 1).sum() / float(X.shape[0]) >= 0.01
]
# Column holding the regression target.
target = 1
# Feature selection with RFE() and regression model LinearRegression().
X = np.asarray(data[list(predictors)])
Y = np.asarray(data[target])
model = LinearRegression()
for nfeature in range(170, 171, 2):
    # The RFE fit uses the full X, Y and does not depend on the random split
    # below, so it is computed once per feature count instead of once per
    # repetition.
    # NOTE(review): the features are selected on all rows, including the rows
    # that later land in the test split, so the reported error is optimistic.
    rfe = RFE(model, n_features_to_select=nfeature)
    fit = rfe.fit(X, Y)
    predictor = [name for name, keep in zip(predictors, fit.support_) if keep]

    # Do 40 independent random 70/30 splits in order to estimate the mean and
    # the spread of the residual standard deviation on the held-out rows.
    ssttdd = []
    for _ in range(40):
        train, test = train_test_split(data, test_size=0.3)
        mn = train[target].mean()
        std = train[target].std()
        lr = LinearRegression()
        lr.fit(train[predictor], train[target])
        prt = lr.predict(test[predictor])
        # Replace wildly off predictions (more than 40 training standard
        # deviations away from the training mean) by the training mean.
        prt[np.abs(prt - mn) > 40.0 * std] = mn
        ssttdd.append((prt - test[target]).std())
    # NOTE(review): the original indentation was lost; the summary is assumed
    # to be printed once per feature count, after all repetitions.
    print(np.mean(ssttdd))
    print(np.std(ssttdd))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment