Created
April 13, 2015 10:19
-
-
Save aloknayak29/348e16e60954e8dd7b53 to your computer and use it in GitHub Desktop.
GridSearchCV cross validation, pipelining models, extending models
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn import linear_model | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.cross_validation import train_test_split | |
import sklearn | |
import json | |
from sklearn.externals import joblib | |
from sklearn.pipeline import make_pipeline | |
from sklearn.grid_search import GridSearchCV | |
# Load Yelp reviews from a line-delimited JSON file: each line is one review
# record. The 'stars' rating is split off as the regression target; the rest
# of the record is kept as the feature entry.
stars = []    # regression targets (star rating per review)
entries = []  # review records with 'stars' removed
# FIX: use a context manager so the file is closed even if a line fails to parse
with open("yelp_review.json", "r") as f:
    for line in f:
        data = json.loads(line)
        stars.append(data['stars'])
        del data['stars']  # drop the target from the feature record
        entries.append(data)
# Grid-search a bag-of-words pipeline (CountVectorizer -> LinearRegression)
# over the vectorizer's document-frequency cutoffs. make_pipeline names the
# step after its class, lowercased, hence the 'countvectorizer__' prefix.
pipe = make_pipeline(CountVectorizer(stop_words='english'), linear_model.LinearRegression())
params = dict(countvectorizer__min_df=[0.005, 0.010, 0.015],
              countvectorizer__max_df=[0.8, 0.9, 0.95, 1.0])
grid_search = GridSearchCV(pipe, param_grid=params, n_jobs=-1)
grid_search.fit([e['text'] for e in entries], stars)
print("best_params:", grid_search.best_params_)
# FIX: the fitted attribute is grid_scores_ (trailing underscore);
# grid_search.grid_scores raised AttributeError
print("grid_scores:", grid_search.grid_scores_)
print("best_score:", grid_search.best_score_)
joblib.dump(grid_search.best_estimator_, "unipipe.pkl")  # persist best fitted pipeline
# Same search with TF-IDF features instead of raw counts.
pipe = make_pipeline(TfidfVectorizer(stop_words='english', min_df=0.005, max_df=1.0),
                     linear_model.LinearRegression())
# FIX: the step here is a TfidfVectorizer, so the param-grid prefix must be
# 'tfidfvectorizer__' -- the copied 'countvectorizer__' keys made GridSearchCV
# raise ValueError (invalid parameter for this pipeline)
params = dict(tfidfvectorizer__min_df=[0.005, 0.010, 0.015],
              tfidfvectorizer__max_df=[0.8, 0.9, 0.95, 1.0])
grid_search = GridSearchCV(pipe, param_grid=params, n_jobs=-1)
grid_search.fit([e['text'] for e in entries], stars)
print("best_params:", grid_search.best_params_)
# FIX: grid_scores_ (trailing underscore), not grid_scores
print("grid_scores:", grid_search.grid_scores_)
print("best_score:", grid_search.best_score_)
joblib.dump(grid_search.best_estimator_, "tfidfpipe.pkl")  # persist best fitted pipeline
# FIX: only `from sklearn import linear_model` is imported at the top of this
# file, so the bare name `LinearRegression` was a NameError at class-definition
# time; the base class must be referenced through the module.
class TlinReg(linear_model.LinearRegression):
    """Linear regression over bigram counts of each record's 'text' field.

    Unlike a plain LinearRegression, fit() and predict() accept the raw
    record dicts (each with a 'text' key): fit() learns a bigram
    CountVectorizer vocabulary, and predict() re-uses that vocabulary so the
    test-time feature matrix has the same columns as at training time.
    Exposing min_df/max_df as constructor parameters lets GridSearchCV tune
    the vectorizer through this single estimator.
    """

    def __init__(self, min_df=0.005, max_df=0.8, *args, **kwargs):
        # Vectorizer hyper-parameters stored on self so get_params/set_params
        # (and therefore GridSearchCV) can see and tune them.
        self.min_df = min_df
        self.max_df = max_df
        # FIX: __init__ must not return a value; just delegate to the base class.
        super(TlinReg, self).__init__(*args, **kwargs)

    def fit(self, X, y, min_df=0.005, max_df=0.8, *args, **kwargs):
        # NOTE(review): the min_df/max_df arguments here are unused -- the
        # self.* values from __init__ are what take effect. Kept only for
        # signature compatibility with existing callers.
        vect = CountVectorizer(stop_words='english', min_df=self.min_df,
                               max_df=self.max_df, max_features=4500,
                               ngram_range=(2, 2))
        vect.fit([e['text'] for e in X])
        # Remember the learned vocabulary so predict() builds identical columns.
        self.vocabulary_ = vect.vocabulary_
        super(TlinReg, self).fit(vect.transform(e['text'] for e in X), y,
                                 *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        # Rebuild a vectorizer pinned to the training vocabulary; with
        # `vocabulary` given, no fitting is needed before transform().
        newvect = CountVectorizer(ngram_range=(2, 2), vocabulary=self.vocabulary_)
        return super(TlinReg, self).predict(
            newvect.transform(e['text'] for e in X), *args, **kwargs)
# Grid-search the custom bigram estimator directly: because TlinReg exposes
# min_df/max_df as constructor params, no pipeline prefix is needed.
bimodel = TlinReg()
params = dict(min_df=[0.005, 0.010, 0.015], max_df=[0.8, 0.9, 0.95, 1.0])
grid_search = GridSearchCV(bimodel, param_grid=params, n_jobs=-1)
# TlinReg.fit extracts the 'text' field itself, so the raw records are passed.
grid_search.fit(entries, stars)
print("best_params:", grid_search.best_params_)
# FIX: grid_scores_ (trailing underscore), not grid_scores
print("grid_scores:", grid_search.grid_scores_)
print("best_score:", grid_search.best_score_)
joblib.dump(grid_search.best_estimator_, "bipipe.pkl")  # persist best fitted estimator
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment