Created
April 13, 2015 10:19
-
-
Save aloknayak29/348e16e60954e8dd7b53 to your computer and use it in GitHub Desktop.
GridSearchCV cross validation, pipelining models, extending models
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn import linear_model | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.cross_validation import train_test_split | |
import sklearn | |
import json | |
from sklearn.externals import joblib | |
from sklearn.pipeline import make_pipeline | |
from sklearn.grid_search import GridSearchCV | |
# Load Yelp reviews from a line-delimited JSON file: each line is one review
# record. The 'stars' rating is split off as the regression target; the rest
# of the record is kept as the feature entry.
stars = []    # regression targets (star rating per review)
entries = []  # review records with 'stars' removed
# FIX: use a context manager so the file is closed even if a line fails to parse
with open("yelp_review.json", "r") as f:
    for line in f:
        data = json.loads(line)
        stars.append(data['stars'])
        del data['stars']  # drop the target from the feature record
        entries.append(data)
# Grid-search a bag-of-words pipeline (CountVectorizer -> LinearRegression)
# over the vectorizer's document-frequency cutoffs. make_pipeline names the
# step after its class, lowercased, hence the 'countvectorizer__' prefix.
pipe = make_pipeline(CountVectorizer(stop_words='english'), linear_model.LinearRegression())
params = dict(countvectorizer__min_df=[0.005, 0.010, 0.015],
              countvectorizer__max_df=[0.8, 0.9, 0.95, 1.0])
grid_search = GridSearchCV(pipe, param_grid=params, n_jobs=-1)
grid_search.fit([e['text'] for e in entries], stars)
print("best_params:", grid_search.best_params_)
# FIX: the fitted attribute is grid_scores_ (trailing underscore);
# grid_search.grid_scores raised AttributeError
print("grid_scores:", grid_search.grid_scores_)
print("best_score:", grid_search.best_score_)
joblib.dump(grid_search.best_estimator_, "unipipe.pkl")  # persist best fitted pipeline
# Same search with TF-IDF features instead of raw counts.
pipe = make_pipeline(TfidfVectorizer(stop_words='english', min_df=0.005, max_df=1.0),
                     linear_model.LinearRegression())
# FIX: the step here is a TfidfVectorizer, so the param-grid prefix must be
# 'tfidfvectorizer__' -- the copied 'countvectorizer__' keys made GridSearchCV
# raise ValueError (invalid parameter for this pipeline)
params = dict(tfidfvectorizer__min_df=[0.005, 0.010, 0.015],
              tfidfvectorizer__max_df=[0.8, 0.9, 0.95, 1.0])
grid_search = GridSearchCV(pipe, param_grid=params, n_jobs=-1)
grid_search.fit([e['text'] for e in entries], stars)
print("best_params:", grid_search.best_params_)
# FIX: grid_scores_ (trailing underscore), not grid_scores
print("grid_scores:", grid_search.grid_scores_)
print("best_score:", grid_search.best_score_)
joblib.dump(grid_search.best_estimator_, "tfidfpipe.pkl")  # persist best fitted pipeline
# FIX: only `from sklearn import linear_model` is imported at the top of this
# file, so the bare name `LinearRegression` was a NameError at class-definition
# time; the base class must be referenced through the module.
class TlinReg(linear_model.LinearRegression):
    """Linear regression over bigram counts of each record's 'text' field.

    Unlike a plain LinearRegression, fit() and predict() accept the raw
    record dicts (each with a 'text' key): fit() learns a bigram
    CountVectorizer vocabulary, and predict() re-uses that vocabulary so the
    test-time feature matrix has the same columns as at training time.
    Exposing min_df/max_df as constructor parameters lets GridSearchCV tune
    the vectorizer through this single estimator.
    """

    def __init__(self, min_df=0.005, max_df=0.8, *args, **kwargs):
        # Vectorizer hyper-parameters stored on self so get_params/set_params
        # (and therefore GridSearchCV) can see and tune them.
        self.min_df = min_df
        self.max_df = max_df
        # FIX: __init__ must not return a value; just delegate to the base class.
        super(TlinReg, self).__init__(*args, **kwargs)

    def fit(self, X, y, min_df=0.005, max_df=0.8, *args, **kwargs):
        # NOTE(review): the min_df/max_df arguments here are unused -- the
        # self.* values from __init__ are what take effect. Kept only for
        # signature compatibility with existing callers.
        vect = CountVectorizer(stop_words='english', min_df=self.min_df,
                               max_df=self.max_df, max_features=4500,
                               ngram_range=(2, 2))
        vect.fit([e['text'] for e in X])
        # Remember the learned vocabulary so predict() builds identical columns.
        self.vocabulary_ = vect.vocabulary_
        super(TlinReg, self).fit(vect.transform(e['text'] for e in X), y,
                                 *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        # Rebuild a vectorizer pinned to the training vocabulary; with
        # `vocabulary` given, no fitting is needed before transform().
        newvect = CountVectorizer(ngram_range=(2, 2), vocabulary=self.vocabulary_)
        return super(TlinReg, self).predict(
            newvect.transform(e['text'] for e in X), *args, **kwargs)
# Grid-search the custom bigram estimator directly: because TlinReg exposes
# min_df/max_df as constructor params, no pipeline prefix is needed.
bimodel = TlinReg()
params = dict(min_df=[0.005, 0.010, 0.015], max_df=[0.8, 0.9, 0.95, 1.0])
grid_search = GridSearchCV(bimodel, param_grid=params, n_jobs=-1)
# TlinReg.fit extracts the 'text' field itself, so the raw records are passed.
grid_search.fit(entries, stars)
print("best_params:", grid_search.best_params_)
# FIX: grid_scores_ (trailing underscore), not grid_scores
print("grid_scores:", grid_search.grid_scores_)
print("best_score:", grid_search.best_score_)
joblib.dump(grid_search.best_estimator_, "bipipe.pkl")  # persist best fitted estimator
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment