stormxuwz · September 22, 2016 03:04
diff --git a/classifier.R b/classifier.R
 # Random-forest classifier: fits a 500-tree forest on (matTrain, trainLabel)
 # and returns its predictions for matTest. Extra arguments are ignored.
 randomForest_clf <- function(matTrain, trainLabel, matTest, ...) {
   library(randomForest)
   forest <- randomForest(matTrain, trainLabel, ntree = 500)
   predict(forest, matTest)
 }


 # xgboost multi-class classifier (max depth 6, eta 0.2, 1000 rounds,
 # "multi:softmax"). Labels are encoded as 0-based class ids for training and
 # the predicted ids are decoded back to label names before returning.
 # Extra arguments are ignored.
 xgboost_clf_1000 <- function(matTrain, trainLabel, matTest, ...) {
   library(xgboost)
   library(SnowballC)

   # label name -> 0-based class id, in order of first appearance
   classes <- unique(trainLabel)
   nclass <- length(classes)
   labelToId <- (1:nclass) - 1
   names(labelToId) <- classes

   trainIds <- labelToId[as.character(trainLabel)]
   dtrain <- xgb.DMatrix(as.matrix(matTrain), label = trainIds)
   booster <- xgboost(dtrain, max.depth = 6, eta = 0.2, nround = 1000, objective = "multi:softmax", num_class = nclass)

   # predicted ids are 0-based, R indexing is 1-based
   predictedIds <- predict(booster, newdata = as.matrix(matTest))
   names(labelToId[predictedIds + 1])
 }

 # xgboost multi-class classifier (max depth 25, eta 0.3, "multi:softmax").
 # Note that, despite the function name, only 5 boosting rounds are run.
 # Labels are encoded as 0-based class ids for training and the predicted ids
 # are decoded back to label names. Extra arguments are ignored.
 xgboost_clf_200 <- function(matTrain, trainLabel, matTest, ...) {
   library(xgboost)
   library(SnowballC)

   # label name -> 0-based class id, in order of first appearance
   classes <- unique(trainLabel)
   nclass <- length(classes)
   labelToId <- (1:nclass) - 1
   names(labelToId) <- classes

   trainIds <- labelToId[as.character(trainLabel)]
   dtrain <- xgb.DMatrix(as.matrix(matTrain), label = trainIds)
   booster <- xgboost(dtrain, max.depth = 25, eta = 0.3, nround = 5, objective = "multi:softmax", num_class = nclass)

   # predicted ids are 0-based, R indexing is 1-based
   predictedIds <- predict(booster, newdata = as.matrix(matTest))
   names(labelToId[predictedIds + 1])
 }



 # k-nearest-neighbour classifier (k = 5) via caret's knn3Train; returns the
 # result of knn3Train for the rows of matTest. Extra arguments are ignored.
 knn_clf <- function(matTrain, trainLabel, matTest, ...) {
   library(caret)
   knn3Train(matTrain, matTest, trainLabel, k = 5, prob = FALSE)
 }


 # Neural-network classifier (neuralnet, hidden = c(5, 3)).
 # The labels are one-hot encoded, one output unit is trained per class, and
 # each test row is assigned the class whose output is largest.
 # Returns a character vector of predicted labels; extra arguments are ignored.
 neuralnet_clf <- function(matTrain, trainLabel, matTest, ...) {
   library(neuralnet)

   # model.matrix names the indicator columns "trainLabel<level>"
   newY <- as.data.frame(model.matrix(~ -1 + trainLabel))
   targetNames <- names(newY)

   trainData <- cbind(matTrain, newY)
   allNames <- names(trainData)
   featureNames <- allNames[!allNames %in% targetNames]
   f <- as.formula(paste(paste(targetNames, collapse = "+"), "~", paste(featureNames, collapse = " + ")))

   # TRUE rather than T: T is an ordinary variable that can be reassigned
   nn <- neuralnet(f, data = trainData, hidden = c(5, 3), linear.output = TRUE)
   scores <- compute(nn, matTest)$net.result

   # strip the "trainLabel" prefix to recover the plain class names
   colnames(scores) <- substring(targetNames, nchar("trainLabel") + 1L)
   colnames(scores)[apply(scores, 1, which.max)]
 }



 # Runs several classifiers on the same data and collects their predictions.
 #   matTrain, trainLabel - training features and labels
 #   matTest              - test features
 #   methods              - names of classifier functions; each is called as
 #                          f(matTrain, trainLabel, matTest)
 #   ...                  - accepted for interface compatibility, not forwarded
 # Returns a data frame with an `id` column (1..nrow(matTest)) and one column
 # of predictions per method, named after the method.
 clf_controller <- function(matTrain, trainLabel, matTest, methods = c("knn_clf", "neuralnet_clf"), ...) {
   # seq_len() gives an empty id column for an empty test set (1:0 would not)
   finalResult <- data.frame(id = seq_len(nrow(matTest)))
   for (clf in methods) {
     print(paste("doing", clf))
     # look the function up by name instead of building and eval()-ing code text
     finalResult[clf] <- get(clf, mode = "function")(matTrain, trainLabel, matTest)
   }
   finalResult
 }

 # Support-vector classifier with a radial kernel (e1071::svm); returns the
 # model's predictions for matTest. Extra arguments are ignored.
 svm_clf_radial <- function(matTrain, trainLabel, matTest, ...) {
   library("e1071")
   fit <- svm(matTrain, trainLabel, kernel = "radial", degree = 3)
   predict(fit, newdata = matTest)
 }

 # Support-vector classifier with a linear kernel (e1071::svm); returns the
 # model's predictions for matTest. Extra arguments are ignored.
 svm_linear_clf <- function(matTrain, trainLabel, matTest, ...) {
   library("e1071")
   fit <- svm(matTrain, trainLabel, kernel = "linear")
   predict(fit, newdata = matTest)
 }

 # Naive Bayes classifier (e1071::naiveBayes); returns the model's predictions
 # for matTest. Extra arguments are ignored.
 navieBaye_clf <- function(matTrain, trainLabel, matTest, ...) {
   library("e1071")
   fit <- naiveBayes(matTrain, trainLabel)
   predict(fit, newdata = matTest)
 }
diff --git a/crossVal.R b/crossVal.R
 # Cross-validation script setup.
 # NOTE: clears every object from the current workspace before starting.
 rm(list=ls())
 library(jsonlite)
 library(caret)

 # All relative paths below are resolved against this project directory.
 setwd("~/Developer/cooking")
 source("./code/preprocessing.R")
 source("./code/classifier.R")
 source("./code/featureEngineering.R")

 # Raw data; trainLabel is left in the global environment because
 # createSplit() reads it from there.
 train_raw  <- fromJSON("./data/train.json", flatten = TRUE)
 test_raw <- fromJSON("./data/test.json", flatten = TRUE)
 trainLabel <- as.factor(train_raw$cuisine)

 # Draws five random train partitions of the global `trainLabel` with
 # caret::createDataPartition (fraction p in each) and stores the list of
 # index matrices in ./data/cv_trainIndex.rds.
 createSplit <- function(p = 0.8) {
   partitions <- list()
   for (fold in 1:5) {
     partitions[[fold]] <- createDataPartition(trainLabel, p = p, list = FALSE)
   }
   saveRDS(partitions, "./data/cv_trainIndex.rds")
 }


 # Scores the configured classifiers on the saved train/validation splits.
 #   type       - feature representation; selects dataSet_final_<type>.rds
 #   featureNum - which saved feature index to use
 #                (./data/<type>_featureIndex_<featureNum>.rds)
 # Returns a matrix of accuracies with one row per split and one column per
 # classifier.
 cv_crossValidation <- function(type, featureNum) {
   saved <- readRDS(paste("dataSet_final_", type, ".rds", sep = ""))
   trainLabel <- saved$trainLabel
   matTrain <- saved$feature$trainFeature
   rm(saved)

   splitList <- readRDS("./data/cv_trainIndex.rds")
   featureIndex <- readRDS(paste("./data/", type, "_featureIndex_", featureNum, ".rds", sep = ""))

   # Other available classifiers:
   # "xgboost_clf_1000", "navieBaye_clf", "svm_linear_clf", "svm_clf_radial"
   methodList <- c("xgboost_clf_200")

   # One row per stored split, rather than assuming there are exactly five.
   scores <- matrix(0, length(splitList), length(methodList))
   colnames(scores) <- methodList

   for (i in seq_along(splitList)) {
     trainIndex <- splitList[[i]]
     # drop = FALSE keeps a data frame even when a single feature is selected
     subTrain <- matTrain[trainIndex, featureIndex, drop = FALSE]
     subTest <- matTrain[-trainIndex, featureIndex, drop = FALSE]
     subTrainLabel <- trainLabel[trainIndex]
     subTestLabel <- trainLabel[-trainIndex]

     oneFoldResults <- clf_controller(subTrain, subTrainLabel, subTest, methods = methodList)
     for (j in seq_along(methodList)) {
       # accuracy = share of held-out rows predicted correctly
       scores[i, j] <- sum(oneFoldResults[, methodList[j]] == subTestLabel) / length(subTestLabel)
     }
   }

   scores
 }

 # Runs cross-validation for every (feature type, feature set size) pair,
 # stores all four score matrices in cvResults.rds and writes each one to its
 # own file.
 # NOTE(review): write.table is called with its default separator, so the
 # .csv files are presumably not comma-delimited - confirm this is intended.
 cv_main <- function() {
   word_score_smallFeature <- cv_crossValidation("word", "small")
   word_score_largeFeature <- cv_crossValidation("word", "large")

   phrase_score_smallFeature <- cv_crossValidation("phrase", "small")
   phrase_score_largeFeature <- cv_crossValidation("phrase", "large")

   allScores <- list(
     word_score_smallFeature,
     word_score_largeFeature,
     phrase_score_smallFeature,
     phrase_score_largeFeature
   )
   saveRDS(allScores, "cvResults.rds")

   write.table(word_score_largeFeature, "word_score_largeFeature.csv")
   write.table(word_score_smallFeature, "word_score_smallFeature.csv")
   write.table(phrase_score_largeFeature, "phrase_score_largeFeature.csv")
   write.table(phrase_score_smallFeature, "phrase_score_smallFeature.csv")
 }

 # Run the full cross-validation as soon as this script is sourced.
 cv_main()

diff --git a/featureEngineering.R b/featureEngineering.R
 library(tm)


 # Builds binary word-level features from the `ingredients` column of both
 # data sets: a joint document-term matrix is binarised, split back into train
 # and test rows, stemmed with featureStemming(), and a few function words are
 # dropped. Returns list(trainFeature, testFeature).
 createWordFeature <- function(trainData, testData) {
   corpus <- c(Corpus(VectorSource(trainData$ingredients)), Corpus(VectorSource(testData$ingredients)))
   termCounts <- as.matrix(DocumentTermMatrix(corpus))
   presence <- as.data.frame(ifelse(termCounts > 0, 1, 0))

   # the first nrow(trainData) documents are the training recipes
   trainRows <- 1:nrow(trainData)
   trainFeature <- featureStemming(presence[trainRows, ])
   testFeature <- featureStemming(presence[-trainRows, ])

   # remove functional words
   keep <- !(names(trainFeature) %in% c("for", "in", "all", "with"))

   list(trainFeature = trainFeature[, keep], testFeature = testFeature[, keep])
 }


 # Builds one binary indicator column per distinct ingredient phrase seen in
 # either data set.
 #   trainData, testData - data frames whose `ingredients` column is a list of
 #                         character vectors (one vector per recipe)
 # Returns list(trainFeature, testFeature): data frames with identical
 # columns, where a cell is 1 when the recipe contains the phrase, else 0.
 createPhraseFeature <- function(trainData, testData) {
   uniqueLabel <- unique(c(unlist(trainData$ingredients), unlist(testData$ingredients)))

   # Indicator table for one data set. Columns are located with match()
   # rather than by name so that an empty-string phrase still resolves, and
   # seq_len() keeps a zero-row data set from being indexed as rows 1 and 0.
   buildIndicator <- function(data) {
     indicator <- matrix(0, nrow(data), length(uniqueLabel))
     colnames(indicator) <- uniqueLabel
     for (i in seq_len(nrow(data))) {
       indicator[i, match(data$ingredients[[i]], uniqueLabel)] <- 1
     }
     as.data.frame(indicator)
   }

   list(trainFeature = buildIndicator(trainData), testFeature = buildIndicator(testData))
 }


 # Merges indicator columns whose names share the same stem.
 #   data - data frame of 0/1 indicator columns named by word
 # Returns a data frame with one column per distinct stem (SnowballC's
 # wordStem of the column names); a cell is 1 when any of the merged source
 # columns is positive in that row, else 0.
 featureStemming <- function(data) {
   library(SnowballC)
   stemmedFeature <- wordStem(names(data))
   uniStemmed <- unique(stemmedFeature)
   # target column (by position) for every source column
   target <- match(stemmedFeature, uniStemmed)

   # Accumulate on plain matrices: assigning data-frame columns one at a time
   # inside the loop is far slower for wide tables.
   values <- as.matrix(data)
   merged <- matrix(0, nrow = nrow(data), ncol = length(uniStemmed))
   colnames(merged) <- uniStemmed

   # seq_along() also copes with a table that has no columns
   for (i in seq_along(stemmedFeature)) {
     k <- target[i]
     merged[, k] <- ifelse(merged[, k] + values[, i] > 0, 1, 0)
   }
   as.data.frame(merged)
 }


 # Builds the train/test feature tables for the requested representation.
 #   trainData, testData - preprocessed recipe data frames
 #   type                - "word" (createWordFeature) or "phrase"
 #                         (createPhraseFeature)
 # Returns list(trainFeature, testFeature). Stops with an explicit message
 # for any other `type` instead of failing later on a missing variable.
 # Feature selection and additional engineered features are not applied here;
 # the in-place calls that used to do so were disabled.
 createFeature <- function(trainData, testData, type = "word") {
   featureList <- switch(type,
     word = createWordFeature(trainData, testData),
     phrase = createPhraseFeature(trainData, testData),
     stop("unknown feature type '", type, "'; expected \"word\" or \"phrase\"", call. = FALSE)
   )

   list(trainFeature = featureList$trainFeature, testFeature = featureList$testFeature)
 }


 # Derives extra per-recipe features from an indicator table.
 #   trainFeature - numeric table with one row per recipe
 # Returns a one-column data frame `numOfIngredients` holding the row sums.
 # Ideas noted but not implemented: colour information, ratio of meat.
 featureEngineering <- function(trainFeature) {
   # number of ingredient indicators set in each row
   numOfIngredients <- rowSums(trainFeature)

   # the original referenced an undefined variable here and always failed
   data.frame(numOfIngredients = numOfIngredients)
 }




 # Selects feature columns by an information criterion plus a frequency floor.
 #   trainFeature - data frame of 0/1 indicator columns, one per ingredient
 #   trainLabel   - class label of every row
 #   thresh       - cut-off for the chosen criterion
 #   type         - "mi": keep columns whose mutual information with the label
 #                  exceeds thresh; "entropy": keep columns whose label
 #                  entropy among rows containing the feature is below thresh;
 #                  any other value applies the frequency floor only
 # Returns the indices of the columns that pass the criterion and whose
 # column sum is greater than 10.
 featureSelection <- function(trainFeature, trainLabel, thresh, type = "mi") {
   library(entropy)
   n <- ncol(trainFeature)

   # seq_len() keeps a table with no columns from being indexed as 1 and 0
   featureIndex <- seq_len(n)
   frequencyFeature <- which(colSums(trainFeature) > 10)

   if (type == "entropy") {
     labelEntropy <- vapply(seq_len(n), function(i) {
       present <- which(trainFeature[, i] == 1)
       entropy.empirical(table(trainLabel[present]), unit = "log")
     }, numeric(1))
     featureIndex <- which(labelEntropy < thresh)
   }

   if (type == "mi") {
     mi <- vapply(seq_len(n), function(i) {
       mi.empirical(table(trainLabel, trainFeature[, i]), unit = "log")
     }, numeric(1))
     featureIndex <- which(mi > thresh)
   }

   intersect(featureIndex, frequencyFeature)
 }
diff --git a/main.R b/main.R
 # Main pipeline script setup.
 # NOTE: clears every object from the current workspace before starting.
 rm(list=ls())

 library(jsonlite)
 # All relative paths below are resolved against this project directory.
 setwd("~/Developer/cooking")
 source("./code/preprocessing.R")
 source("./code/classifier.R")
 source("./code/featureEngineering.R")

 # Pipeline entry point.
 #   test       - TRUE: run a set of classifiers on the data cached in
 #                test.RDS, print the result and write test.csv
 #   createNew  - TRUE: rebuild the features from the raw JSON files, run
 #                feature selection and cache everything on disk;
 #                FALSE: load the cached data set, run the classifiers and
 #                write final.csv
 #   type       - feature representation, "word" or "phrase"
 #   featureNum - which saved feature index the createNew = FALSE branch
 #                loads ("small" or "large"); the original read an undefined
 #                variable of this name
 # Called for its side effects; returns NULL invisibly.
 main <- function(test = FALSE, createNew = FALSE, type = "word", featureNum = "small") {
   if (test) {
     tmp <- readRDS("test.RDS")
     matTrain <- tmp[[1]]
     matTest <- tmp[[2]]
     trainLabel <- as.factor(as.character(tmp[[3]]))
     # "xgboost_clf" does not exist; use the quick xgboost configuration
     finalResult <- clf_controller(matTrain, trainLabel, matTest,
       methods = c("knn_clf", "neuralnet_clf", "xgboost_clf_200", "randomForest_clf"))
     print(finalResult)
     write.table(finalResult, file = "test.csv", sep = ",", row.names = FALSE)
   } else if (createNew) {
     # the raw JSON is only needed when the features are rebuilt
     print("reading raw data")
     train_raw <- fromJSON("./data/train.json", flatten = TRUE)
     test_raw <- fromJSON("./data/test.json", flatten = TRUE)

     print("preprocessing data")
     train_raw <- preprocessing(train_raw)
     test_raw <- preprocessing(test_raw)
     saveRDS(list(train_raw, test_raw), "preprocessed.rds")

     trainLabel <- as.factor(train_raw$cuisine)
     testid <- test_raw$id

     print("creating features")
     feature <- createFeature(train_raw, test_raw, type)

     print("saving files")
     saveRDS(list(feature = feature, trainLabel = trainLabel, testid = testid), paste("dataSet_", type, ".rds", sep = ""))

     matTrain <- feature$trainFeature

     # mutual-information cut-off depends on the representation
     thresh <- if (type == "word") exp(-7.5) else exp(-7.8)
     featureIndex <- featureSelection(matTrain, trainLabel, thresh = thresh, type = "mi")

     cat("select", length(featureIndex), "from original feature num", ncol(matTrain), "\n")

     saveRDS(featureIndex, paste("./data/", type, "_featureIndex_small.rds", sep = ""))
     # "large" keeps every column plus the numOfIngredient column added below
     saveRDS(seq_len(ncol(matTrain) + 1), paste("./data/", type, "_featureIndex_large.rds", sep = ""))

     # feature engineering: number of ingredients per recipe
     feature$trainFeature$numOfIngredient <- unlist(lapply(train_raw$ingredients, length))
     feature$testFeature$numOfIngredient <- unlist(lapply(test_raw$ingredients, length))

     saveRDS(list(feature = feature, trainLabel = trainLabel, testid = testid), paste("dataSet_final_", type, ".rds", sep = ""))
   } else {
     print("reading the saved data")
     tmp <- readRDS(paste("dataSet_final_", type, ".rds", sep = ""))
     trainLabel <- tmp$trainLabel
     testid <- tmp$testid
     matTrain <- tmp$feature$trainFeature
     matTest <- tmp$feature$testFeature
     rm(tmp)

     featureIndex <- readRDS(paste("./data/", type, "_featureIndex_", featureNum, ".rds", sep = ""))

     # drop = FALSE keeps a data frame even when a single feature is selected
     matTrain <- matTrain[, featureIndex, drop = FALSE]
     matTest <- matTest[, featureIndex, drop = FALSE]

     # the radial SVM is defined as svm_clf_radial, not svm_linear_radial
     methodList <- c("xgboost_clf_200", "xgboost_clf_1000", "navieBaye_clf", "svm_linear_clf", "svm_clf_radial")

     finalResult <- clf_controller(matTrain, trainLabel, matTest, methods = methodList)

     finalResult$id <- testid
     write.table(finalResult, file = "final.csv", sep = ",", row.names = FALSE)
   }

   invisible(NULL)
 }



 # Script entry point. The active call rebuilds and caches the phrase-level
 # data set (the third, positional argument is `type`); the alternative
 # invocations are kept commented out.
 # main(test=FALSE,createNew=TRUE,"word")
 main(test=FALSE,createNew=TRUE,"phrase")
 # main(test=FALSE,createNew=FALSE,"word")


 			# Train_colorNum <- rowSums(matTrain[,c("red","yellow","black","green","white","blue","purpl")])
 			# Train_numOfWords <- rowSums(matTrain)
diff --git a/preprocessing.R b/preprocessing.R
 # Normalises the ingredient strings of a recipe data frame.
 #   data - data frame whose `ingredients` column is a list of character
 #          vectors (one vector per recipe)
 # Each string is lower-cased, "-" and "_" become spaces, selected multi-word
 # terms are joined with "_" so they survive as single tokens, some brand
 # names are removed, everything up to a "®" is dropped, characters other than
 # a-z, "_" and space are deleted, and one leading space is trimmed.
 # Returns `data` with the cleaned `ingredients` column.
 preprocessing <- function(data) {
   # (pattern, replacement) pairs, applied in order with gsub()
   rules <- rbind(
     c("-", " "),
     c("_", " "),
     # must run before "low fat" is rewritten, otherwise it can never match
     c("not low fat", "fat"),
     c("low fat", "low_fat"),
     c("all purpose", "all_purpose"),
     c("reduced fat", "reduced_fat"),
     c("olive oil", "olive_oil"),
     c("no calorie", "no_calorie"),
     c("no salt added", "no_salt_added"),
     c("greekstyl", "greek style"),
     # whole word only, so "dressing" is not turned into "dressinging"
     c("\\bdress\\b", "dressing"),
     c("kraft", ""),
     c("color", "colour"),
     c(" ic ", " ice "),
     c("lowfat", "low_fat"),
     c("knorr homestyl stock", ""),
     c("half & half", "half_half"),
     c("gluten free", "gluten_free"),
     c("long grain", "long_grain"),
     c("extra virgin", "extra_virgin"),
     c("low sodium", "low_sodium"),
     c("canola oil", "canola_oil"),
     c("five spice", "five_spice"),
     c("free range", "free_range"),
     c("fat free", "fat_free"),
     c("cream of coconut", "coconut cream"),
     # drop everything up to and including a registered-trademark sign
     c("^(.*?)®", ""),
     c("[^a-z_ ]", ""),
     c("^ ", "")
   )

   cleanOne <- function(x) {
     x <- tolower(x)
     for (k in seq_len(nrow(rules))) {
       x <- gsub(rules[k, 1], rules[k, 2], x)
     }
     x
   }

   data$ingredients <- lapply(data$ingredients, FUN = cleanOne)
   data
 }
	# Random-forest classifier: fits a 500-tree forest on (matTrain, trainLabel)
	# and returns its predictions for matTest. Extra arguments are ignored.
	randomForest_clf <- function(matTrain, trainLabel, matTest, ...) {
	  library(randomForest)
	  forest <- randomForest(matTrain, trainLabel, ntree = 500)
	  predict(forest, matTest)
	}


	# xgboost multi-class classifier (max depth 6, eta 0.2, 1000 rounds,
	# "multi:softmax"). Labels are encoded as 0-based class ids for training and
	# the predicted ids are decoded back to label names before returning.
	# Extra arguments are ignored.
	xgboost_clf_1000 <- function(matTrain, trainLabel, matTest, ...) {
	  library(xgboost)
	  library(SnowballC)

	  # label name -> 0-based class id, in order of first appearance
	  classes <- unique(trainLabel)
	  nclass <- length(classes)
	  labelToId <- (1:nclass) - 1
	  names(labelToId) <- classes

	  trainIds <- labelToId[as.character(trainLabel)]
	  dtrain <- xgb.DMatrix(as.matrix(matTrain), label = trainIds)
	  booster <- xgboost(dtrain, max.depth = 6, eta = 0.2, nround = 1000, objective = "multi:softmax", num_class = nclass)

	  # predicted ids are 0-based, R indexing is 1-based
	  predictedIds <- predict(booster, newdata = as.matrix(matTest))
	  names(labelToId[predictedIds + 1])
	}

	# xgboost multi-class classifier (max depth 25, eta 0.3, "multi:softmax").
	# Note that, despite the function name, only 5 boosting rounds are run.
	# Labels are encoded as 0-based class ids for training and the predicted ids
	# are decoded back to label names. Extra arguments are ignored.
	xgboost_clf_200 <- function(matTrain, trainLabel, matTest, ...) {
	  library(xgboost)
	  library(SnowballC)

	  # label name -> 0-based class id, in order of first appearance
	  classes <- unique(trainLabel)
	  nclass <- length(classes)
	  labelToId <- (1:nclass) - 1
	  names(labelToId) <- classes

	  trainIds <- labelToId[as.character(trainLabel)]
	  dtrain <- xgb.DMatrix(as.matrix(matTrain), label = trainIds)
	  booster <- xgboost(dtrain, max.depth = 25, eta = 0.3, nround = 5, objective = "multi:softmax", num_class = nclass)

	  # predicted ids are 0-based, R indexing is 1-based
	  predictedIds <- predict(booster, newdata = as.matrix(matTest))
	  names(labelToId[predictedIds + 1])
	}



	# k-nearest-neighbour classifier (k = 5) via caret's knn3Train; returns the
	# result of knn3Train for the rows of matTest. Extra arguments are ignored.
	knn_clf <- function(matTrain, trainLabel, matTest, ...) {
	  library(caret)
	  knn3Train(matTrain, matTest, trainLabel, k = 5, prob = FALSE)
	}


	# Neural-network classifier (neuralnet, hidden = c(5, 3)).
	# The labels are one-hot encoded, one output unit is trained per class, and
	# each test row is assigned the class whose output is largest.
	# Returns a character vector of predicted labels; extra arguments are ignored.
	neuralnet_clf <- function(matTrain, trainLabel, matTest, ...) {
	  library(neuralnet)

	  # model.matrix names the indicator columns "trainLabel<level>"
	  newY <- as.data.frame(model.matrix(~ -1 + trainLabel))
	  targetNames <- names(newY)

	  trainData <- cbind(matTrain, newY)
	  allNames <- names(trainData)
	  featureNames <- allNames[!allNames %in% targetNames]
	  f <- as.formula(paste(paste(targetNames, collapse = "+"), "~", paste(featureNames, collapse = " + ")))

	  # TRUE rather than T: T is an ordinary variable that can be reassigned
	  nn <- neuralnet(f, data = trainData, hidden = c(5, 3), linear.output = TRUE)
	  scores <- compute(nn, matTest)$net.result

	  # strip the "trainLabel" prefix to recover the plain class names
	  colnames(scores) <- substring(targetNames, nchar("trainLabel") + 1L)
	  colnames(scores)[apply(scores, 1, which.max)]
	}



	# Runs several classifiers on the same data and collects their predictions.
	#   matTrain, trainLabel - training features and labels
	#   matTest              - test features
	#   methods              - names of classifier functions; each is called as
	#                          f(matTrain, trainLabel, matTest)
	#   ...                  - accepted for interface compatibility, not forwarded
	# Returns a data frame with an `id` column (1..nrow(matTest)) and one column
	# of predictions per method, named after the method.
	clf_controller <- function(matTrain, trainLabel, matTest, methods = c("knn_clf", "neuralnet_clf"), ...) {
	  # seq_len() gives an empty id column for an empty test set (1:0 would not)
	  finalResult <- data.frame(id = seq_len(nrow(matTest)))
	  for (clf in methods) {
	    print(paste("doing", clf))
	    # look the function up by name instead of building and eval()-ing code text
	    finalResult[clf] <- get(clf, mode = "function")(matTrain, trainLabel, matTest)
	  }
	  finalResult
	}

	# Support-vector classifier with a radial kernel (e1071::svm); returns the
	# model's predictions for matTest. Extra arguments are ignored.
	svm_clf_radial <- function(matTrain, trainLabel, matTest, ...) {
	  library("e1071")
	  fit <- svm(matTrain, trainLabel, kernel = "radial", degree = 3)
	  predict(fit, newdata = matTest)
	}

	# Support-vector classifier with a linear kernel (e1071::svm); returns the
	# model's predictions for matTest. Extra arguments are ignored.
	svm_linear_clf <- function(matTrain, trainLabel, matTest, ...) {
	  library("e1071")
	  fit <- svm(matTrain, trainLabel, kernel = "linear")
	  predict(fit, newdata = matTest)
	}

	# Naive Bayes classifier (e1071::naiveBayes); returns the model's predictions
	# for matTest. Extra arguments are ignored.
	navieBaye_clf <- function(matTrain, trainLabel, matTest, ...) {
	  library("e1071")
	  fit <- naiveBayes(matTrain, trainLabel)
	  predict(fit, newdata = matTest)
	}
	# Cross-validation script setup.
	# NOTE: clears every object from the current workspace before starting.
	rm(list=ls())
	library(jsonlite)
	library(caret)

	# All relative paths below are resolved against this project directory.
	setwd("~/Developer/cooking")
	source("./code/preprocessing.R")
	source("./code/classifier.R")
	source("./code/featureEngineering.R")

	# Raw data; trainLabel is left in the global environment because
	# createSplit() reads it from there.
	train_raw <- fromJSON("./data/train.json", flatten = TRUE)
	test_raw <- fromJSON("./data/test.json", flatten = TRUE)
	trainLabel <- as.factor(train_raw$cuisine)

	# Draws five random train partitions of the global `trainLabel` with
	# caret::createDataPartition (fraction p in each) and stores the list of
	# index matrices in ./data/cv_trainIndex.rds.
	createSplit <- function(p = 0.8) {
	  partitions <- list()
	  for (fold in 1:5) {
	    partitions[[fold]] <- createDataPartition(trainLabel, p = p, list = FALSE)
	  }
	  saveRDS(partitions, "./data/cv_trainIndex.rds")
	}


	# Scores the configured classifiers on the saved train/validation splits.
	#   type       - feature representation; selects dataSet_final_<type>.rds
	#   featureNum - which saved feature index to use
	#                (./data/<type>_featureIndex_<featureNum>.rds)
	# Returns a matrix of accuracies with one row per split and one column per
	# classifier.
	cv_crossValidation <- function(type, featureNum) {
	  saved <- readRDS(paste("dataSet_final_", type, ".rds", sep = ""))
	  trainLabel <- saved$trainLabel
	  matTrain <- saved$feature$trainFeature
	  rm(saved)

	  splitList <- readRDS("./data/cv_trainIndex.rds")
	  featureIndex <- readRDS(paste("./data/", type, "_featureIndex_", featureNum, ".rds", sep = ""))

	  # Other available classifiers:
	  # "xgboost_clf_1000", "navieBaye_clf", "svm_linear_clf", "svm_clf_radial"
	  methodList <- c("xgboost_clf_200")

	  # One row per stored split, rather than assuming there are exactly five.
	  scores <- matrix(0, length(splitList), length(methodList))
	  colnames(scores) <- methodList

	  for (i in seq_along(splitList)) {
	    trainIndex <- splitList[[i]]
	    # drop = FALSE keeps a data frame even when a single feature is selected
	    subTrain <- matTrain[trainIndex, featureIndex, drop = FALSE]
	    subTest <- matTrain[-trainIndex, featureIndex, drop = FALSE]
	    subTrainLabel <- trainLabel[trainIndex]
	    subTestLabel <- trainLabel[-trainIndex]

	    oneFoldResults <- clf_controller(subTrain, subTrainLabel, subTest, methods = methodList)
	    for (j in seq_along(methodList)) {
	      # accuracy = share of held-out rows predicted correctly
	      scores[i, j] <- sum(oneFoldResults[, methodList[j]] == subTestLabel) / length(subTestLabel)
	    }
	  }

	  scores
	}

	# Runs cross-validation for every (feature type, feature set size) pair,
	# stores all four score matrices in cvResults.rds and writes each one to its
	# own file.
	# NOTE(review): write.table is called with its default separator, so the
	# .csv files are presumably not comma-delimited - confirm this is intended.
	cv_main <- function() {
	  word_score_smallFeature <- cv_crossValidation("word", "small")
	  word_score_largeFeature <- cv_crossValidation("word", "large")

	  phrase_score_smallFeature <- cv_crossValidation("phrase", "small")
	  phrase_score_largeFeature <- cv_crossValidation("phrase", "large")

	  allScores <- list(
	    word_score_smallFeature,
	    word_score_largeFeature,
	    phrase_score_smallFeature,
	    phrase_score_largeFeature
	  )
	  saveRDS(allScores, "cvResults.rds")

	  write.table(word_score_largeFeature, "word_score_largeFeature.csv")
	  write.table(word_score_smallFeature, "word_score_smallFeature.csv")
	  write.table(phrase_score_largeFeature, "phrase_score_largeFeature.csv")
	  write.table(phrase_score_smallFeature, "phrase_score_smallFeature.csv")
	}

	# Run the full cross-validation as soon as this script is sourced.
	cv_main()
	library(tm)


	# Builds binary word-level features from the `ingredients` column of both
	# data sets: a joint document-term matrix is binarised, split back into train
	# and test rows, stemmed with featureStemming(), and a few function words are
	# dropped. Returns list(trainFeature, testFeature).
	createWordFeature <- function(trainData, testData) {
	  corpus <- c(Corpus(VectorSource(trainData$ingredients)), Corpus(VectorSource(testData$ingredients)))
	  termCounts <- as.matrix(DocumentTermMatrix(corpus))
	  presence <- as.data.frame(ifelse(termCounts > 0, 1, 0))

	  # the first nrow(trainData) documents are the training recipes
	  trainRows <- 1:nrow(trainData)
	  trainFeature <- featureStemming(presence[trainRows, ])
	  testFeature <- featureStemming(presence[-trainRows, ])

	  # remove functional words
	  keep <- !(names(trainFeature) %in% c("for", "in", "all", "with"))

	  list(trainFeature = trainFeature[, keep], testFeature = testFeature[, keep])
	}


	# Builds one binary indicator column per distinct ingredient phrase seen in
	# either data set.
	#   trainData, testData - data frames whose `ingredients` column is a list of
	#                         character vectors (one vector per recipe)
	# Returns list(trainFeature, testFeature): data frames with identical
	# columns, where a cell is 1 when the recipe contains the phrase, else 0.
	createPhraseFeature <- function(trainData, testData) {
	  uniqueLabel <- unique(c(unlist(trainData$ingredients), unlist(testData$ingredients)))

	  # Indicator table for one data set. Columns are located with match()
	  # rather than by name so that an empty-string phrase still resolves, and
	  # seq_len() keeps a zero-row data set from being indexed as rows 1 and 0.
	  buildIndicator <- function(data) {
	    indicator <- matrix(0, nrow(data), length(uniqueLabel))
	    colnames(indicator) <- uniqueLabel
	    for (i in seq_len(nrow(data))) {
	      indicator[i, match(data$ingredients[[i]], uniqueLabel)] <- 1
	    }
	    as.data.frame(indicator)
	  }

	  list(trainFeature = buildIndicator(trainData), testFeature = buildIndicator(testData))
	}


	# Merges indicator columns whose names share the same stem.
	#   data - data frame of 0/1 indicator columns named by word
	# Returns a data frame with one column per distinct stem (SnowballC's
	# wordStem of the column names); a cell is 1 when any of the merged source
	# columns is positive in that row, else 0.
	featureStemming <- function(data) {
	  library(SnowballC)
	  stemmedFeature <- wordStem(names(data))
	  uniStemmed <- unique(stemmedFeature)
	  # target column (by position) for every source column
	  target <- match(stemmedFeature, uniStemmed)

	  # Accumulate on plain matrices: assigning data-frame columns one at a time
	  # inside the loop is far slower for wide tables.
	  values <- as.matrix(data)
	  merged <- matrix(0, nrow = nrow(data), ncol = length(uniStemmed))
	  colnames(merged) <- uniStemmed

	  # seq_along() also copes with a table that has no columns
	  for (i in seq_along(stemmedFeature)) {
	    k <- target[i]
	    merged[, k] <- ifelse(merged[, k] + values[, i] > 0, 1, 0)
	  }
	  as.data.frame(merged)
	}


	# Builds the train/test feature tables for the requested representation.
	#   trainData, testData - preprocessed recipe data frames
	#   type                - "word" (createWordFeature) or "phrase"
	#                         (createPhraseFeature)
	# Returns list(trainFeature, testFeature). Stops with an explicit message
	# for any other `type` instead of failing later on a missing variable.
	# Feature selection and additional engineered features are not applied here;
	# the in-place calls that used to do so were disabled.
	createFeature <- function(trainData, testData, type = "word") {
	  featureList <- switch(type,
	    word = createWordFeature(trainData, testData),
	    phrase = createPhraseFeature(trainData, testData),
	    stop("unknown feature type '", type, "'; expected \"word\" or \"phrase\"", call. = FALSE)
	  )

	  list(trainFeature = featureList$trainFeature, testFeature = featureList$testFeature)
	}


	# Derives extra per-recipe features from an indicator table.
	#   trainFeature - numeric table with one row per recipe
	# Returns a one-column data frame `numOfIngredients` holding the row sums.
	# Ideas noted but not implemented: colour information, ratio of meat.
	featureEngineering <- function(trainFeature) {
	  # number of ingredient indicators set in each row
	  numOfIngredients <- rowSums(trainFeature)

	  # the original referenced an undefined variable here and always failed
	  data.frame(numOfIngredients = numOfIngredients)
	}




	# Selects feature columns by an information criterion plus a frequency floor.
	#   trainFeature - data frame of 0/1 indicator columns, one per ingredient
	#   trainLabel   - class label of every row
	#   thresh       - cut-off for the chosen criterion
	#   type         - "mi": keep columns whose mutual information with the label
	#                  exceeds thresh; "entropy": keep columns whose label
	#                  entropy among rows containing the feature is below thresh;
	#                  any other value applies the frequency floor only
	# Returns the indices of the columns that pass the criterion and whose
	# column sum is greater than 10.
	featureSelection <- function(trainFeature, trainLabel, thresh, type = "mi") {
	  library(entropy)
	  n <- ncol(trainFeature)

	  # seq_len() keeps a table with no columns from being indexed as 1 and 0
	  featureIndex <- seq_len(n)
	  frequencyFeature <- which(colSums(trainFeature) > 10)

	  if (type == "entropy") {
	    labelEntropy <- vapply(seq_len(n), function(i) {
	      present <- which(trainFeature[, i] == 1)
	      entropy.empirical(table(trainLabel[present]), unit = "log")
	    }, numeric(1))
	    featureIndex <- which(labelEntropy < thresh)
	  }

	  if (type == "mi") {
	    mi <- vapply(seq_len(n), function(i) {
	      mi.empirical(table(trainLabel, trainFeature[, i]), unit = "log")
	    }, numeric(1))
	    featureIndex <- which(mi > thresh)
	  }

	  intersect(featureIndex, frequencyFeature)
	}
	# Normalises the ingredient strings of a recipe data frame.
	#   data - data frame whose `ingredients` column is a list of character
	#          vectors (one vector per recipe)
	# Each string is lower-cased, "-" and "_" become spaces, selected multi-word
	# terms are joined with "_" so they survive as single tokens, some brand
	# names are removed, everything up to a "®" is dropped, characters other than
	# a-z, "_" and space are deleted, and one leading space is trimmed.
	# Returns `data` with the cleaned `ingredients` column.
	preprocessing <- function(data) {
	  # (pattern, replacement) pairs, applied in order with gsub()
	  rules <- rbind(
	    c("-", " "),
	    c("_", " "),
	    # must run before "low fat" is rewritten, otherwise it can never match
	    c("not low fat", "fat"),
	    c("low fat", "low_fat"),
	    c("all purpose", "all_purpose"),
	    c("reduced fat", "reduced_fat"),
	    c("olive oil", "olive_oil"),
	    c("no calorie", "no_calorie"),
	    c("no salt added", "no_salt_added"),
	    c("greekstyl", "greek style"),
	    # whole word only, so "dressing" is not turned into "dressinging"
	    c("\\bdress\\b", "dressing"),
	    c("kraft", ""),
	    c("color", "colour"),
	    c(" ic ", " ice "),
	    c("lowfat", "low_fat"),
	    c("knorr homestyl stock", ""),
	    c("half & half", "half_half"),
	    c("gluten free", "gluten_free"),
	    c("long grain", "long_grain"),
	    c("extra virgin", "extra_virgin"),
	    c("low sodium", "low_sodium"),
	    c("canola oil", "canola_oil"),
	    c("five spice", "five_spice"),
	    c("free range", "free_range"),
	    c("fat free", "fat_free"),
	    c("cream of coconut", "coconut cream"),
	    # drop everything up to and including a registered-trademark sign
	    c("^(.*?)®", ""),
	    c("[^a-z_ ]", ""),
	    c("^ ", "")
	  )

	  cleanOne <- function(x) {
	    x <- tolower(x)
	    for (k in seq_len(nrow(rules))) {
	      x <- gsub(rules[k, 1], rules[k, 2], x)
	    }
	    x
	  }

	  data$ingredients <- lapply(data$ingredients, FUN = cleanOne)
	  data
	}