Created
September 22, 2016 03:04
-
-
Save stormxuwz/05ace10b6ab3e1f28493c68d65ad4350 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
randomForest_clf <- function(matTrain,trainLabel,matTest,...){ | |
library(randomForest) | |
rf_model<-randomForest(matTrain,trainLabel,ntree=500) | |
rf_predict<-predict(rf_model,matTest) | |
return(rf_predict) | |
} | |
xgboost_clf_1000 <- function(matTrain,trainLabel,matTest,...){ | |
library(xgboost) | |
library(SnowballC) | |
# create label to index mapping | |
uniqueLabel <- unique(trainLabel) | |
nclass <- length(uniqueLabel) | |
indexIngredientMapping <- c(1:nclass)-1 | |
names(indexIngredientMapping) <- uniqueLabel | |
dtrain <- xgb.DMatrix(as.matrix(matTrain), label=indexIngredientMapping[as.character(trainLabel)]) | |
xgb_model <- xgboost(dtrain, max.depth = 6, eta = 0.2, nround = 1000, objective = "multi:softmax", num_class = nclass) | |
xgboost_predict <- predict(xgb_model, newdata = as.matrix(matTest)) | |
xgboost_predict<-names(indexIngredientMapping[xgboost_predict+1]) | |
return(xgboost_predict) | |
} | |
xgboost_clf_200 <- function(matTrain,trainLabel,matTest,...){ | |
library(xgboost) | |
library(SnowballC) | |
# create label to index mapping | |
uniqueLabel <- unique(trainLabel) | |
nclass <- length(uniqueLabel) | |
indexIngredientMapping <- c(1:nclass)-1 | |
names(indexIngredientMapping) <- uniqueLabel | |
dtrain <- xgb.DMatrix(as.matrix(matTrain), label=indexIngredientMapping[as.character(trainLabel)]) | |
xgb_model <- xgboost(dtrain, max.depth = 25, eta = 0.3, nround = 5, objective = "multi:softmax", num_class = nclass) | |
xgboost_predict <- predict(xgb_model, newdata = as.matrix(matTest)) | |
xgboost_predict<-names(indexIngredientMapping[xgboost_predict+1]) | |
return(xgboost_predict) | |
} | |
knn_clf <- function(matTrain,trainLabel,matTest,...){ | |
library(caret) | |
knn_predict<-knn3Train(matTrain,matTest,trainLabel,k=5,prob=FALSE) | |
return(knn_predict) | |
} | |
neuralnet_clf <- function(matTrain,trainLabel,matTest,...){ | |
library(neuralnet) | |
newY <- as.data.frame(model.matrix(~-1+trainLabel)) | |
matTrain <- cbind(matTrain,newY) | |
n <- names(matTrain) | |
f <- as.formula(paste(paste(names(newY),collapse="+"),"~", paste(n[!n %in% names(newY)], collapse = " + "))) | |
nn <- neuralnet(f,data=matTrain,hidden=c(5,3),linear.output=T) | |
nn_predict <- compute(nn,matTest)$net.result | |
colnames(nn_predict) <- substring(names(newY),11) | |
final <- colnames(nn_predict)[apply(nn_predict, 1, which.max)] | |
return(final) | |
} | |
clf_controller <- function(matTrain,trainLabel,matTest,methods=c("knn_clf","neuralnet_clf"),...){ | |
finalResult <- data.frame(id=1:nrow(matTest)) | |
for(clf in methods){ | |
print(paste("doing",clf)) | |
expre <- paste(clf,"(matTrain,trainLabel,matTest)",sep="") | |
prediction <- eval(parse(text=expre)) | |
finalResult[clf] <- prediction | |
# write.table(finalResult,file="tmp_results.csv",sep=",",row.names = FALSE) | |
} | |
return(finalResult) | |
} | |
svm_clf_radial <- function(matTrain,trainLabel,matTest,...){ | |
library( 'e1071' ) | |
model <- svm(matTrain,trainLabel,kernel="radial",degree=3) | |
res <- predict( model, newdata=matTest) | |
return(res) | |
} | |
svm_linear_clf <- function(matTrain,trainLabel,matTest,...){ | |
library( 'e1071' ) | |
model <- svm(matTrain,trainLabel,kernel="linear") | |
res <- predict( model, newdata=matTest) | |
return(res) | |
} | |
navieBaye_clf <- function(matTrain,trainLabel,matTest,...){ | |
library( 'e1071' ) | |
model <- naiveBayes(matTrain,trainLabel) | |
res <- predict( model, newdata=matTest) | |
return(res) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rm(list=ls()) | |
library(jsonlite) | |
library(caret) | |
setwd("~/Developer/cooking") | |
source("./code/preprocessing.R") | |
source("./code/classifier.R") | |
source("./code/featureEngineering.R") | |
train_raw <- fromJSON("./data/train.json", flatten = TRUE) | |
test_raw <- fromJSON("./data/test.json", flatten = TRUE) | |
trainLabel <- as.factor(train_raw$cuisine) | |
createSplit <- function(p=0.8){ | |
splitList=list() | |
for(i in 1:5){ | |
train.index <- createDataPartition(trainLabel, p = p, list = FALSE) | |
splitList[[i]]=train.index | |
} | |
saveRDS(splitList,"./data/cv_trainIndex.rds") | |
} | |
cv_crossValidation <- function(type,featureNum){ | |
tmp <- readRDS(paste("dataSet_final_",type,".rds",sep="")) | |
trainLabel <- tmp$trainLabel | |
feature <- tmp$feature | |
rm(tmp) | |
matTrain<-feature$trainFeature | |
rm(feature) | |
splitList <- readRDS("./data/cv_trainIndex.rds") | |
featureIndex <- readRDS(paste("./data/",type,"_featureIndex_",featureNum,".rds",sep="")) | |
# methodList <- c("xgboost_clf_200","xgboost_clf_1000","navieBaye_clf","svm_linear_clf","svm_linear_radial") | |
methodList <- c("xgboost_clf_200") | |
scores <- matrix(0,5,length(methodList)) | |
colnames(scores) <- methodList | |
for(i in 1:5){ | |
trainIndex <- splitList[[i]] | |
subTrain <- matTrain[trainIndex,featureIndex] | |
subTest <- matTrain[-trainIndex,featureIndex] | |
subTrainLabel <- trainLabel[trainIndex] | |
subTestLabel <- trainLabel[-trainIndex] | |
oneFoldResults <- clf_controller(subTrain,subTrainLabel,subTest, | |
methods=methodList) | |
for(j in 1:length(methodList)){ | |
scores[i,j]=sum(oneFoldResults[,methodList[j]]==subTestLabel)/length(subTestLabel) | |
} | |
} | |
return(scores) | |
} | |
cv_main <- function(){ | |
word_score_smallFeature <- cv_crossValidation("word","small") | |
word_score_largeFeature <- cv_crossValidation("word","large") | |
phrase_score_smallFeature <- cv_crossValidation("phrase","small") | |
phrase_score_largeFeature <- cv_crossValidation("phrase","large") | |
saveRDS(list(word_score_smallFeature,word_score_largeFeature,phrase_score_smallFeature,phrase_score_largeFeature),"cvResults.rds") | |
write.table(word_score_largeFeature,"word_score_largeFeature.csv") | |
write.table(word_score_smallFeature,"word_score_smallFeature.csv") | |
write.table(phrase_score_largeFeature,"phrase_score_largeFeature.csv") | |
write.table(phrase_score_smallFeature,"phrase_score_smallFeature.csv") | |
} | |
# run | |
cv_main() | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tm) | |
createWordFeature <- function(trainData,testData){ | |
c_ingredients <- c(Corpus(VectorSource(trainData$ingredients)), Corpus(VectorSource(testData$ingredients))) | |
c_ingredientsDTM <- DocumentTermMatrix(c_ingredients) | |
c_ingredientsDTM <- as.matrix(c_ingredientsDTM) | |
c_ingredientsDTM <- ifelse(c_ingredientsDTM>0,1,0) | |
c_ingredientsDTM <- as.data.frame(c_ingredientsDTM) | |
# c_ingredientsDTM$ingredients_count <- rowSums(c_ingredientsDTM) | |
trainFeature<-c_ingredientsDTM[1:nrow(trainData),] | |
testFeature<-c_ingredientsDTM[-(1:nrow(trainData)),] | |
trainFeature <- featureStemming(trainFeature) | |
testFeature <- featureStemming(testFeature) | |
# remove functional words | |
funcIndex <- names(trainFeature) %in% c("for","in","all","with") | |
trainFeature <- trainFeature[,!funcIndex] | |
testFeature <- testFeature[,!funcIndex] | |
# This part is important | |
return(list(trainFeature=trainFeature,testFeature=testFeature)) | |
} | |
createPhraseFeature <- function(trainData,testData){ | |
uniqueLabel <- unique(c(unlist(trainData$ingredients),unlist(testData$ingredients))) | |
trainFeature <- matrix(0,nrow(trainData),length(uniqueLabel)) | |
testFeature <- matrix(0,nrow(testData),length(uniqueLabel)) | |
colnames(trainFeature) <- uniqueLabel | |
colnames(testFeature) <- uniqueLabel | |
for(i in 1:nrow(trainData)){ | |
trainFeature[i,trainData$ingredients[[i]]]=1 | |
} | |
for(i in 1:nrow(testData)){ | |
testFeature[i,testData$ingredients[[i]]]=1 | |
} | |
# colFeatureCol <- colSums(trainFeature) | |
return(list(trainFeature=as.data.frame(trainFeature),testFeature=as.data.frame(testFeature))) | |
} | |
featureStemming <- function(data){ | |
library(SnowballC) | |
n <- nrow(data) | |
stemmedFeature <- wordStem(names(data)) | |
uniStemmed <- unique(stemmedFeature) | |
newData <- matrix(0,nrow=n,ncol=length(uniStemmed)) | |
colnames(newData) <- uniStemmed | |
newData<- as.data.frame(newData) | |
for(i in 1:length(stemmedFeature)){ | |
newData[,stemmedFeature[i]] <- ifelse(newData[,stemmedFeature[i]]+data[,i]>0,1,0) | |
} | |
return(newData) | |
} | |
createFeature<-function(trainData,testData,type="word"){ | |
if(type=="word"){ | |
featureList <- createWordFeature(trainData,testData) | |
}else if(type=="phrase"){ | |
featureList <- createPhraseFeature(trainData,testData) | |
} | |
trainFeature <- featureList$trainFeature | |
testFeature <- featureList$testFeature | |
# Do feature Selection | |
# print("Do Feature Selection") | |
# featureIndex <- featureSelection(trainFeature,trainData$cuisine,thresh=2,type="mi") | |
# trainFeature <- trainFeature[,featureIndex] | |
# testFeature <- testFeature[,featureIndex] | |
# # Do feature engineering | |
# print("Do Feature Engineering") | |
# trainFeature <- cbind(trainFeature,featureEngineering(trainFeature)) | |
# testFeature <- cbind(testFeature,featureEngineering(testFeature)) | |
return(list(trainFeature=trainFeature,testFeature=testFeature)) | |
} | |
featureEngineering <- function(trainFeature){ | |
# adding ingredient num | |
numOfIngredients <- rowSums(trainFeature) | |
# adding color information | |
# adding the ratio of meet | |
additionalFeature <- data.frame(numOfIngredients=numOfinvidualWords) | |
return(additionalFeature) | |
} | |
featureSelection <- function(trainFeature,trainLabel,thresh,type="mi"){ | |
library(entropy) | |
# feature is a data frame with each ingredients | |
n=ncol(trainFeature); | |
featureIndex <- 1:ncol(trainFeature) | |
featureColSum <- colSums(trainFeature) | |
frequencyFeature <- which(featureColSum>10) | |
if(type=="entropy"){ | |
entropy=rep(0,n); | |
for (i in 1:n){ | |
index=which(trainFeature[,i]==1); | |
entropy[i]=entropy.empirical(table(trainLabel[index]),unit="log"); | |
} | |
featureIndex=which(entropy<thresh); | |
} | |
if(type=="mi"){ | |
mi=rep(0,n); | |
for (i in 1:n){ | |
mi[i]=mi.empirical(table(trainLabel,trainFeature[,i]),unit="log"); | |
} | |
featureIndex=which(mi>thresh); | |
} | |
featureIndex <- intersect(featureIndex,frequencyFeature) | |
return(featureIndex) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rm(list=ls()) | |
library(jsonlite) | |
setwd("~/Developer/cooking") | |
source("./code/preprocessing.R") | |
source("./code/classifier.R") | |
source("./code/featureEngineering.R") | |
main <- function(test=FALSE,createNew=FALSE,type="word"){ | |
print("reading raw data") | |
train_raw <- fromJSON("./data/train.json", flatten = TRUE) | |
test_raw <- fromJSON("./data/test.json", flatten = TRUE) | |
if(test){ | |
tmp <- readRDS("test.RDS") | |
matTrain <- tmp[[1]] | |
matTest <- tmp[[2]] | |
trainLabel <- as.factor(as.character(tmp[[3]])) | |
finalResult <- clf_controller(matTrain,trainLabel,matTest, | |
methods=c("knn_clf","neuralnet_clf","xgboost_clf","randomForest_clf")) | |
print(finalResult) | |
write.table(finalResult,file="test.csv",sep=",",row.names = FALSE) | |
} | |
else{ | |
if(createNew){ | |
print("preprocessing data") | |
train_raw <- preprocessing(train_raw) | |
test_raw <- preprocessing(test_raw) | |
saveRDS(list(train_raw,test_raw),"preprocessed.rds") | |
trainLabel<-as.factor(train_raw$cuisine) | |
testid <- test_raw$id | |
print("creating features") | |
feature <- createFeature(train_raw,test_raw,type) | |
print("saving files") | |
saveRDS(list(feature=feature,trainLabel=trainLabel,testid=testid),paste("dataSet_",type,".rds",sep="")) | |
tmp <- readRDS(paste("dataSet_",type,".rds",sep="")) | |
feature <- tmp$feature | |
matTrain<-feature$trainFeature | |
matTest<-feature$testFeature | |
Train_numOfIngredients <- unlist(lapply(train_raw$ingredients,length)) | |
Test_numOfIngredients <- unlist(lapply(test_raw$ingredients,length)) | |
if(type=="word") | |
featureIndex <- featureSelection(matTrain,trainLabel,thresh=exp(-7.5),type="mi") | |
else | |
featureIndex <- featureSelection(matTrain,trainLabel,thresh=exp(-7.8),type="mi") | |
cat("select",length(featureIndex),"from original feature num",ncol(matTrain)) | |
saveRDS(featureIndex,paste("./data/",type,"_featureIndex_small.rds",sep="")) | |
saveRDS(c(1:(ncol(matTrain)+1)),paste("./data/",type,"_featureIndex_large.rds",sep="")) | |
# feature Engineering | |
feature$trainFeature$numOfIngredient <- Train_numOfIngredients | |
feature$testFeature$numOfIngredient <- Test_numOfIngredients | |
saveRDS(list(feature=feature,trainLabel=trainLabel,testid=testid),paste("dataSet_final_",type,".rds",sep="")) | |
}else{ | |
print("reading the saved data") | |
tmp <- readRDS(paste("dataSet_final_",type,".rds",sep="")) | |
trainLabel <- tmp$trainLabel | |
testid <- tmp$testid | |
feature <- tmp$feature | |
rm(tmp) | |
matTrain<-feature$trainFeature | |
matTest<-feature$testFeature | |
rm(feature) | |
featureIndex <- readRDS(paste("./data/",type,"_featureIndex_",featureNum,".rds",sep="")) | |
matTrain <- matTrain[,featureIndex] | |
matTest <- matTest[,featureIndex] | |
methodList <- c("xgboost_clf_200","xgboost_clf_1000","navieBaye_clf","svm_linear_clf","svm_linear_radial") | |
finalResult <- clf_controller(matTrain,trainLabel,matTest, | |
methods=methodList) | |
finalResult$id <- testid | |
write.table(finalResult,file="final.csv",sep=",",row.names = FALSE) | |
} | |
} | |
# if(test){ | |
# matTrain <- matTrain[1:100,1:100] | |
# matTest <- matTest[1:100,1:100] | |
# trainLabel <- trainLabel[1:100] | |
# saveRDS(list(matTrain,matTest,trainLabel),"test.RDS") | |
# } | |
} | |
# main(test=FALSE,createNew=TRUE,"word") | |
main(test=FALSE,createNew=TRUE,"phrase") | |
# main(test=FALSE,createNew=FALSE,"word") | |
# Train_colorNum <- rowSums(matTrain[,c("red","yellow","black","green","white","blue","purpl")]) | |
# Train_numOfWords <- rowSums(matTrain) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
preprocessing<-function(data){ | |
data$ingredients<-lapply(data$ingredients, FUN=tolower) | |
# data$ingredients<-lapply(data$ingredients, FUN=function(x) gsub("-", " ", x)) | |
# data$ingredients<-lapply(data$ingredients, FUN=function(x) gsub("_", " ", x)) | |
data$ingredients<-lapply(data$ingredients, FUN=function(x){ | |
x <- gsub("-", " ", x); | |
x <- gsub("_", " ", x); | |
x <- gsub("low fat", "low_fat", x); | |
x <- gsub("all purpose", "all_purpose", x); | |
x <- gsub("reduced fat","reduced_fat",x); | |
# x <- gsub("black pepper","black_pepper",x); | |
x <- gsub("olive oil","olive_oil",x); | |
# x <- gsub("country style","country_style",x); | |
x <- gsub("no calorie","no_calorie",x); | |
x <- gsub("no salt added","no_salt_added",x); | |
x <- gsub("greekstyl","greek style",x); | |
x <- gsub("not low fat","fat",x); | |
x <- gsub("dress","dressing",x); | |
x <- gsub("kraft","",x); | |
x <- gsub("color","colour",x); | |
# x <- gsub("ice cream","ice_cream",x); | |
x <- gsub(" ic "," ice ",x); | |
x <- gsub("lowfat","low_fat",x); | |
x <- gsub("knorr homestyl stock","",x); | |
x <- gsub("half & half","half_half",x); | |
x <- gsub("gluten free","gluten_free",x); | |
x <- gsub("long grain","long_grain",x); | |
x <- gsub("extra virgin","extra_virgin",x); | |
x <- gsub("low sodium","low_sodium",x); | |
x <- gsub("canola oil","canola_oil",x); | |
x <- gsub("five spice","five_spice",x); | |
x <- gsub("free range","free_range",x); | |
x <- gsub("fat free","fat_free",x); | |
x <- gsub("cream of coconut","coconut cream",x); | |
x <- gsub("^(.*?)®","",x); | |
x <- gsub("[^a-z_ ]", "", x); | |
x <- gsub("^ ","",x); | |
return(x); | |
}) | |
# data$ingredients<-lapply(data$ingredients, FUN=function(x) {x <- gsub("^(.*?)®","",x);x <- gsub("[^a-z_ ]", "", x);return(gsub("^ ","",y))}) | |
return(data) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment