# Kaggle House Prices: EDA, caret models, and submission files
library(ggplot2)
library(lattice)
library(caret)
library(dplyr)
# load the competition train/test splits
train <- read.csv(file="housing_prices/train.csv", header = TRUE, sep = ",")
test <- read.csv(file="housing_prices/test.csv", header = TRUE, sep = ",")
dim(train)
summary(train)
names(train)
dim(test)
# If you are interested in a variable, use some of the analyses below to see its distribution, outliers, etc.
cor(train$SalePrice, train$Fireplaces)
summary(aov(SalePrice ~ CentralAir, data=train))
plot(log(train$YearBuilt), train$YearRemodAdd)
boxplot(train$LotFrontage)
histogram(train$YearBuilt)
summary(train$Neighborhood)
# If you want to remove the outliers, you can use the filter function of dplyr.
# The cutoffs below come from visual inspection, not from any statistical analysis.
trainNoOutlier <- filter(train,
                         SalePrice < 600000,
                         LotArea < 100000,
                         GrLivArea < 4000,
                         GarageCars < 3,
                         TotalBsmtSF < 3000
)
dim(trainNoOutlier)
set.seed(314)
pp <- c("center", "scale")
ppo <- preProcess(train, "medianImpute")   # median-impute missing numeric values
train2 <- predict(ppo, train)
control <- trainControl(method="repeatedcv", number=6, repeats=10)
# glm has no tuning parameters, but you need parameter tuning for the project.
# Look at the caret examples in the class notes for tuneGrid.
# Every method has different parameters to tune. Search the Internet to find how to tune your method in caret.
glm1 <- train(log(SalePrice) ~ sqrt(TotalBsmtSF) + sqrt(X1stFlrSF) + sqrt(GarageYrBlt) + sqrt(GarageArea),
              data=train2,
              method="glm",
              preProcess=pp,
              trControl=control
)
glm1
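# For illustration only: a hedged sketch of how tuneGrid plugs into train() for a method
# that does have tuning parameters. glmnet tunes alpha and lambda; the grid values below
# are arbitrary assumptions, not recommendations. Requires the glmnet package
# (caret loads it when method="glmnet").
glmnetGrid <- expand.grid(alpha = c(0, 0.5, 1),
                          lambda = 10^seq(-4, 0, length.out = 10))
glmnet1 <- train(log(SalePrice) ~ TotalBsmtSF + X1stFlrSF + GarageArea,
                 data=train2,
                 method="glmnet",
                 preProcess=pp,
                 trControl=control,
                 tuneGrid=glmnetGrid
)
glmnet1$bestTune   # cross-validated best alpha/lambda pair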
# You can read about plotting residuals against predicted values.
# This gives you some insight into how good your model is going to be.
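# A minimal sketch of that residual check for glm1 above (assumes glm1 has been fit;
# finalModel is the underlying glm object that caret stores).
plot(fitted(glm1$finalModel), residuals(glm1$finalModel),
     xlab="Fitted log(SalePrice)", ylab="Residuals")
abline(h=0, col="red")   # residuals should scatter evenly around zero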
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
# submission
prediction <- predict(glm1, test2)
# Don't submit the predictions as the log of SalePrice.
# Kaggle also takes the log of what you submit when scoring, so you'd get a bad score.
predictionexp <- exp(prediction)
submission <- data.frame(Id = test2$Id, SalePrice = predictionexp)
write.csv(submission, file = "Mult_lm_plz_work.csv", row.names = FALSE)
library(rpart)
library(rpart.plot)
DTM1 <- rpart(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea,
              data=train,
              method="anova")
DTM1
plot(DTM1)
text(DTM1, cex=0.7) # cex scales the text so it fits on the plot
summary(DTM1)
plot(x=train$SalePrice, y=train$TotalBsmtSF)
plot(x=trainNoOutlier$SalePrice, y=trainNoOutlier$TotalBsmtSF)
first_lm <- lm(SalePrice ~ TotalBsmtSF, data=trainNoOutlier)
plot(SalePrice ~ TotalBsmtSF, data=trainNoOutlier)
abline(first_lm, col="red")   # fitted line belongs on the plot with SalePrice on the y-axis
# candidate formula for the models below:
# SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea
set.seed(100)
train_numeric <- data.frame(trainNoOutlier$SalePrice, trainNoOutlier$TotalBsmtSF) # trainNoOutlier$X1stFlrSF
train_numeric
housing_km <- kmeans(train_numeric, centers=5, nstart=20)
housing_km
# moderately ok
plot(train_numeric, col=housing_km$cluster)
housing_km$centers
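# The clusters above are dominated by SalePrice because its scale dwarfs TotalBsmtSF.
# A hedged sketch: re-run kmeans on standardized columns (same data, just z-scored),
# which usually gives more balanced clusters. Variable names here are illustrative.
train_scaled <- scale(train_numeric)
housing_km_scaled <- kmeans(train_scaled, centers=5, nstart=20)
plot(train_numeric, col=housing_km_scaled$cluster)   # plot on original units, color by cluster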
##### multiple linear regression
# pre_processed_no_outlier <- preProcess(trainNoOutlier, "medianImpute")
# ^^ returned 0?!?!?
library(RANN)
# ppo <- preProcess(trainNoOutlier, c("knnImpute"))
ppo <- preProcess(trainNoOutlier, "medianImpute")
train2 <- predict(ppo, trainNoOutlier)
housing_lm <- lm(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + MoSold + LotArea + BedroomAbvGr, data=train2)
summary(housing_lm)
housing_lm
plot(housing_lm$fitted.values, housing_lm$residuals)
qqnorm(housing_lm$residuals, ylab="Residual Quantiles")   # QQ plot of the residuals, not the fitted values
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
housing_lm_predict <- predict(housing_lm, test2)
# housing_lm_exp <- exp(housing_lm_predict)   # not needed: the model never took a log
housing_lm_predict
# submission
summary(housing_lm)
submission <- data.frame(Id = test$Id, SalePrice = housing_lm_predict)
write.csv(submission, file = "Mult_lm_plz_work4.csv", row.names = FALSE)
# random forest testing
library(hydroGOF)
train3 <- train
train3$GarageYrBlt[is.na(train$GarageYrBlt)] <- 0
train3$MasVnrArea[is.na(train$MasVnrArea)] <- 0
train3$LotFrontage[is.na(train$LotFrontage)] <- 0
# Interaction features, based on correlation
train3$year_qual <- train$YearBuilt*train$OverallQual         # year built x overall quality
train3$year_r_qual <- train$YearRemodAdd*train$OverallQual    # remodel year x overall quality
train3$qual_bsmt <- train$OverallQual*train$TotalBsmtSF       # quality x basement size
train3$livarea_qual <- train$OverallQual*train$GrLivArea      # quality x living area
train3$qual_bath <- train$OverallQual*train$FullBath          # quality x baths
train3$qual_ext <- train$OverallQual*as.numeric(factor(train$ExterCond, levels=c("Po","Fa","TA","Gd","Ex")))  # quality x exterior condition (ExterCond coded 1-5)
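# A quick sanity check of those interaction features (sketch only): correlate each new
# column with SalePrice to see whether the interaction carries useful signal.
sapply(c("year_qual", "year_r_qual", "qual_bsmt", "livarea_qual", "qual_bath", "qual_ext"),
       function(col) cor(train3[[col]], train3$SalePrice, use="complete.obs"))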
library(randomForest)
ppo <- preProcess(trainNoOutlier, "medianImpute")
train2 <- predict(ppo, trainNoOutlier)
model_1 <- randomForest(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + LotArea + BedroomAbvGr, data=train2)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
prediction <- predict(model_1, test2)
model_output <- cbind(test2, prediction)
model_output$log_prediction <- log(model_output$prediction)
# Test with RMSE on the log scale (Kaggle's metric).
# Note: the Kaggle test set has no SalePrice column, so the two lines below only work
# against data with known prices (e.g. a held-out slice of train; see the sketch below).
# model_output$log_SalePrice <- log(model_output$SalePrice)
# rmse(model_output$log_SalePrice, model_output$log_prediction)
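# A minimal held-out-validation sketch (assumed split, not part of the original workflow):
# fit the same random forest on 80% of train2 and score log-RMSE on the remaining 20%.
set.seed(42)
in_train <- createDataPartition(train2$SalePrice, p=0.8, list=FALSE)   # caret helper
rf_holdout <- randomForest(SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + LotArea + BedroomAbvGr,
                           data=train2[in_train, ])
holdout_pred <- predict(rf_holdout, train2[-in_train, ])
rmse(log(train2$SalePrice[-in_train]), log(holdout_pred))   # hydroGOF::rmse, as used above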
submission <- data.frame(Id = model_output$Id, SalePrice = model_output$prediction)
write.csv(submission, file = "Mult_lm_plz_work6.csv", row.names = FALSE)
# some new stuff with caret
# treebag
model_2 <- train(
  SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + LotArea + BedroomAbvGr,
  data=train2,
  method="treebag",
  preProcess=pp,
  trControl=control
)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_2, test2)
submission3 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission3, file = "Mult_lm_plz_work7.csv", row.names = FALSE)
# a different tree method
model_3 <- train(
  SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + LotArea,
  data=train2,
  method="blackboost",
  preProcess=pp,
  trControl=control
)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_3, test2)
# not working: the Kaggle test set has no SalePrice column, so RMSE can't be computed here
# RMSE <- sqrt(mean((test$SalePrice-prediction)^2))
submission3 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission3, file = "Mult_lm_plz_work8.csv", row.names = FALSE)
# using gbm
model_4 <- train(
  SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold,
  data=train2,
  method="gbm",
  preProcess=pp,
  trControl=control
)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_4, test2)
submission5 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission5, file = "Mult_lm_plz_work9.csv", row.names = FALSE)
### using log
ppo <- preProcess(trainNoOutlier, c("medianImpute"))
train2 <- predict(ppo, trainNoOutlier)
model_5 <- train(
  log(SalePrice) ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + OverallQual, # alternative: log(SalePrice) ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF
  data=train2,
  method="gbm",
  preProcess=pp,
  trControl=control
)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_5, test2)
predictionexp <- exp(prediction)
submission5 <- data.frame(Id = test2$Id, SalePrice = predictionexp)
write.csv(submission5, file = "Mult_lm_plz_work14.csv", row.names = FALSE)
# using a tuning grid
ppo <- preProcess(trainNoOutlier, c("medianImpute"))
train2 <- predict(ppo, trainNoOutlier)
gbmGrid <- expand.grid(interaction.depth = c(1, 3, 5, 9, 15),
                       n.trees = 1:30,
                       shrinkage = 0.1,
                       n.minobsinnode = 10)
model_6 <- train(
  SalePrice ~ TotalBsmtSF + X1stFlrSF + GarageYrBlt + GarageArea + YrSold + OverallQual, # alternative: log(SalePrice) ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF
  data=train2,
  method="gbm",
  preProcess=pp,
  trControl=control,
  tuneGrid = gbmGrid
)
ppotest <- preProcess(test, "medianImpute")
test2 <- predict(ppotest, test)
plot(test2)
prediction <- predict(model_6, test2)
# predictionexp <- exp(prediction)   # not needed: model_6 was not fit on log(SalePrice)
submission5 <- data.frame(Id = test2$Id, SalePrice = prediction)
write.csv(submission5, file = "Mult_lm_plz_work17.csv", row.names = FALSE)