Created
August 22, 2016 02:25
-
-
Save bennytowns/8741bbf4898f0516fb70c0017b13be60 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stats) | |
library(VIM) | |
library(mice) | |
library(car) | |
library(Hmisc) | |
library(dplyr) | |
setwd('~/R') | |
#import and prepare the master data. | |
full_golf=read.csv('full_golf_new.csv',strip.white=T,stringsAsFactors=FALSE) | |
row.names(full_golf)=paste0(full_golf$name,full_golf$year) | |
full_golf=full_golf[,-which(names(full_golf) %in% c('X','name'))] | |
golf_fedex = full_golf[is.na(full_golf$rk_fedex)==F,] | |
golf_fedex$rk_fedex=golf_fedex$rk_fedex/golf_fedex$rk_fedex_variable2 | |
summary(golf_fedex) | |
aggr(golf_fedex) #Lots of missing data, but is it relevant data? Get a good subset of data first | |
######################################################################################################## | |
################################Shots Gained v fedex Earned############################################# | |
######################################################################################################## | |
#Prepare the dataset | |
sg_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')] | |
sg_test=golf_fedex[golf_fedex$year==2016,c('sg_ott','sg_aptg','sg_artg','sg_putt')] | |
sg_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','sg_ott','sg_aptg','sg_artg','sg_putt')] | |
sg_test_actual=sg_test_actual[which(complete.cases(sg_test_actual)),] | |
aggr(sg_data) | |
md.pattern(sg_data) #Missing Predictor Variables... not really imputable, so we won't test these, fair to drop | |
#because this data only recorded for top 200 players, the others will be OUTSIDE of the other data | |
sg_data_complete=sg_data[which(complete.cases(sg_data)),] | |
sg_test_complete=sg_test[which(complete.cases(sg_test)),] | |
aggr(sg_data_complete) | |
summary(sg_data_complete) #Data is prepped for testing | |
#Fit a multiple linear regression | |
sg.saturated=lm(rk_fedex~.,sg_data_complete) | |
summary(sg.saturated) | |
vif(sg.saturated) | |
avPlots(sg.saturated) | |
plot(sg.saturated) | |
#Better Box-Cox transform this | |
sg_sat.bc=boxCox(sg.saturated) | |
lambda = sg_sat.bc$x[which(sg_sat.bc$y == max(sg_sat.bc$y))] | |
sg_data_complete$rk_fedex.bc = (sg_data_complete$rk_fedex^lambda - 1)/lambda | |
sg_sat_mod.bc=lm(rk_fedex.bc~sg_ott +sg_aptg +sg_artg +sg_putt,sg_data_complete) | |
summary(sg_sat_mod.bc) | |
plot(sg_sat_mod.bc) | |
vif(sg_sat_mod.bc) | |
avPlots(sg_sat_mod.bc) | |
#Assumptions look good | |
#Looks like multiple linear regression model may hold. | |
#But, let's see if we need all of these variables... | |
#prepare less fit models | |
model.empty=lm(rk_fedex.bc~1,sg_data_complete) | |
model.full=lm(rk_fedex.bc~.-rk_fedex,sg_data_complete) | |
scope=list(lower = formula(model.empty), upper = formula(model.full)) | |
forwardAIC = step(model.empty, scope, direction = "forward", k = 2) | |
backwardAIC = step(model.full, scope, direction = "backward", k = 2) | |
bothAIC.empty = step(model.empty, scope, direction = "both", k = 2) | |
bothAIC.full = step(model.full, scope, direction = "both", k = 2) | |
#Stepwise regression using BIC as the criteria (the penalty k = log(n)). | |
forwardBIC = step(model.empty, scope, direction = "forward", k = log(196)) | |
backwardBIC = step(model.full, scope, direction = "backward", k = log(196)) | |
bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196)) | |
bothBIC.full = step(model.full, scope, direction = "both", k = log(196)) | |
#Looks like combination of the four stats will work fine, let's predict 2016 output | |
fedex_predict_2016=predict(sg_sat_mod.bc,sg_test_complete,interval='confidence') | |
sg_test_actual$rk_fedex.bc = (sg_test_actual$rk_fedex^lambda - 1)/lambda | |
sg_test_actual=cbind(sg_test_actual,fedex_predict_2016) | |
sg_test_actual$fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$fit | |
sg_test_actual$error_sq=sg_test_actual$fit_error**2 | |
sg_total_error=sum(sg_test_actual$error_sq) | |
#LET'S MAKE OUR OWN MODEL | |
######################################################################################################## | |
##############################Other Variables v fedex Earned############################################ | |
######################################################################################################## | |
#prepare a data set | |
raw_data=golf_fedex[golf_fedex$year!=2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')] | |
raw_test=golf_fedex[golf_fedex$year==2016,c('drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')] | |
raw_test_actual=golf_fedex[golf_fedex$year==2016,c('rk_fedex','drd','dra','gir','ssv','scr','pth','pthatg','pmd','ppr')] | |
aggr(raw_data) #Again, the lower-ranked golfers, so hard to impute, since they all fall outside of the data ranges. | |
raw_data=raw_data[which(complete.cases(raw_data)),] | |
raw_test_complete=raw_test[which(complete.cases(raw_test)),] | |
aggr(raw_data) | |
#fit multiple linear model on all data | |
td_mod = lm(rk_fedex~.,raw_data) | |
summary(td_mod) | |
td_mod_summary = summary(td_mod) | |
avPlots(td_mod) | |
plot(td_mod) #Definitely some relationship, but it's not really "linear", maybe we can make it so with box-cox transform | |
#First, let's build a model with reduced variable set and see how that looks. | |
##########################MAKE MY OWN REDUCED MODEL################################ | |
td_mod_red = lm(rk_fedex~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data) | |
summary(td_mod_red) | |
plot(td_mod_red) | |
vif(td_mod_red) | |
#Box Cox transform | |
td_mod.bc=boxCox(td_mod) | |
lambda = td_mod.bc$x[which(td_mod.bc$y == max(td_mod.bc$y))] | |
raw_data$rk_fedex.bc = (raw_data$rk_fedex^lambda - 1)/lambda | |
td_mod.bc=lm(rk_fedex.bc~.-rk_fedex,raw_data) | |
summary(td_mod.bc) | |
plot(td_mod.bc) #Looks like a reasonable model, let's try a reduced model based on significant variables | |
#Test Box Cox transformed dependent variable on the reduced variable set | |
td_mod_red.bc=lm(rk_fedex.bc~drd+dra+gir+ssv+pth+pthatg+pmd+ppr,raw_data) | |
summary(td_mod_red.bc) | |
plot(td_mod_red.bc) | |
vif(td_mod_red.bc) | |
avPlots(td_mod_red.bc) | |
#Looks like a decent model, but let's see if there's a "best model" | |
#Check full vs reduced | |
AIC(td_mod_red.bc,td_mod.bc) | |
BIC(td_mod_red.bc,td_mod.bc) | |
#AIC and BIC don't show a clear advantage | |
#Alternatively, let's do a stepwise regression and see what set of variables is identified | |
model.empty=lm(rk_fedex.bc~1,raw_data) | |
model.full=lm(rk_fedex.bc~.-rk_fedex,raw_data) | |
scope=list(lower = formula(model.empty), upper = formula(model.full)) | |
forwardAIC = step(model.empty, scope, direction = "forward", k = 2) | |
backwardAIC = step(model.full, scope, direction = "backward", k = 2) | |
bothAIC.empty = step(model.empty, scope, direction = "both", k = 2) | |
bothAIC.full = step(model.full, scope, direction = "both", k = 2) | |
#Stepwise regression using BIC as the criteria (the penalty k = log(n)). | |
forwardBIC = step(model.empty, scope, direction = "forward", k = log(196)) | |
backwardBIC = step(model.full, scope, direction = "backward", k = log(196)) | |
bothBIC.empty = step(model.empty, scope, direction = "both", k = log(196)) | |
bothBIC.full = step(model.full, scope, direction = "both", k = log(196)) | |
#BIC identifies a subset, similar to the SG model statistics but includes some overlapping areas, dra, pmd | |
#after reviewing VIF, we can safely remove dra and pmd on concerns about collinearity. Resulting variable set follows: | |
td_mod_red2.bc=lm(rk_fedex.bc~drd+gir+pthatg+ppr,raw_data) | |
summary(td_mod_red2.bc) | |
#Looks like resonably good model, but check assumptions | |
plot(td_mod_red2.bc) | |
vif(td_mod_red2.bc) | |
avPlots(td_mod_red2.bc) | |
###Which Model is best? | |
AIC(td_mod_red2.bc,sg_sat_mod.bc) | |
BIC(td_mod_red2.bc,sg_sat_mod.bc) | |
#Test to see how well it predicts | |
raw_mod.predicrat=data.frame(predict(td_mod_red2.bc,raw_test_complete,interval='confidence')) | |
summary(raw_mod.predict) | |
sg_test_actual$raw_mod_predict=raw_mod.predict$fit | |
sg_test_actual$raw_fit_error=sg_test_actual$rk_fedex.bc-sg_test_actual$raw_mod_predict | |
sg_test_actual$raw_error_sq = sg_test_actual$raw_fit_error**2 | |
#Compare the two models | |
mean(sg_test_actual$raw_error_sq) | |
sd(sg_test_actual$raw_error_sq) | |
mean(sg_test_actual$error_sq) | |
sd(sg_test_actual$error_sq) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment