bennytowns · August 22, 2016 02:25
diff --git a/golf_fedex_more.R b/golf_fedex_more.R
 library(stats)
 library(VIM)
 library(mice)
 library(car)
 library(Hmisc)
 library(dplyr)
 # Work from the directory that holds the CSV read below.
 # NOTE(review): a hard-coded setwd() ties the script to one machine layout;
 # kept because the relative read.csv() path depends on it.
 setwd('~/R')

 # Import and prepare the master data.
 full_golf <- read.csv('full_golf_new.csv', strip.white = TRUE, stringsAsFactors = FALSE)
 # Key each row by name + year so rows stay identifiable once 'name' is dropped.
 row.names(full_golf) <- paste0(full_golf$name, full_golf$year)
 # Drop the 'X' and 'name' columns with a logical mask rather than
 # `-which(...)`: a negative index built from an empty which() selects zero
 # columns when neither name is present. drop = FALSE keeps a data frame.
 full_golf <- full_golf[, !(names(full_golf) %in% c('X', 'name')), drop = FALSE]
 # Keep only the rows that have a FedEx ranking.
 golf_fedex <- full_golf[!is.na(full_golf$rk_fedex), ]
 # Rescale the ranking by the rk_fedex_variable2 column.
 golf_fedex$rk_fedex <- golf_fedex$rk_fedex / golf_fedex$rk_fedex_variable2
 summary(golf_fedex)
 aggr(golf_fedex) # Lots of missing data, but is it relevant data?  Get a good subset of data first

 ########################################################################################################
 ################################Shots Gained v fedex Earned#############################################
 ########################################################################################################

 # Prepare the dataset: every year except 2016 is used for fitting, 2016 is
 # held out for testing.
 sg_predictors <- c('sg_ott', 'sg_aptg', 'sg_artg', 'sg_putt')
 sg_columns <- c('rk_fedex', sg_predictors)
 sg_is_2016 <- golf_fedex$year == 2016
 sg_data <- golf_fedex[!sg_is_2016, sg_columns]
 sg_test <- golf_fedex[sg_is_2016, sg_predictors]
 # Same 2016 rows as sg_test, but with the observed rk_fedex kept alongside.
 sg_test_actual <- golf_fedex[sg_is_2016, sg_columns]
 sg_test_actual <- sg_test_actual[complete.cases(sg_test_actual), ]
 aggr(sg_data)
 md.pattern(sg_data) # Missing Predictor Variables... not really imputable, so we won't test these, fair to drop
                     # because this data only recorded for top 200 players, the others will be OUTSIDE of the other data
 # Keep only fully observed rows in both the fitting and the test frames.
 sg_data_complete <- sg_data[complete.cases(sg_data), ]
 sg_test_complete <- sg_test[complete.cases(sg_test), ]
 aggr(sg_data_complete)
 summary(sg_data_complete) # Data is prepped for testing

 # Fit a multiple linear regression of rk_fedex on all four sg_* columns.
 sg.saturated <- lm(rk_fedex ~ ., data = sg_data_complete)
 summary(sg.saturated)
 vif(sg.saturated)
 avPlots(sg.saturated)
 plot(sg.saturated)

 # Better Box-Cox transform this
 sg_sat.bc <- boxCox(sg.saturated)
 # Take the lambda at the highest point of the boxCox() curve. which.max()
 # always returns a single index, so lambda stays a scalar even if several
 # grid points tie for the maximum.
 lambda <- sg_sat.bc$x[which.max(sg_sat.bc$y)]
 # Box-Cox transform of the response with the chosen lambda.
 sg_data_complete$rk_fedex.bc <- (sg_data_complete$rk_fedex^lambda - 1) / lambda
 sg_sat_mod.bc <- lm(rk_fedex.bc ~ sg_ott + sg_aptg + sg_artg + sg_putt, data = sg_data_complete)
 summary(sg_sat_mod.bc)
 plot(sg_sat_mod.bc)
 vif(sg_sat_mod.bc)
 avPlots(sg_sat_mod.bc)
 # Assumptions look good
 # Looks like multiple linear regression model may hold.
 # But, let's see if we need all of these variables...

 # Bounds for the stepwise search: intercept-only model up to the model with
 # every predictor (rk_fedex is excluded because rk_fedex.bc is derived from it).
 model.empty <- lm(rk_fedex.bc ~ 1, data = sg_data_complete)
 model.full <- lm(rk_fedex.bc ~ . - rk_fedex, data = sg_data_complete)
 scope <- list(lower = formula(model.empty), upper = formula(model.full))

 # Stepwise regression using AIC as the criterion (penalty k = 2).
 forwardAIC <- step(model.empty, scope, direction = "forward", k = 2)
 backwardAIC <- step(model.full, scope, direction = "backward", k = 2)
 bothAIC.empty <- step(model.empty, scope, direction = "both", k = 2)
 bothAIC.full <- step(model.full, scope, direction = "both", k = 2)

 # Stepwise regression using BIC as the criteria (the penalty k = log(n)).
 # n is read from the fitted model so the penalty follows the data actually
 # used instead of a hard-coded observation count.
 sg_bic_k <- log(nobs(model.full))
 forwardBIC <- step(model.empty, scope, direction = "forward", k = sg_bic_k)
 backwardBIC <- step(model.full, scope, direction = "backward", k = sg_bic_k)
 bothBIC.empty <- step(model.empty, scope, direction = "both", k = sg_bic_k)
 bothBIC.full <- step(model.full, scope, direction = "both", k = sg_bic_k)

 # Looks like combination of the four stats will work fine, let's predict 2016 output.
 # Predict directly on sg_test_actual (it carries the same predictor columns)
 # so each prediction is guaranteed to sit on the row holding its observed
 # rk_fedex. Predicting on sg_test_complete and cbind()-ing the result relies
 # on two separately filtered frames having identical rows.
 fedex_predict_2016 <- predict(sg_sat_mod.bc, sg_test_actual, interval = 'confidence')
 # Put the observed response on the same Box-Cox scale the model was fitted on.
 sg_test_actual$rk_fedex.bc <- (sg_test_actual$rk_fedex^lambda - 1) / lambda
 sg_test_actual <- cbind(sg_test_actual, fedex_predict_2016)
 # Per-row error and squared error on the transformed scale.
 sg_test_actual$fit_error <- sg_test_actual$rk_fedex.bc - sg_test_actual$fit
 sg_test_actual$error_sq <- sg_test_actual$fit_error^2
 sg_total_error <- sum(sg_test_actual$error_sq)


 #LET'S MAKE OUR OWN MODEL

 ########################################################################################################
 ##############################Other Variables v fedex Earned############################################
 ########################################################################################################

 # Prepare a data set from the non-sg statistics: every year except 2016 is
 # used for fitting, 2016 is held out for testing.
 raw_predictors <- c('drd', 'dra', 'gir', 'ssv', 'scr', 'pth', 'pthatg', 'pmd', 'ppr')
 raw_columns <- c('rk_fedex', raw_predictors)
 raw_is_2016 <- golf_fedex$year == 2016
 raw_data <- golf_fedex[!raw_is_2016, raw_columns]
 raw_test <- golf_fedex[raw_is_2016, raw_predictors]
 raw_test_actual <- golf_fedex[raw_is_2016, raw_columns]
 aggr(raw_data) # Again, the lower-ranked golfers, so hard to impute, since they all fall outside of the data ranges.
 # Keep only fully observed rows.
 raw_data <- raw_data[complete.cases(raw_data), ]
 raw_test_complete <- raw_test[complete.cases(raw_test), ]
 aggr(raw_data)

 # Baseline: regress the untransformed rk_fedex on every column of raw_data.
 td_mod <- lm(rk_fedex ~ ., data = raw_data)
 td_mod_summary <- summary(td_mod)
 td_mod_summary
 avPlots(td_mod)
 plot(td_mod) # Definitely some relationship, but it's not really "linear", maybe we can make it so with box-cox transform
 # First, let's build a model with reduced variable set and see how that looks.

 ########################## MAKE MY OWN REDUCED MODEL ################################
 # Same predictors as td_mod except scr.
 td_mod_red <- lm(rk_fedex ~ drd + dra + gir + ssv + pth + pthatg + pmd + ppr, data = raw_data)
 summary(td_mod_red)
 plot(td_mod_red)
 vif(td_mod_red)

 # Box Cox transform
 td_mod.bc <- boxCox(td_mod)
 # Take the lambda at the highest point of the boxCox() curve. which.max()
 # always returns a single index, so lambda stays a scalar even if several
 # grid points tie for the maximum.
 lambda <- td_mod.bc$x[which.max(td_mod.bc$y)]
 raw_data$rk_fedex.bc <- (raw_data$rk_fedex^lambda - 1) / lambda
 # td_mod.bc is reused: from here on it holds the model fitted to the
 # transformed response (rk_fedex itself is excluded from the predictors).
 td_mod.bc <- lm(rk_fedex.bc ~ . - rk_fedex, data = raw_data)
 summary(td_mod.bc)
 plot(td_mod.bc) # Looks like a reasonable model, let's try a reduced model based on significant variables

 # Refit the reduced predictor set against the Box-Cox transformed response.
 td_mod_red.bc <- lm(rk_fedex.bc ~ drd + dra + gir + ssv + pth + pthatg + pmd + ppr, data = raw_data)
 summary(td_mod_red.bc)
 plot(td_mod_red.bc)
 vif(td_mod_red.bc)
 avPlots(td_mod_red.bc)
 # Looks like a decent model, but let's see if there's a "best model"

 # Check full vs reduced on both information criteria.
 AIC(td_mod_red.bc, td_mod.bc)
 BIC(td_mod_red.bc, td_mod.bc)
 # AIC and BIC don't show a clear advantage

 # Alternatively, let's do a stepwise regression and see what set of variables is identified.
 # Bounds for the search: intercept-only model up to the model with every
 # predictor (rk_fedex is excluded because rk_fedex.bc is derived from it).
 model.empty <- lm(rk_fedex.bc ~ 1, data = raw_data)
 model.full <- lm(rk_fedex.bc ~ . - rk_fedex, data = raw_data)
 scope <- list(lower = formula(model.empty), upper = formula(model.full))

 # Stepwise regression using AIC as the criterion (penalty k = 2).
 forwardAIC <- step(model.empty, scope, direction = "forward", k = 2)
 backwardAIC <- step(model.full, scope, direction = "backward", k = 2)
 bothAIC.empty <- step(model.empty, scope, direction = "both", k = 2)
 bothAIC.full <- step(model.full, scope, direction = "both", k = 2)

 # Stepwise regression using BIC as the criteria (the penalty k = log(n)).
 # n is read from the fitted model so the penalty follows the data actually
 # used instead of a hard-coded observation count.
 raw_bic_k <- log(nobs(model.full))
 forwardBIC <- step(model.empty, scope, direction = "forward", k = raw_bic_k)
 backwardBIC <- step(model.full, scope, direction = "backward", k = raw_bic_k)
 bothBIC.empty <- step(model.empty, scope, direction = "both", k = raw_bic_k)
 bothBIC.full <- step(model.full, scope, direction = "both", k = raw_bic_k)

 # BIC identifies a subset, similar to the SG model statistics but includes some overlapping areas, dra, pmd.
 # After reviewing VIF, we can safely remove dra and pmd on concerns about collinearity.  Resulting variable set follows:
 td_mod_red2.bc <- lm(rk_fedex.bc ~ drd + gir + pthatg + ppr, data = raw_data)
 summary(td_mod_red2.bc)

 # Looks like a reasonably good model, but check assumptions
 plot(td_mod_red2.bc)
 vif(td_mod_red2.bc)
 avPlots(td_mod_red2.bc)

 ### Which model is best?
 # NOTE(review): the two models are fitted on different data frames and on
 # responses transformed with separately estimated lambdas; AIC/BIC are only
 # comparable for the same response on the same rows -- confirm before relying
 # on this comparison.
 AIC(td_mod_red2.bc, sg_sat_mod.bc)
 BIC(td_mod_red2.bc, sg_sat_mod.bc)

 # Test to see how well it predicts.
 # The predictions are stored under the name the following lines read
 # (raw_mod.predict).
 raw_mod.predict <- data.frame(predict(td_mod_red2.bc, raw_test_complete, interval = 'confidence'))
 summary(raw_mod.predict)
 # Line predictions up with sg_test_actual by row name instead of by position:
 # the two frames are filtered on different columns, so their rows need not
 # coincide. Rows without a prediction get NA.
 raw_rows <- match(row.names(sg_test_actual), row.names(raw_mod.predict))
 sg_test_actual$raw_mod_predict <- raw_mod.predict$fit[raw_rows]
 # td_mod_red2.bc was fitted on rk_fedex transformed with the current lambda
 # (the one estimated for the raw-statistics model), so the observed values are
 # transformed with that lambda here. sg_test_actual$rk_fedex.bc was built with
 # the SG model's lambda and is on a different scale.
 sg_test_actual$raw_rk_fedex.bc <- (sg_test_actual$rk_fedex^lambda - 1) / lambda
 sg_test_actual$raw_fit_error <- sg_test_actual$raw_rk_fedex.bc - sg_test_actual$raw_mod_predict
 sg_test_actual$raw_error_sq <- sg_test_actual$raw_fit_error^2

 # Compare the two models.
 # NOTE(review): the two squared-error columns live on different Box-Cox
 # scales unless both lambdas coincide -- confirm before comparing magnitudes.
 mean(sg_test_actual$raw_error_sq, na.rm = TRUE)
 sd(sg_test_actual$raw_error_sq, na.rm = TRUE)
 mean(sg_test_actual$error_sq)
 sd(sg_test_actual$error_sq)
	library(stats)
	library(VIM)
	library(mice)
	library(car)
	library(Hmisc)
	library(dplyr)
	# Work from the directory that holds the CSV read below.
	# NOTE(review): a hard-coded setwd() ties the script to one machine layout;
	# kept because the relative read.csv() path depends on it.
	setwd('~/R')

	# Import and prepare the master data.
	full_golf <- read.csv('full_golf_new.csv', strip.white = TRUE, stringsAsFactors = FALSE)
	# Key each row by name + year so rows stay identifiable once 'name' is dropped.
	row.names(full_golf) <- paste0(full_golf$name, full_golf$year)
	# Drop the 'X' and 'name' columns with a logical mask rather than
	# `-which(...)`: a negative index built from an empty which() selects zero
	# columns when neither name is present. drop = FALSE keeps a data frame.
	full_golf <- full_golf[, !(names(full_golf) %in% c('X', 'name')), drop = FALSE]
	# Keep only the rows that have a FedEx ranking.
	golf_fedex <- full_golf[!is.na(full_golf$rk_fedex), ]
	# Rescale the ranking by the rk_fedex_variable2 column.
	golf_fedex$rk_fedex <- golf_fedex$rk_fedex / golf_fedex$rk_fedex_variable2
	summary(golf_fedex)
	aggr(golf_fedex) # Lots of missing data, but is it relevant data? Get a good subset of data first

	########################################################################################################
	################################Shots Gained v fedex Earned#############################################
	########################################################################################################

	# Prepare the dataset: every year except 2016 is used for fitting, 2016 is
	# held out for testing.
	sg_predictors <- c('sg_ott', 'sg_aptg', 'sg_artg', 'sg_putt')
	sg_columns <- c('rk_fedex', sg_predictors)
	sg_is_2016 <- golf_fedex$year == 2016
	sg_data <- golf_fedex[!sg_is_2016, sg_columns]
	sg_test <- golf_fedex[sg_is_2016, sg_predictors]
	# Same 2016 rows as sg_test, but with the observed rk_fedex kept alongside.
	sg_test_actual <- golf_fedex[sg_is_2016, sg_columns]
	sg_test_actual <- sg_test_actual[complete.cases(sg_test_actual), ]
	aggr(sg_data)
	md.pattern(sg_data) # Missing Predictor Variables... not really imputable, so we won't test these, fair to drop
	# because this data only recorded for top 200 players, the others will be OUTSIDE of the other data
	# Keep only fully observed rows in both the fitting and the test frames.
	sg_data_complete <- sg_data[complete.cases(sg_data), ]
	sg_test_complete <- sg_test[complete.cases(sg_test), ]
	aggr(sg_data_complete)
	summary(sg_data_complete) # Data is prepped for testing

	# Fit a multiple linear regression of rk_fedex on all four sg_* columns.
	sg.saturated <- lm(rk_fedex ~ ., data = sg_data_complete)
	summary(sg.saturated)
	vif(sg.saturated)
	avPlots(sg.saturated)
	plot(sg.saturated)

	# Better Box-Cox transform this
	sg_sat.bc <- boxCox(sg.saturated)
	# Take the lambda at the highest point of the boxCox() curve. which.max()
	# always returns a single index, so lambda stays a scalar even if several
	# grid points tie for the maximum.
	lambda <- sg_sat.bc$x[which.max(sg_sat.bc$y)]
	# Box-Cox transform of the response with the chosen lambda.
	sg_data_complete$rk_fedex.bc <- (sg_data_complete$rk_fedex^lambda - 1) / lambda
	sg_sat_mod.bc <- lm(rk_fedex.bc ~ sg_ott + sg_aptg + sg_artg + sg_putt, data = sg_data_complete)
	summary(sg_sat_mod.bc)
	plot(sg_sat_mod.bc)
	vif(sg_sat_mod.bc)
	avPlots(sg_sat_mod.bc)
	# Assumptions look good
	# Looks like multiple linear regression model may hold.
	# But, let's see if we need all of these variables...

	# Bounds for the stepwise search: intercept-only model up to the model with
	# every predictor (rk_fedex is excluded because rk_fedex.bc is derived from it).
	model.empty <- lm(rk_fedex.bc ~ 1, data = sg_data_complete)
	model.full <- lm(rk_fedex.bc ~ . - rk_fedex, data = sg_data_complete)
	scope <- list(lower = formula(model.empty), upper = formula(model.full))

	# Stepwise regression using AIC as the criterion (penalty k = 2).
	forwardAIC <- step(model.empty, scope, direction = "forward", k = 2)
	backwardAIC <- step(model.full, scope, direction = "backward", k = 2)
	bothAIC.empty <- step(model.empty, scope, direction = "both", k = 2)
	bothAIC.full <- step(model.full, scope, direction = "both", k = 2)

	# Stepwise regression using BIC as the criteria (the penalty k = log(n)).
	# n is read from the fitted model so the penalty follows the data actually
	# used instead of a hard-coded observation count.
	sg_bic_k <- log(nobs(model.full))
	forwardBIC <- step(model.empty, scope, direction = "forward", k = sg_bic_k)
	backwardBIC <- step(model.full, scope, direction = "backward", k = sg_bic_k)
	bothBIC.empty <- step(model.empty, scope, direction = "both", k = sg_bic_k)
	bothBIC.full <- step(model.full, scope, direction = "both", k = sg_bic_k)

	# Looks like combination of the four stats will work fine, let's predict 2016 output.
	# Predict directly on sg_test_actual (it carries the same predictor columns)
	# so each prediction is guaranteed to sit on the row holding its observed
	# rk_fedex. Predicting on sg_test_complete and cbind()-ing the result relies
	# on two separately filtered frames having identical rows.
	fedex_predict_2016 <- predict(sg_sat_mod.bc, sg_test_actual, interval = 'confidence')
	# Put the observed response on the same Box-Cox scale the model was fitted on.
	sg_test_actual$rk_fedex.bc <- (sg_test_actual$rk_fedex^lambda - 1) / lambda
	sg_test_actual <- cbind(sg_test_actual, fedex_predict_2016)
	# Per-row error and squared error on the transformed scale.
	sg_test_actual$fit_error <- sg_test_actual$rk_fedex.bc - sg_test_actual$fit
	sg_test_actual$error_sq <- sg_test_actual$fit_error^2
	sg_total_error <- sum(sg_test_actual$error_sq)


	#LET'S MAKE OUR OWN MODEL

	########################################################################################################
	##############################Other Variables v fedex Earned############################################
	########################################################################################################

	# Prepare a data set from the non-sg statistics: every year except 2016 is
	# used for fitting, 2016 is held out for testing.
	raw_predictors <- c('drd', 'dra', 'gir', 'ssv', 'scr', 'pth', 'pthatg', 'pmd', 'ppr')
	raw_columns <- c('rk_fedex', raw_predictors)
	raw_is_2016 <- golf_fedex$year == 2016
	raw_data <- golf_fedex[!raw_is_2016, raw_columns]
	raw_test <- golf_fedex[raw_is_2016, raw_predictors]
	raw_test_actual <- golf_fedex[raw_is_2016, raw_columns]
	aggr(raw_data) # Again, the lower-ranked golfers, so hard to impute, since they all fall outside of the data ranges.
	# Keep only fully observed rows.
	raw_data <- raw_data[complete.cases(raw_data), ]
	raw_test_complete <- raw_test[complete.cases(raw_test), ]
	aggr(raw_data)

	# Baseline: regress the untransformed rk_fedex on every column of raw_data.
	td_mod <- lm(rk_fedex ~ ., data = raw_data)
	td_mod_summary <- summary(td_mod)
	td_mod_summary
	avPlots(td_mod)
	plot(td_mod) # Definitely some relationship, but it's not really "linear", maybe we can make it so with box-cox transform
	# First, let's build a model with reduced variable set and see how that looks.

	########################## MAKE MY OWN REDUCED MODEL ################################
	# Same predictors as td_mod except scr.
	td_mod_red <- lm(rk_fedex ~ drd + dra + gir + ssv + pth + pthatg + pmd + ppr, data = raw_data)
	summary(td_mod_red)
	plot(td_mod_red)
	vif(td_mod_red)

	# Box Cox transform
	td_mod.bc <- boxCox(td_mod)
	# Take the lambda at the highest point of the boxCox() curve. which.max()
	# always returns a single index, so lambda stays a scalar even if several
	# grid points tie for the maximum.
	lambda <- td_mod.bc$x[which.max(td_mod.bc$y)]
	raw_data$rk_fedex.bc <- (raw_data$rk_fedex^lambda - 1) / lambda
	# td_mod.bc is reused: from here on it holds the model fitted to the
	# transformed response (rk_fedex itself is excluded from the predictors).
	td_mod.bc <- lm(rk_fedex.bc ~ . - rk_fedex, data = raw_data)
	summary(td_mod.bc)
	plot(td_mod.bc) # Looks like a reasonable model, let's try a reduced model based on significant variables

	# Refit the reduced predictor set against the Box-Cox transformed response.
	td_mod_red.bc <- lm(rk_fedex.bc ~ drd + dra + gir + ssv + pth + pthatg + pmd + ppr, data = raw_data)
	summary(td_mod_red.bc)
	plot(td_mod_red.bc)
	vif(td_mod_red.bc)
	avPlots(td_mod_red.bc)
	# Looks like a decent model, but let's see if there's a "best model"

	# Check full vs reduced on both information criteria.
	AIC(td_mod_red.bc, td_mod.bc)
	BIC(td_mod_red.bc, td_mod.bc)
	# AIC and BIC don't show a clear advantage

	# Alternatively, let's do a stepwise regression and see what set of variables is identified.
	# Bounds for the search: intercept-only model up to the model with every
	# predictor (rk_fedex is excluded because rk_fedex.bc is derived from it).
	model.empty <- lm(rk_fedex.bc ~ 1, data = raw_data)
	model.full <- lm(rk_fedex.bc ~ . - rk_fedex, data = raw_data)
	scope <- list(lower = formula(model.empty), upper = formula(model.full))

	# Stepwise regression using AIC as the criterion (penalty k = 2).
	forwardAIC <- step(model.empty, scope, direction = "forward", k = 2)
	backwardAIC <- step(model.full, scope, direction = "backward", k = 2)
	bothAIC.empty <- step(model.empty, scope, direction = "both", k = 2)
	bothAIC.full <- step(model.full, scope, direction = "both", k = 2)

	# Stepwise regression using BIC as the criteria (the penalty k = log(n)).
	# n is read from the fitted model so the penalty follows the data actually
	# used instead of a hard-coded observation count.
	raw_bic_k <- log(nobs(model.full))
	forwardBIC <- step(model.empty, scope, direction = "forward", k = raw_bic_k)
	backwardBIC <- step(model.full, scope, direction = "backward", k = raw_bic_k)
	bothBIC.empty <- step(model.empty, scope, direction = "both", k = raw_bic_k)
	bothBIC.full <- step(model.full, scope, direction = "both", k = raw_bic_k)

	# BIC identifies a subset, similar to the SG model statistics but includes some overlapping areas, dra, pmd.
	# After reviewing VIF, we can safely remove dra and pmd on concerns about collinearity. Resulting variable set follows:
	td_mod_red2.bc <- lm(rk_fedex.bc ~ drd + gir + pthatg + ppr, data = raw_data)
	summary(td_mod_red2.bc)

	# Looks like a reasonably good model, but check assumptions
	plot(td_mod_red2.bc)
	vif(td_mod_red2.bc)
	avPlots(td_mod_red2.bc)

	### Which model is best?
	# NOTE(review): the two models are fitted on different data frames and on
	# responses transformed with separately estimated lambdas; AIC/BIC are only
	# comparable for the same response on the same rows -- confirm before relying
	# on this comparison.
	AIC(td_mod_red2.bc, sg_sat_mod.bc)
	BIC(td_mod_red2.bc, sg_sat_mod.bc)

	# Test to see how well it predicts.
	# The predictions are stored under the name the following lines read
	# (raw_mod.predict).
	raw_mod.predict <- data.frame(predict(td_mod_red2.bc, raw_test_complete, interval = 'confidence'))
	summary(raw_mod.predict)
	# Line predictions up with sg_test_actual by row name instead of by position:
	# the two frames are filtered on different columns, so their rows need not
	# coincide. Rows without a prediction get NA.
	raw_rows <- match(row.names(sg_test_actual), row.names(raw_mod.predict))
	sg_test_actual$raw_mod_predict <- raw_mod.predict$fit[raw_rows]
	# td_mod_red2.bc was fitted on rk_fedex transformed with the current lambda
	# (the one estimated for the raw-statistics model), so the observed values are
	# transformed with that lambda here. sg_test_actual$rk_fedex.bc was built with
	# the SG model's lambda and is on a different scale.
	sg_test_actual$raw_rk_fedex.bc <- (sg_test_actual$rk_fedex^lambda - 1) / lambda
	sg_test_actual$raw_fit_error <- sg_test_actual$raw_rk_fedex.bc - sg_test_actual$raw_mod_predict
	sg_test_actual$raw_error_sq <- sg_test_actual$raw_fit_error^2

	# Compare the two models.
	# NOTE(review): the two squared-error columns live on different Box-Cox
	# scales unless both lambdas coincide -- confirm before comparing magnitudes.
	mean(sg_test_actual$raw_error_sq, na.rm = TRUE)
	sd(sg_test_actual$raw_error_sq, na.rm = TRUE)
	mean(sg_test_actual$error_sq)
	sd(sg_test_actual$error_sq)