# Code for assignment - Lalonde 3 ways
#Load library and data
library("Matching")
data("lalonde")
#Separate data into subsets based on degree status
nodegree <- subset(lalonde, nodegr == 1)
degree <- subset(lalonde, nodegr == 0)
#Scatterplots of the outcome against each predictor (plot.formula draws one panel per term)
plot(re78 ~ treat + educ + age + re74 + re75, data = nodegree, ylab = "Real earnings 1978", main = "No-degree subset")
#Linear model fitting
nodegreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = nodegree)
summary(nodegreemod)
#Estimating confidence intervals
confint(nodegreemod)
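#Optional diagnostic check (not part of the original assignment): the standard
#residual plots for a fitted lm
par(mfrow = c(2, 2))
plot(nodegreemod)
par(mfrow = c(1, 1))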
#Part 2 of the problem
#Scatterplots of the outcome against each predictor for the degree subset
plot(re78 ~ treat + educ + age + re74 + re75, data = degree, ylab = "Real earnings 1978", main = "Degree subset")
#Linear model fitting
degreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = degree)
summary(degreemod)
#Estimating confidence intervals
confint(degreemod)
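#Optional side-by-side look (not part of the original assignment): the estimated
#treatment effect in each subset, pulled from the two fitted models
rbind(no.degree = coef(nodegreemod)["treat"], with.degree = coef(degreemod)["treat"])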
#QUESTION 2
#Create the interaction variable
#Centering (an alternative construction, left commented out):
#nodegrc <- nodegr - mean(nodegr)
#treatc <- treat - mean(treat)
#creating the actual interaction term by multiplication
#degtreat <- nodegrc * treatc
#Finalize the model with the interaction term; nodegr:treat builds the same
#product column as I(nodegr*treat) but is recognized as an interaction term
#by downstream plotting functions such as sjp.int()
interactionmodel <- lm(re78 ~ treat + educ + age + re74 + re75 + nodegr:treat, data = lalonde)
summary(interactionmodel)
#Confidence interval
confint(interactionmodel, level = 0.95)
#sim() from the arm package simulates uncertainty in the fitted coefficients
library(arm)
interactionmodelsim <- sim(interactionmodel)
coef(interactionmodelsim)
#Column-wise summaries of the simulated coefficient draws
meanofcoeff <- apply(coef(interactionmodelsim), 2, mean) #means of the simulated coefficients
sdofcoeff <- apply(coef(interactionmodelsim), 2, sd) #standard deviations of the simulated coefficients
#An approximate 95 percent confidence interval lies 1.96 SD either side of the mean, therefore:
low.bound <- meanofcoeff - 1.96 * sdofcoeff
low.bound
upper.bound <- meanofcoeff + 1.96 * sdofcoeff
upper.bound
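#Quick comparison (optional, not part of the original assignment): analytic CIs
#from confint() alongside the simulation-based bounds computed above; the rows
#line up because coef(sim(...)) keeps the model's coefficient order
cbind(confint(interactionmodel), sim.low = low.bound, sim.high = upper.bound)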
#Data visualization
#install.packages("sjPlot")
library(sjPlot)
#Note: sjp.lm() and sjp.int() come from older sjPlot releases; newer versions
#replace them with plot_model()
sjp.lm(interactionmodel, show.loess.ci = TRUE, show.values = TRUE, show.summary = TRUE)
sjp.int(interactionmodel, show.values = TRUE, show.ci = TRUE)
#QUESTION 3
#Since we will be running a logistic regression, we need a binary outcome:
#u78 = 1 if real earnings in 1978 were positive, 0 otherwise
lalonde$u78 <- as.numeric(lalonde$re78 > 0)
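#Sanity check (optional, not part of the original assignment): confirm the
#0/1 coding against the raw earnings variable
table(u78 = lalonde$u78, positive.earnings = lalonde$re78 > 0)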
#Separating data into subsets based on degree status (after u78 has been added)
logistic.nodegree <- subset(lalonde, nodegr == 1)
logistic.degree <- subset(lalonde, nodegr == 0)
#Running logistic regression when nodegree = true
logisticmod1 <- glm(u78 ~ treat + educ + age + re74 + re75, data = logistic.nodegree, family = binomial)
summary(logisticmod1)
confint(logisticmod1)
#Logistic regression where nodegree = false
logisticmod2 <- glm(u78 ~ treat + educ + age + re74 + re75, data = logistic.degree, family = binomial)
summary(logisticmod2)
confint(logisticmod2)
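#Optional sketch (not part of the original assignment): exponentiating the
#logit coefficients and their CIs puts them on the more readable odds-ratio scale
exp(cbind(OR = coef(logisticmod1), confint(logisticmod1)))
exp(cbind(OR = coef(logisticmod2), confint(logisticmod2)))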
#PROBLEM 4
#Guidance from https://datascienceplus.com/random-forests-in-r/
library(randomForest)
set.seed(1)
#check dimensions for lalonde (now 13 columns: the 12 original variables plus u78)
dim(lalonde)
#The random forest will incorporate all predictors except u78, which was derived
#from re78 above and would leak the outcome into the model
#Separating training and test sets (note that sampling with replacement means
#some rows appear more than once in the training index)
trainset <- sample(1:nrow(lalonde), 300, replace = TRUE)
lalonde.rf <- randomForest(re78 ~ . - u78, data = lalonde, subset = trainset, importance = TRUE)
lalonde.rf
plot(lalonde.rf)
#Comparing out-of-bag sample errors and test-set errors
oob.err <- double(11) #11 predictors remain once re78 (the outcome) and u78 are set aside
test.err <- double(11)
#mtry is the number of variables randomly chosen at each split; we want the value with the least error
for (mtry in 1:11) {
  rf <- randomForest(re78 ~ . - u78, data = lalonde, subset = trainset, mtry = mtry, ntree = 300)
  oob.err[mtry] <- rf$mse[300] #out-of-bag MSE after all 300 trees are fitted
  pred <- predict(rf, lalonde[-trainset, ]) #predictions on the held-out rows
  test.err[mtry] <- with(lalonde[-trainset, ], mean((re78 - pred)^2)) #mean squared test error
  cat(mtry, " ") #printing progress to the console
}
test.err
oob.err
matplot(1:11, cbind(oob.err, test.err), pch = 19, col = c("red", "blue"), type = "b", ylab = "Mean Squared Error", xlab = "Number of Predictors Considered at each Split")
legend("topright", legend = c("Out of Bag Error", "Test Error"), pch = 19, col = c("red", "blue"))
varImpPlot(lalonde.rf, main = "Trained forest Variable Importance Plot")
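#Optional follow-up (not part of the original assignment): the numeric importance
#scores behind the plot, and the mtry value that minimized the test error above
importance(lalonde.rf)
which.min(test.err)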