# Code for assignment - Lalonde 3 ways
#Load library and data
library("Matching")
data("lalonde")
#Separate data into subsets based on degree status
nodegree <- subset(lalonde, nodegr == 1)
degree <- subset(lalonde, nodegr == 0)
#Scatterplots of the outcome against each predictor (plot.formula draws one panel per term)
plot(re78 ~ treat + educ + age + re74 + re75, data = nodegree, ylab = "Real earnings 1978", main = "No-degree subset")
#Linear model fitting
nodegreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = nodegree)
summary(nodegreemod)
#Estimating confidence intervals
confint(nodegreemod)
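#Optional diagnostic check (not part of the original assignment): the standard
#residual plots for a fitted lm
par(mfrow = c(2, 2))
plot(nodegreemod)
par(mfrow = c(1, 1))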
#Part 2 of the problem
#Scatterplots of the outcome against each predictor for the degree subset
plot(re78 ~ treat + educ + age + re74 + re75, data = degree, ylab = "Real earnings 1978", main = "Degree subset")
#Linear model fitting
degreemod <- lm(re78 ~ treat + educ + age + re74 + re75, data = degree)
summary(degreemod)
#Estimating confidence intervals
confint(degreemod)
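#Optional side-by-side look (not part of the original assignment): the estimated
#treatment effect in each subset, pulled from the two fitted models
rbind(no.degree = coef(nodegreemod)["treat"], with.degree = coef(degreemod)["treat"])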
#QUESTION 2
#Create the interaction variable
#Centering (an alternative construction, left commented out):
#nodegrc <- nodegr - mean(nodegr)
#treatc <- treat - mean(treat)
#creating the actual interaction term by multiplication
#degtreat <- nodegrc * treatc
#Finalize the model with the interaction term; nodegr:treat builds the same
#product column as I(nodegr*treat) but is recognized as an interaction term
#by downstream plotting functions such as sjp.int()
interactionmodel <- lm(re78 ~ treat + educ + age + re74 + re75 + nodegr:treat, data = lalonde)
summary(interactionmodel)
#Confidence interval
confint(interactionmodel, level = 0.95)
#sim() from the arm package simulates uncertainty in the fitted coefficients
library(arm)
interactionmodelsim <- sim(interactionmodel)
coef(interactionmodelsim)
#Column-wise summaries of the simulated coefficient draws
meanofcoeff <- apply(coef(interactionmodelsim), 2, mean) #means of the simulated coefficients
sdofcoeff <- apply(coef(interactionmodelsim), 2, sd) #standard deviations of the simulated coefficients
#An approximate 95 percent confidence interval lies 1.96 SD either side of the mean, therefore:
low.bound <- meanofcoeff - 1.96 * sdofcoeff
low.bound
upper.bound <- meanofcoeff + 1.96 * sdofcoeff
upper.bound
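#Quick comparison (optional, not part of the original assignment): analytic CIs
#from confint() alongside the simulation-based bounds computed above; the rows
#line up because coef(sim(...)) keeps the model's coefficient order
cbind(confint(interactionmodel), sim.low = low.bound, sim.high = upper.bound)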
#Data visualization
#install.packages("sjPlot")
library(sjPlot)
#Note: sjp.lm() and sjp.int() come from older sjPlot releases; newer versions
#replace them with plot_model()
sjp.lm(interactionmodel, show.loess.ci = TRUE, show.values = TRUE, show.summary = TRUE)
sjp.int(interactionmodel, show.values = TRUE, show.ci = TRUE)
#QUESTION 3
#Since we will be running a logistic regression, we need a binary outcome:
#u78 = 1 if real earnings in 1978 were positive, 0 otherwise
lalonde$u78 <- as.numeric(lalonde$re78 > 0)
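#Sanity check (optional, not part of the original assignment): confirm the
#0/1 coding against the raw earnings variable
table(u78 = lalonde$u78, positive.earnings = lalonde$re78 > 0)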
#Separating data into subsets based on degree status (after u78 has been added)
logistic.nodegree <- subset(lalonde, nodegr == 1)
logistic.degree <- subset(lalonde, nodegr == 0)
#Running logistic regression when nodegree = true
logisticmod1 <- glm(u78 ~ treat + educ + age + re74 + re75, data = logistic.nodegree, family = binomial)
summary(logisticmod1)
confint(logisticmod1)
#Logistic regression where nodegree = false
logisticmod2 <- glm(u78 ~ treat + educ + age + re74 + re75, data = logistic.degree, family = binomial)
summary(logisticmod2)
confint(logisticmod2)
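#Optional sketch (not part of the original assignment): exponentiating the
#logit coefficients and their CIs puts them on the more readable odds-ratio scale
exp(cbind(OR = coef(logisticmod1), confint(logisticmod1)))
exp(cbind(OR = coef(logisticmod2), confint(logisticmod2)))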
#PROBLEM 4
#Guidance from https://datascienceplus.com/random-forests-in-r/
library(randomForest)
set.seed(1)
#check dimensions for lalonde (now 13 columns: the 12 original variables plus u78)
dim(lalonde)
#The random forest will incorporate all predictors except u78, which was derived
#from re78 above and would leak the outcome into the model
#Separating training and test sets (note that sampling with replacement means
#some rows appear more than once in the training index)
trainset <- sample(1:nrow(lalonde), 300, replace = TRUE)
lalonde.rf <- randomForest(re78 ~ . - u78, data = lalonde, subset = trainset, importance = TRUE)
lalonde.rf
plot(lalonde.rf)
#Comparing out-of-bag sample errors and test-set errors
oob.err <- double(11) #11 predictors remain once re78 (the outcome) and u78 are set aside
test.err <- double(11)
#mtry is the number of variables randomly chosen at each split; we want the value with the least error
for (mtry in 1:11) {
  rf <- randomForest(re78 ~ . - u78, data = lalonde, subset = trainset, mtry = mtry, ntree = 300)
  oob.err[mtry] <- rf$mse[300] #out-of-bag MSE after all 300 trees are fitted
  pred <- predict(rf, lalonde[-trainset, ]) #predictions on the held-out rows
  test.err[mtry] <- with(lalonde[-trainset, ], mean((re78 - pred)^2)) #mean squared test error
  cat(mtry, " ") #printing progress to the console
}
test.err
oob.err
matplot(1:11, cbind(oob.err, test.err), pch = 19, col = c("red", "blue"), type = "b", ylab = "Mean Squared Error", xlab = "Number of Predictors Considered at each Split")
legend("topright", legend = c("Out of Bag Error", "Test Error"), pch = 19, col = c("red", "blue"))
varImpPlot(lalonde.rf, main = "Trained forest Variable Importance Plot")
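#Optional follow-up (not part of the original assignment): the numeric importance
#scores behind the plot, and the mtry value that minimized the test error above
importance(lalonde.rf)
which.min(test.err)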