janfait · April 5, 2018 14:33
diff --git a/bm__tmob_segment.R b/bm__tmob_segment.R
 ######################################################################################################
 # SETUP
 ######################################################################################################

 #libraries
 library(ggplot2)
 library(scales)
 library(dplyr)
 library(reshape2)
 library(stringr)
 library(rpart)
 library(rpart.plot)
 library(randomForest)
 library(caret)

 ######################################################################################################
 # GLOBAL VARIABLES
 ######################################################################################################

 FILE_PATH = "~/Data/BM/TM_Reseg_Raw_Data.csv"
 TRAIN_SHARE = 0.8
 TARGET_VAR = "target"
 ID_VAR = "CompanyIN"

 ######################################################################################################
 # DATA
 ######################################################################################################

 data <- list()
 data$raw <- read.csv(FILE_PATH,sep=",",header=T,stringsAsFactors = F,fileEncoding = "UTF-8")

 ######################################################################################################
 # CLEAN UP
 ######################################################################################################

 #replace NULL character values with NA
 data$raw[data$raw=="NULL"] <- NA
 data$raw$LegalFormID[is.na(data$raw$LegalFormID)]<-0
 data$raw$LegalForm[is.na(data$raw$LegalForm)]<-"None"

 #convert mysterious characater vectors to numerics
 data$raw$ActivityIndicatorScore <- as.numeric(data$raw$ActivityIndicatorScore)
 data$raw$ReachibilityIndicatorScore <- as.numeric(data$raw$ReachibilityIndicatorScore)
 data$raw$EmployeeLowerBound <- as.numeric(data$raw$EmployeeLowerBound)
 data$raw$TurnoverMinCzkThs <- as.numeric(data$raw$TurnoverMinCzkThs)

 #factorize the string
 data$raw$LegalFormAggregated <- factor(data$raw$LegalFormAggregated)

 #define an impute function as an inputation of a trimmed mean 
 impute = function(data,k,trim=0.1,max_na_share = 0.2){
  
  na_share = sum(is.na(data[,k]))/nrow(data)
  if(na_share > max_na_share){
    stop(sprintf(fmt = "A maximum share of %f reached with %f",na_share,max_na_share))
  }
  print(paste("NA share = ",na_share))
  k <- data[,k]
  t_mean = mean(k,trim = trim,na.rm = T)
  k[is.na(k) | is.nan(k)] <- t_mean
  return(k)
 }

 table(is.na(data$raw$ActivityIndicatorScore))

 data$raw$ActivityIndicatorScore <- impute(data = data$raw, k = "ActivityIndicatorScore")
 data$raw$TurnoverMinCzkThs <- impute(data = data$raw, k = "TurnoverMinCzkThs")
 data$raw$EmployeeLowerBound <- impute(data = data$raw, k = "EmployeeLowerBound")

 ######################################################################################################
 # TARGET VARIABLE
 ######################################################################################################

 data$raw$target <- 1
 data$raw$target[data$raw$Potential < 3001] <- 0
 prop.table(table(data$raw$target))
 
 ######################################################################################################
 # DESCRIPTIVES
 ######################################################################################################

 #select numerics
 numerics <- sapply(colnames(data$raw),function(col) is.numeric(data$raw[,col]))

 #describe the data set, if we see zero variance variables, we might drop them
 psych::describe(data$raw[,numerics])[,c("mean","median","sd","min","max","range")]

 #inspecting correlations
 corMat <-  cor(data$raw[,numerics])
 #fancy corrplot
 heatmap(corMat)

 #correlation of predictors with target
 sapply(data$raw[,numerics],function(k) cor(k,data$raw$target))
 #share of missing values in predictors
 sapply(data$raw[,numerics],function(k) prop.table(table(is.na(k))))
 #how many categories does each predictor have
 sort(sapply(data$raw,function(k) length(unique(k))),decreasing=T)

 ######################################################################################################
 # PREDICTORS
 ######################################################################################################

 exclude <- c("CurrentMobileSpendID","Potential","LegalFormID","SegmentID","MarkerCount","PrimaryNACE","Handover_SegmentID","Handover_CorrectSegment","District","InstitutionalSector","GrowthIndicatorScore")
 missing <- c("ReachibilityIndicatorScore")
 predictors <- !(colnames(data$raw) %in% c(exclude,missing))
 colnames(data$raw[,predictors])

 ######################################################################################################
 # DATA PARTITION
 ######################################################################################################

 #seed for reproducibility
 set.seed(1111)

 #define partition for sampling
 sample <- createDataPartition(data$raw[,TARGET_VAR],p=TRAIN_SHARE,list=F)

 #create train and test datasets
 data$train <- data$raw[sample,predictors]
 #create another partition for a validation sample
 validation <- createDataPartition(data$train[,TARGET_VAR],p=0.2,list=F)
 data$validation <- data$train[validation,]
 data$train <- data$train[-validation,]
 #create test sample
 data$test <- data$raw[-sample,predictors]

 # go for  93% accuracy
 nrow(data$train);nrow(data$train[complete.cases(data$train),])
 # go for 85% accuracy
 nrow(data$validation);nrow(data$validation[complete.cases(data$validation),])
 #go for ~%
 nrow(data$test);nrow(data$test[complete.cases(data$test),])


 #define factors for the train/test procedure
 target <- list()
 target$train <- factor(data$train[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))
 target$validation <- factor(data$validation[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))
 target$test <- factor(data$test[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))

 ######################################################################################################
 # MODELS
 ######################################################################################################

 #create a model and predictions container
 models <- list()
 predictions <- list(
  validation = list(),
  test = list()
 )

 ##############
 # LM
 ##############

 models$lm1 <- glm(formula = "target ~ . ", data = data$train, family = binomial(link='logit'))

 predictions$validation$lm = predict(models$lm1,data$validation)
 predictions$test$lm = predict(models$lm1,data$test)

 ##############
 # RANDOM FORESTS
 #
 # To get an idea of possible optimal performance, I use a more powerful model first
 # Random forests are powerful and more easily understood compred to other black-box models.
 ##############

 fourStats <- function (data, lev = levels(data$obs), model = NULL) {
  ## This code will get use the area under the ROC curve and the
  ## sensitivity and specificity values using the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
  
  ## The best possible model has sensitivity of 1 and specifity of 1. 
  ## How far are we from that value?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]), 
                   ncol = 2, 
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
 }

 #seed for reproducibility
 set.seed(1111)
 #define the training control object, shared by methods
 trainControlRF <- trainControl(
  method = "cv",
  number = 2,
  verboseIter = T,
  search = "grid",
  summaryFunction = fourStats,
  classProbs=T
 )
 gridRF <- expand.grid(mtry=c(7,9,10))
 #store results of the fit
 models$RF <- train(
  x = data$train[,!names(data$train) %in% c(TARGET_VAR,ID_VAR)],
  y = target$train,
  method="rf",
  metric="Spec",
  nodesize = 30,
  maxdepth = 15,
  ntree = 1000,
  trControl=trainControlRF,
  tuneGrid = gridRF,
  verbose=T,
  preProc = "knnImpute",
  na.action = na.pass
 )
 print(models$RF)
 #get variable importance for levels of predictors
 varImp(models$RF)
 #see model summary
 models$RF
 #predict using model
 predictions$validation$RF<- predict(models$RF,data$validation)
 #confusion matrix for binary classifer
 confusionMatrix(predictions$validation$RF,target$validation,positive="SME")

 #predict using model
 predictions$test$RF<- predict(models$RF,data$test)
 #confusion matrix for binary classifer
 confusionMatrix(predictions$test$RF,target$test,positive="SME")

 ##############
 # XGB
 ##############

 #prepare special data format for xgb
 #[,-which(colnames(data$train) == TARGET_VAR)]
 data$train_xgb <- xgb.DMatrix(data = data.matrix(data$train[,-which(colnames(data$train) == TARGET_VAR)]), label=data$train$target)
 data$validation_xgb <- xgb.DMatrix(data = data.matrix(data$validation[,-which(colnames(data$train) == TARGET_VAR)]), label=data$validation$target)
 data$test_xgb <- xgb.DMatrix(data = data.matrix(data$test[,-which(colnames(data$train) == TARGET_VAR)]), label=data$test$target)
 #define a watchlist
 models$xgb_watchlist <- list(train=data$train_xgb, test=data$validation_xgb)

 models$xgb = xgb.train(
  params=list(
    "booster" = "gbtree",
    "objective" = "binary:logistic",
    "eval_metric" = "error",
    "eval_metric" = "logloss",
    "eval_metric" = "auc",
    "max_depth" = 15,
    "silent" = -1
  ),
  data = data$train_xgb,
  watchlist=models$xgb_watchlist,
  nrounds = 100,
  verbose = 2,
  seed=123
 )

 "max_depth" = 4,
 "subsample" = 0.5,
 "colsample_bytree" = 0.7,
 "gamma"= 0.1,

 predictions$validation$xgb <- predict(models$xgb,data.matrix(data$validation))
 predictions$validation$xgb <- factor(round(predictions$validation$xgb),levels=c(0,1),labels=c("VSE","SME"))
 confusionMatrix(predictions$validation$xgb,target$validation,positive="SME")

 #test (open opps)
 predictions$test$xgb <- predict(models$xgb,data.matrix(data$test))
 predictions$test$xgb <- factor(round(predictions$test$xgb),levels=c(0,1),labels=c("VSE","SME"))
 confusionMatrix(predictions$test$xgb,target$test,positive="SME")

 ##############
 # TEST
 ##############

 truePositiveRates <- c(
  "XGB"=sensitivity(predictions$test$xgb,target$test),
  "RF"=sensitivity(predictions$test$RF,target$test)
 )
 falsePositiveRates <- c(
  "XGB"=1-specificity(predictions$test$xgb,target$test),
  "RF"=1-specificity(predictions$test$RF,target$test)
 )

 #improvised ROC plot, need to adjust the scales, otherwise not useful
 plot(x=falsePositiveRates,
     y=truePositiveRates,
     xlim=c(0,1),
     ylim=c(0,1),
     main="Model accuracy plot",
     xlab= "False Negative Rate \n (% failed to classified  as SME when SME)",
     ylab= "True Positive Rate \n (% correctly classified from all cases)", pch = 19, cex = 1)
 abline(0,1)
 text(x=falsePositiveRates+0.05,y=truePositiveRates,names(truePositiveRates),cex=0.5)

 ##############
 # EXAMINE MISSCLASSIFIED
 ##############

 residuals <- data.frame(
  "prob" = predict(models$RF,data$validation,type="prob"),
  "true" = data$validation$target
 )
 residuals$dev <- residuals$true - residuals$prob.SME
 residuals$fneg <- residuals$dev > 0
 residuals$fneg_strong  <- residuals$dev > 0.4
 table(residuals$fneg_strong)
 hist(residuals$dev)

 misfits <- data$validation[residuals$fneg_strong,]
	######################################################################################################
	# SETUP
	######################################################################################################

	#libraries
	library(ggplot2)
	library(scales)
	library(dplyr)
	library(reshape2)
	library(stringr)
	library(rpart)
	library(rpart.plot)
	library(randomForest)
	library(caret)

	######################################################################################################
	# GLOBAL VARIABLES
	######################################################################################################

	FILE_PATH = "~/Data/BM/TM_Reseg_Raw_Data.csv"
	TRAIN_SHARE = 0.8
	TARGET_VAR = "target"
	ID_VAR = "CompanyIN"

	######################################################################################################
	# DATA
	######################################################################################################

	data <- list()
	data$raw <- read.csv(FILE_PATH,sep=",",header=T,stringsAsFactors = F,fileEncoding = "UTF-8")

	######################################################################################################
	# CLEAN UP
	######################################################################################################

	#replace NULL character values with NA
	data$raw[data$raw=="NULL"] <- NA
	data$raw$LegalFormID[is.na(data$raw$LegalFormID)]<-0
	data$raw$LegalForm[is.na(data$raw$LegalForm)]<-"None"

	#convert mysterious characater vectors to numerics
	data$raw$ActivityIndicatorScore <- as.numeric(data$raw$ActivityIndicatorScore)
	data$raw$ReachibilityIndicatorScore <- as.numeric(data$raw$ReachibilityIndicatorScore)
	data$raw$EmployeeLowerBound <- as.numeric(data$raw$EmployeeLowerBound)
	data$raw$TurnoverMinCzkThs <- as.numeric(data$raw$TurnoverMinCzkThs)

	#factorize the string
	data$raw$LegalFormAggregated <- factor(data$raw$LegalFormAggregated)

	#define an impute function as an inputation of a trimmed mean
	impute = function(data,k,trim=0.1,max_na_share = 0.2){

	na_share = sum(is.na(data[,k]))/nrow(data)
	if(na_share > max_na_share){
	stop(sprintf(fmt = "A maximum share of %f reached with %f",na_share,max_na_share))
	}
	print(paste("NA share = ",na_share))
	k <- data[,k]
	t_mean = mean(k,trim = trim,na.rm = T)
	k[is.na(k) \| is.nan(k)] <- t_mean
	return(k)
	}

	table(is.na(data$raw$ActivityIndicatorScore))

	data$raw$ActivityIndicatorScore <- impute(data = data$raw, k = "ActivityIndicatorScore")
	data$raw$TurnoverMinCzkThs <- impute(data = data$raw, k = "TurnoverMinCzkThs")
	data$raw$EmployeeLowerBound <- impute(data = data$raw, k = "EmployeeLowerBound")

	######################################################################################################
	# TARGET VARIABLE
	######################################################################################################

	data$raw$target <- 1
	data$raw$target[data$raw$Potential < 3001] <- 0
	prop.table(table(data$raw$target))

	######################################################################################################
	# DESCRIPTIVES
	######################################################################################################

	#select numerics
	numerics <- sapply(colnames(data$raw),function(col) is.numeric(data$raw[,col]))

	#describe the data set, if we see zero variance variables, we might drop them
	psych::describe(data$raw[,numerics])[,c("mean","median","sd","min","max","range")]

	#inspecting correlations
	corMat <- cor(data$raw[,numerics])
	#fancy corrplot
	heatmap(corMat)

	#correlation of predictors with target
	sapply(data$raw[,numerics],function(k) cor(k,data$raw$target))
	#share of missing values in predictors
	sapply(data$raw[,numerics],function(k) prop.table(table(is.na(k))))
	#how many categories does each predictor have
	sort(sapply(data$raw,function(k) length(unique(k))),decreasing=T)

	######################################################################################################
	# PREDICTORS
	######################################################################################################

	exclude <- c("CurrentMobileSpendID","Potential","LegalFormID","SegmentID","MarkerCount","PrimaryNACE","Handover_SegmentID","Handover_CorrectSegment","District","InstitutionalSector","GrowthIndicatorScore")
	missing <- c("ReachibilityIndicatorScore")
	predictors <- !(colnames(data$raw) %in% c(exclude,missing))
	colnames(data$raw[,predictors])

	######################################################################################################
	# DATA PARTITION
	######################################################################################################

	#seed for reproducibility
	set.seed(1111)

	#define partition for sampling
	sample <- createDataPartition(data$raw[,TARGET_VAR],p=TRAIN_SHARE,list=F)

	#create train and test datasets
	data$train <- data$raw[sample,predictors]
	#create another partition for a validation sample
	validation <- createDataPartition(data$train[,TARGET_VAR],p=0.2,list=F)
	data$validation <- data$train[validation,]
	data$train <- data$train[-validation,]
	#create test sample
	data$test <- data$raw[-sample,predictors]

	# go for 93% accuracy
	nrow(data$train);nrow(data$train[complete.cases(data$train),])
	# go for 85% accuracy
	nrow(data$validation);nrow(data$validation[complete.cases(data$validation),])
	#go for ~%
	nrow(data$test);nrow(data$test[complete.cases(data$test),])


	#define factors for the train/test procedure
	target <- list()
	target$train <- factor(data$train[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))
	target$validation <- factor(data$validation[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))
	target$test <- factor(data$test[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))

	######################################################################################################
	# MODELS
	######################################################################################################

	#create a model and predictions container
	models <- list()
	predictions <- list(
	validation = list(),
	test = list()
	)

	##############
	# LM
	##############

	models$lm1 <- glm(formula = "target ~ . ", data = data$train, family = binomial(link='logit'))

	predictions$validation$lm = predict(models$lm1,data$validation)
	predictions$test$lm = predict(models$lm1,data$test)

	##############
	# RANDOM FORESTS
	#
	# To get an idea of possible optimal performance, I use a more powerful model first
	# Random forests are powerful and more easily understood compred to other black-box models.
	##############

	fourStats <- function (data, lev = levels(data$obs), model = NULL) {
	## This code will get use the area under the ROC curve and the
	## sensitivity and specificity values using the current candidate
	## value of the probability threshold.
	out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))

	## The best possible model has sensitivity of 1 and specifity of 1.
	## How far are we from that value?
	coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
	ncol = 2,
	byrow = TRUE)
	colnames(coords) <- c("Spec", "Sens")
	rownames(coords) <- c("Best", "Current")
	c(out, Dist = dist(coords)[1])
	}

	#seed for reproducibility
	set.seed(1111)
	#define the training control object, shared by methods
	trainControlRF <- trainControl(
	method = "cv",
	number = 2,
	verboseIter = T,
	search = "grid",
	summaryFunction = fourStats,
	classProbs=T
	)
	gridRF <- expand.grid(mtry=c(7,9,10))
	#store results of the fit
	models$RF <- train(
	x = data$train[,!names(data$train) %in% c(TARGET_VAR,ID_VAR)],
	y = target$train,
	method="rf",
	metric="Spec",
	nodesize = 30,
	maxdepth = 15,
	ntree = 1000,
	trControl=trainControlRF,
	tuneGrid = gridRF,
	verbose=T,
	preProc = "knnImpute",
	na.action = na.pass
	)
	print(models$RF)
	#get variable importance for levels of predictors
	varImp(models$RF)
	#see model summary
	models$RF
	#predict using model
	predictions$validation$RF<- predict(models$RF,data$validation)
	#confusion matrix for binary classifer
	confusionMatrix(predictions$validation$RF,target$validation,positive="SME")

	#predict using model
	predictions$test$RF<- predict(models$RF,data$test)
	#confusion matrix for binary classifer
	confusionMatrix(predictions$test$RF,target$test,positive="SME")

	##############
	# XGB
	##############

	#prepare special data format for xgb
	#[,-which(colnames(data$train) == TARGET_VAR)]
	data$train_xgb <- xgb.DMatrix(data = data.matrix(data$train[,-which(colnames(data$train) == TARGET_VAR)]), label=data$train$target)
	data$validation_xgb <- xgb.DMatrix(data = data.matrix(data$validation[,-which(colnames(data$train) == TARGET_VAR)]), label=data$validation$target)
	data$test_xgb <- xgb.DMatrix(data = data.matrix(data$test[,-which(colnames(data$train) == TARGET_VAR)]), label=data$test$target)
	#define a watchlist
	models$xgb_watchlist <- list(train=data$train_xgb, test=data$validation_xgb)

	models$xgb = xgb.train(
	params=list(
	"booster" = "gbtree",
	"objective" = "binary:logistic",
	"eval_metric" = "error",
	"eval_metric" = "logloss",
	"eval_metric" = "auc",
	"max_depth" = 15,
	"silent" = -1
	),
	data = data$train_xgb,
	watchlist=models$xgb_watchlist,
	nrounds = 100,
	verbose = 2,
	seed=123
	)

	"max_depth" = 4,
	"subsample" = 0.5,
	"colsample_bytree" = 0.7,
	"gamma"= 0.1,

	predictions$validation$xgb <- predict(models$xgb,data.matrix(data$validation))
	predictions$validation$xgb <- factor(round(predictions$validation$xgb),levels=c(0,1),labels=c("VSE","SME"))
	confusionMatrix(predictions$validation$xgb,target$validation,positive="SME")

	#test (open opps)
	predictions$test$xgb <- predict(models$xgb,data.matrix(data$test))
	predictions$test$xgb <- factor(round(predictions$test$xgb),levels=c(0,1),labels=c("VSE","SME"))
	confusionMatrix(predictions$test$xgb,target$test,positive="SME")

	##############
	# TEST
	##############

	truePositiveRates <- c(
	"XGB"=sensitivity(predictions$test$xgb,target$test),
	"RF"=sensitivity(predictions$test$RF,target$test)
	)
	falsePositiveRates <- c(
	"XGB"=1-specificity(predictions$test$xgb,target$test),
	"RF"=1-specificity(predictions$test$RF,target$test)
	)

	#improvised ROC plot, need to adjust the scales, otherwise not useful
	plot(x=falsePositiveRates,
	y=truePositiveRates,
	xlim=c(0,1),
	ylim=c(0,1),
	main="Model accuracy plot",
	xlab= "False Negative Rate \n (% failed to classified as SME when SME)",
	ylab= "True Positive Rate \n (% correctly classified from all cases)", pch = 19, cex = 1)
	abline(0,1)
	text(x=falsePositiveRates+0.05,y=truePositiveRates,names(truePositiveRates),cex=0.5)

	##############
	# EXAMINE MISSCLASSIFIED
	##############

	residuals <- data.frame(
	"prob" = predict(models$RF,data$validation,type="prob"),
	"true" = data$validation$target
	)
	residuals$dev <- residuals$true - residuals$prob.SME
	residuals$fneg <- residuals$dev > 0
	residuals$fneg_strong <- residuals$dev > 0.4
	table(residuals$fneg_strong)
	hist(residuals$dev)

	misfits <- data$validation[residuals$fneg_strong,]