######################################################################################################
# SETUP
######################################################################################################
#libraries
library(ggplot2)
library(scales)
library(dplyr)
library(reshape2)
library(stringr)
library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
library(xgboost)
######################################################################################################
# GLOBAL VARIABLES
######################################################################################################
FILE_PATH = "~/Data/BM/TM_Reseg_Raw_Data.csv"
TRAIN_SHARE = 0.8
TARGET_VAR = "target"
ID_VAR = "CompanyIN"
######################################################################################################
# DATA
######################################################################################################
data <- list()
data$raw <- read.csv(FILE_PATH,sep=",",header=T,stringsAsFactors = F,fileEncoding = "UTF-8")
######################################################################################################
# CLEAN UP
######################################################################################################
#replace NULL character values with NA
data$raw[data$raw=="NULL"] <- NA
data$raw$LegalFormID[is.na(data$raw$LegalFormID)]<-0
data$raw$LegalForm[is.na(data$raw$LegalForm)]<-"None"
#convert mysterious character vectors to numerics
data$raw$ActivityIndicatorScore <- as.numeric(data$raw$ActivityIndicatorScore)
data$raw$ReachibilityIndicatorScore <- as.numeric(data$raw$ReachibilityIndicatorScore)
data$raw$EmployeeLowerBound <- as.numeric(data$raw$EmployeeLowerBound)
data$raw$TurnoverMinCzkThs <- as.numeric(data$raw$TurnoverMinCzkThs)
#factorize the string
data$raw$LegalFormAggregated <- factor(data$raw$LegalFormAggregated)
#define an impute function as imputation of a trimmed mean
impute = function(data, k, trim = 0.1, max_na_share = 0.2){
  na_share = sum(is.na(data[,k]))/nrow(data)
  if(na_share > max_na_share){
    stop(sprintf(fmt = "NA share of %f exceeds the allowed maximum of %f", na_share, max_na_share))
  }
  print(paste("NA share = ", na_share))
  k <- data[,k]
  t_mean = mean(k, trim = trim, na.rm = T)
  k[is.na(k) | is.nan(k)] <- t_mean
  return(k)
}
table(is.na(data$raw$ActivityIndicatorScore))
data$raw$ActivityIndicatorScore <- impute(data = data$raw, k = "ActivityIndicatorScore")
data$raw$TurnoverMinCzkThs <- impute(data = data$raw, k = "TurnoverMinCzkThs")
data$raw$EmployeeLowerBound <- impute(data = data$raw, k = "EmployeeLowerBound")
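#sanity check (a small addition): the imputed columns should contain no missing values now
stopifnot(
  !anyNA(data$raw$ActivityIndicatorScore),
  !anyNA(data$raw$TurnoverMinCzkThs),
  !anyNA(data$raw$EmployeeLowerBound)
)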
######################################################################################################
# TARGET VARIABLE
######################################################################################################
data$raw$target <- 1
data$raw$target[data$raw$Potential < 3001] <- 0
prop.table(table(data$raw$target))
######################################################################################################
# DESCRIPTIVES
######################################################################################################
#select numerics
numerics <- sapply(colnames(data$raw),function(col) is.numeric(data$raw[,col]))
#describe the data set; zero-variance variables are candidates for dropping (see the sketch below)
psych::describe(data$raw[,numerics])[,c("mean","median","sd","min","max","range")]
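#a sketch using caret::nearZeroVar to flag (near) zero-variance predictors;
#the returned column indices are candidates for dropping before modelling
nzv <- nearZeroVar(data$raw[,numerics])
if(length(nzv) > 0) print(colnames(data$raw[,numerics])[nzv])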
#inspecting correlations
corMat <- cor(data$raw[,numerics], use = "pairwise.complete.obs")
#correlation heatmap
heatmap(corMat)
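#optionally, caret::findCorrelation suggests which of the highly correlated
#predictors to drop; the 0.9 cutoff is an assumption, tune as needed
highCor <- findCorrelation(corMat, cutoff = 0.9)
colnames(corMat)[highCor]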
#correlation of predictors with target
sapply(data$raw[,numerics],function(k) cor(k, data$raw$target, use = "pairwise.complete.obs"))
#share of missing values in predictors
sapply(data$raw[,numerics],function(k) prop.table(table(is.na(k))))
#how many categories does each predictor have
sort(sapply(data$raw,function(k) length(unique(k))),decreasing=T)
######################################################################################################
# PREDICTORS
######################################################################################################
exclude <- c("CurrentMobileSpendID","Potential","LegalFormID","SegmentID","MarkerCount","PrimaryNACE","Handover_SegmentID","Handover_CorrectSegment","District","InstitutionalSector","GrowthIndicatorScore")
missing <- c("ReachibilityIndicatorScore")
predictors <- !(colnames(data$raw) %in% c(exclude,missing))
colnames(data$raw[,predictors])
######################################################################################################
# DATA PARTITION
######################################################################################################
#seed for reproducibility
set.seed(1111)
#define partition for sampling
sample <- createDataPartition(data$raw[,TARGET_VAR],p=TRAIN_SHARE,list=F)
#create train and test datasets
data$train <- data$raw[sample,predictors]
#create another partition for a validation sample
validation <- createDataPartition(data$train[,TARGET_VAR],p=0.2,list=F)
data$validation <- data$train[validation,]
data$train <- data$train[-validation,]
#create test sample
data$test <- data$raw[-sample,predictors]
#rows vs. complete cases in the training set
nrow(data$train);nrow(data$train[complete.cases(data$train),])
#rows vs. complete cases in the validation set
nrow(data$validation);nrow(data$validation[complete.cases(data$validation),])
#rows vs. complete cases in the test set
nrow(data$test);nrow(data$test[complete.cases(data$test),])
#define factors for the train/test procedure
target <- list()
target$train <- factor(data$train[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))
target$validation <- factor(data$validation[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))
target$test <- factor(data$test[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME"))
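#sanity check (a small addition): class balance should be similar across the three partitions
sapply(target, function(t) prop.table(table(t)))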
######################################################################################################
# MODELS
######################################################################################################
#create a model and predictions container
models <- list()
predictions <- list(
validation = list(),
test = list()
)
##############
# LM
##############
models$lm1 <- glm(formula = target ~ ., data = data$train, family = binomial(link='logit'))
#predict probabilities (type = "response"), not the default link scale
predictions$validation$lm = predict(models$lm1, data$validation, type = "response")
predictions$test$lm = predict(models$lm1, data$test, type = "response")
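#a minimal sketch (assumed 0.5 cutoff): threshold the predicted probabilities
#into class labels comparable with the tree-based models evaluated below;
#rows with missing predictors yield NA predictions and fall out of the table
predictions$validation$lm_class <- factor(ifelse(predictions$validation$lm > 0.5, "SME", "VSE"), levels = c("VSE","SME"))
confusionMatrix(predictions$validation$lm_class, target$validation, positive = "SME")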
##############
# RANDOM FORESTS
#
# To get an idea of the possible optimal performance, I use a more powerful model first.
# Random forests are powerful and more easily understood compared to other black-box models.
##############
fourStats <- function(data, lev = levels(data$obs), model = NULL) {
  ## This code computes the area under the ROC curve and the
  ## sensitivity and specificity values using the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
  ## The best possible model has sensitivity of 1 and specificity of 1.
  ## How far are we from that point?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
                   ncol = 2,
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
}
#seed for reproducibility
set.seed(1111)
#define the training control object, shared by methods
trainControlRF <- trainControl(
  method = "cv",
  number = 2,
  verboseIter = T,
  search = "grid",
  summaryFunction = fourStats,
  classProbs = T
)
gridRF <- expand.grid(mtry=c(7,9,10))
#store results of the fit
models$RF <- train(
  x = data$train[,!names(data$train) %in% c(TARGET_VAR,ID_VAR)],
  y = target$train,
  method = "rf",
  metric = "Spec",
  nodesize = 30,
  ntree = 1000,
  trControl = trainControlRF,
  tuneGrid = gridRF,
  verbose = T,
  preProc = "knnImpute",
  na.action = na.pass
)
print(models$RF)
#variable importance of the predictors
varImp(models$RF)
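#visualize the cross-validated Spec metric across the mtry grid, and the top predictors
plot(models$RF)
plot(varImp(models$RF), top = 10)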
#predict using model
predictions$validation$RF<- predict(models$RF,data$validation)
#confusion matrix for binary classifier
confusionMatrix(predictions$validation$RF,target$validation,positive="SME")
#predict using model
predictions$test$RF<- predict(models$RF,data$test)
#confusion matrix for binary classifier
confusionMatrix(predictions$test$RF,target$test,positive="SME")
##############
# XGB
##############
#prepare the special DMatrix format required by xgboost
data$train_xgb <- xgb.DMatrix(data = data.matrix(data$train[,-which(colnames(data$train) == TARGET_VAR)]), label = data$train$target)
data$validation_xgb <- xgb.DMatrix(data = data.matrix(data$validation[,-which(colnames(data$validation) == TARGET_VAR)]), label = data$validation$target)
data$test_xgb <- xgb.DMatrix(data = data.matrix(data$test[,-which(colnames(data$test) == TARGET_VAR)]), label = data$test$target)
#define a watchlist
models$xgb_watchlist <- list(train=data$train_xgb, test=data$validation_xgb)
#seed for reproducibility (R-side; xgb.train itself has no seed argument)
set.seed(123)
models$xgb = xgb.train(
  params = list(
    "booster" = "gbtree",
    "objective" = "binary:logistic",
    "eval_metric" = "error",
    "eval_metric" = "logloss",
    "eval_metric" = "auc",
    "max_depth" = 15,
    "subsample" = 0.5,
    "colsample_bytree" = 0.7,
    "gamma" = 0.1
  ),
  data = data$train_xgb,
  watchlist = models$xgb_watchlist,
  nrounds = 100,
  verbose = 2
)
predictions$validation$xgb <- predict(models$xgb, data$validation_xgb)
predictions$validation$xgb <- factor(round(predictions$validation$xgb), levels = c(0,1), labels = c("VSE","SME"))
confusionMatrix(predictions$validation$xgb,target$validation,positive="SME")
#test (open opps)
predictions$test$xgb <- predict(models$xgb, data$test_xgb)
predictions$test$xgb <- factor(round(predictions$test$xgb), levels = c(0,1), labels = c("VSE","SME"))
confusionMatrix(predictions$test$xgb,target$test,positive="SME")
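#feature importance for the xgb model, mirroring varImp for the RF above
importanceXGB <- xgb.importance(model = models$xgb)
head(importanceXGB)
xgb.plot.importance(importanceXGB, top_n = 10)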
##############
# TEST
##############
truePositiveRates <- c(
  "XGB" = sensitivity(predictions$test$xgb, target$test, positive = "SME"),
  "RF" = sensitivity(predictions$test$RF, target$test, positive = "SME")
)
falsePositiveRates <- c(
  "XGB" = 1 - specificity(predictions$test$xgb, target$test, negative = "VSE"),
  "RF" = 1 - specificity(predictions$test$RF, target$test, negative = "VSE")
)
#improvised ROC-style plot of single operating points; a full ROC curve follows below
plot(x = falsePositiveRates,
     y = truePositiveRates,
     xlim = c(0,1),
     ylim = c(0,1),
     main = "Model accuracy plot",
     xlab = "False Positive Rate \n (% incorrectly classified as SME when VSE)",
     ylab = "True Positive Rate \n (% correctly classified as SME when SME)", pch = 19, cex = 1)
abline(0,1)
text(x=falsePositiveRates+0.05,y=truePositiveRates,names(truePositiveRates),cex=0.5)
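#a proper ROC curve via the pROC package (an extra dependency, not loaded in
#the setup above), using the RF class probabilities on the test set
library(pROC)
rocRF <- roc(response = target$test, predictor = predict(models$RF, data$test, type = "prob")[,"SME"], levels = c("VSE","SME"))
plot(rocRF, main = sprintf("RF ROC curve, AUC = %.3f", auc(rocRF)))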
##############
# EXAMINE MISCLASSIFIED
##############
residuals <- data.frame(
"prob" = predict(models$RF,data$validation,type="prob"),
"true" = data$validation$target
)
residuals$dev <- residuals$true - residuals$prob.SME
residuals$fneg <- residuals$dev > 0
residuals$fneg_strong <- residuals$dev > 0.4
table(residuals$fneg_strong)
hist(residuals$dev)
misfits <- data$validation[residuals$fneg_strong,]
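#a quick sketch comparing the strongly misclassified companies with the rest
#of the validation set on the numeric columns
numericVal <- sapply(data$validation, is.numeric)
rbind(
  misfits = colMeans(misfits[,numericVal], na.rm = TRUE),
  others = colMeans(data$validation[!residuals$fneg_strong, numericVal], na.rm = TRUE)
)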