Created
April 5, 2018 14:33
-
-
Save janfait/722c035b878826f5d876d2dbf28b4469 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################################################################################################### | |
# SETUP | |
###################################################################################################### | |
#libraries | |
library(ggplot2) | |
library(scales) | |
library(dplyr) | |
library(reshape2) | |
library(stringr) | |
library(rpart) | |
library(rpart.plot) | |
library(randomForest) | |
library(caret) | |
###################################################################################################### | |
# GLOBAL VARIABLES | |
###################################################################################################### | |
FILE_PATH = "~/Data/BM/TM_Reseg_Raw_Data.csv" | |
TRAIN_SHARE = 0.8 | |
TARGET_VAR = "target" | |
ID_VAR = "CompanyIN" | |
###################################################################################################### | |
# DATA | |
###################################################################################################### | |
data <- list() | |
data$raw <- read.csv(FILE_PATH,sep=",",header=T,stringsAsFactors = F,fileEncoding = "UTF-8") | |
###################################################################################################### | |
# CLEAN UP | |
###################################################################################################### | |
#replace NULL character values with NA | |
data$raw[data$raw=="NULL"] <- NA | |
data$raw$LegalFormID[is.na(data$raw$LegalFormID)]<-0 | |
data$raw$LegalForm[is.na(data$raw$LegalForm)]<-"None" | |
#convert mysterious characater vectors to numerics | |
data$raw$ActivityIndicatorScore <- as.numeric(data$raw$ActivityIndicatorScore) | |
data$raw$ReachibilityIndicatorScore <- as.numeric(data$raw$ReachibilityIndicatorScore) | |
data$raw$EmployeeLowerBound <- as.numeric(data$raw$EmployeeLowerBound) | |
data$raw$TurnoverMinCzkThs <- as.numeric(data$raw$TurnoverMinCzkThs) | |
#factorize the string | |
data$raw$LegalFormAggregated <- factor(data$raw$LegalFormAggregated) | |
#define an impute function as an inputation of a trimmed mean | |
impute = function(data,k,trim=0.1,max_na_share = 0.2){ | |
na_share = sum(is.na(data[,k]))/nrow(data) | |
if(na_share > max_na_share){ | |
stop(sprintf(fmt = "A maximum share of %f reached with %f",na_share,max_na_share)) | |
} | |
print(paste("NA share = ",na_share)) | |
k <- data[,k] | |
t_mean = mean(k,trim = trim,na.rm = T) | |
k[is.na(k) | is.nan(k)] <- t_mean | |
return(k) | |
} | |
table(is.na(data$raw$ActivityIndicatorScore)) | |
data$raw$ActivityIndicatorScore <- impute(data = data$raw, k = "ActivityIndicatorScore") | |
data$raw$TurnoverMinCzkThs <- impute(data = data$raw, k = "TurnoverMinCzkThs") | |
data$raw$EmployeeLowerBound <- impute(data = data$raw, k = "EmployeeLowerBound") | |
###################################################################################################### | |
# TARGET VARIABLE | |
###################################################################################################### | |
data$raw$target <- 1 | |
data$raw$target[data$raw$Potential < 3001] <- 0 | |
prop.table(table(data$raw$target)) | |
###################################################################################################### | |
# DESCRIPTIVES | |
###################################################################################################### | |
#select numerics | |
numerics <- sapply(colnames(data$raw),function(col) is.numeric(data$raw[,col])) | |
#describe the data set, if we see zero variance variables, we might drop them | |
psych::describe(data$raw[,numerics])[,c("mean","median","sd","min","max","range")] | |
#inspecting correlations | |
corMat <- cor(data$raw[,numerics]) | |
#fancy corrplot | |
heatmap(corMat) | |
#correlation of predictors with target | |
sapply(data$raw[,numerics],function(k) cor(k,data$raw$target)) | |
#share of missing values in predictors | |
sapply(data$raw[,numerics],function(k) prop.table(table(is.na(k)))) | |
#how many categories does each predictor have | |
sort(sapply(data$raw,function(k) length(unique(k))),decreasing=T) | |
###################################################################################################### | |
# PREDICTORS | |
###################################################################################################### | |
exclude <- c("CurrentMobileSpendID","Potential","LegalFormID","SegmentID","MarkerCount","PrimaryNACE","Handover_SegmentID","Handover_CorrectSegment","District","InstitutionalSector","GrowthIndicatorScore") | |
missing <- c("ReachibilityIndicatorScore") | |
predictors <- !(colnames(data$raw) %in% c(exclude,missing)) | |
colnames(data$raw[,predictors]) | |
###################################################################################################### | |
# DATA PARTITION | |
###################################################################################################### | |
#seed for reproducibility | |
set.seed(1111) | |
#define partition for sampling | |
sample <- createDataPartition(data$raw[,TARGET_VAR],p=TRAIN_SHARE,list=F) | |
#create train and test datasets | |
data$train <- data$raw[sample,predictors] | |
#create another partition for a validation sample | |
validation <- createDataPartition(data$train[,TARGET_VAR],p=0.2,list=F) | |
data$validation <- data$train[validation,] | |
data$train <- data$train[-validation,] | |
#create test sample | |
data$test <- data$raw[-sample,predictors] | |
# go for 93% accuracy | |
nrow(data$train);nrow(data$train[complete.cases(data$train),]) | |
# go for 85% accuracy | |
nrow(data$validation);nrow(data$validation[complete.cases(data$validation),]) | |
#go for ~% | |
nrow(data$test);nrow(data$test[complete.cases(data$test),]) | |
#define factors for the train/test procedure | |
target <- list() | |
target$train <- factor(data$train[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME")) | |
target$validation <- factor(data$validation[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME")) | |
target$test <- factor(data$test[,TARGET_VAR],levels=c(0,1),labels=c("VSE","SME")) | |
###################################################################################################### | |
# MODELS | |
###################################################################################################### | |
#create a model and predictions container | |
models <- list() | |
predictions <- list( | |
validation = list(), | |
test = list() | |
) | |
############## | |
# LM | |
############## | |
models$lm1 <- glm(formula = "target ~ . ", data = data$train, family = binomial(link='logit')) | |
predictions$validation$lm = predict(models$lm1,data$validation) | |
predictions$test$lm = predict(models$lm1,data$test) | |
############## | |
# RANDOM FORESTS | |
# | |
# To get an idea of possible optimal performance, I use a more powerful model first | |
# Random forests are powerful and more easily understood compred to other black-box models. | |
############## | |
fourStats <- function (data, lev = levels(data$obs), model = NULL) { | |
## This code will get use the area under the ROC curve and the | |
## sensitivity and specificity values using the current candidate | |
## value of the probability threshold. | |
out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL)) | |
## The best possible model has sensitivity of 1 and specifity of 1. | |
## How far are we from that value? | |
coords <- matrix(c(1, 1, out["Spec"], out["Sens"]), | |
ncol = 2, | |
byrow = TRUE) | |
colnames(coords) <- c("Spec", "Sens") | |
rownames(coords) <- c("Best", "Current") | |
c(out, Dist = dist(coords)[1]) | |
} | |
#seed for reproducibility | |
set.seed(1111) | |
#define the training control object, shared by methods | |
trainControlRF <- trainControl( | |
method = "cv", | |
number = 2, | |
verboseIter = T, | |
search = "grid", | |
summaryFunction = fourStats, | |
classProbs=T | |
) | |
gridRF <- expand.grid(mtry=c(7,9,10)) | |
#store results of the fit | |
models$RF <- train( | |
x = data$train[,!names(data$train) %in% c(TARGET_VAR,ID_VAR)], | |
y = target$train, | |
method="rf", | |
metric="Spec", | |
nodesize = 30, | |
maxdepth = 15, | |
ntree = 1000, | |
trControl=trainControlRF, | |
tuneGrid = gridRF, | |
verbose=T, | |
preProc = "knnImpute", | |
na.action = na.pass | |
) | |
print(models$RF) | |
#get variable importance for levels of predictors | |
varImp(models$RF) | |
#see model summary | |
models$RF | |
#predict using model | |
predictions$validation$RF<- predict(models$RF,data$validation) | |
#confusion matrix for binary classifer | |
confusionMatrix(predictions$validation$RF,target$validation,positive="SME") | |
#predict using model | |
predictions$test$RF<- predict(models$RF,data$test) | |
#confusion matrix for binary classifer | |
confusionMatrix(predictions$test$RF,target$test,positive="SME") | |
############## | |
# XGB | |
############## | |
#prepare special data format for xgb | |
#[,-which(colnames(data$train) == TARGET_VAR)] | |
data$train_xgb <- xgb.DMatrix(data = data.matrix(data$train[,-which(colnames(data$train) == TARGET_VAR)]), label=data$train$target) | |
data$validation_xgb <- xgb.DMatrix(data = data.matrix(data$validation[,-which(colnames(data$train) == TARGET_VAR)]), label=data$validation$target) | |
data$test_xgb <- xgb.DMatrix(data = data.matrix(data$test[,-which(colnames(data$train) == TARGET_VAR)]), label=data$test$target) | |
#define a watchlist | |
models$xgb_watchlist <- list(train=data$train_xgb, test=data$validation_xgb) | |
models$xgb = xgb.train( | |
params=list( | |
"booster" = "gbtree", | |
"objective" = "binary:logistic", | |
"eval_metric" = "error", | |
"eval_metric" = "logloss", | |
"eval_metric" = "auc", | |
"max_depth" = 15, | |
"silent" = -1 | |
), | |
data = data$train_xgb, | |
watchlist=models$xgb_watchlist, | |
nrounds = 100, | |
verbose = 2, | |
seed=123 | |
) | |
"max_depth" = 4, | |
"subsample" = 0.5, | |
"colsample_bytree" = 0.7, | |
"gamma"= 0.1, | |
predictions$validation$xgb <- predict(models$xgb,data.matrix(data$validation)) | |
predictions$validation$xgb <- factor(round(predictions$validation$xgb),levels=c(0,1),labels=c("VSE","SME")) | |
confusionMatrix(predictions$validation$xgb,target$validation,positive="SME") | |
#test (open opps) | |
predictions$test$xgb <- predict(models$xgb,data.matrix(data$test)) | |
predictions$test$xgb <- factor(round(predictions$test$xgb),levels=c(0,1),labels=c("VSE","SME")) | |
confusionMatrix(predictions$test$xgb,target$test,positive="SME") | |
############## | |
# TEST | |
############## | |
truePositiveRates <- c( | |
"XGB"=sensitivity(predictions$test$xgb,target$test), | |
"RF"=sensitivity(predictions$test$RF,target$test) | |
) | |
falsePositiveRates <- c( | |
"XGB"=1-specificity(predictions$test$xgb,target$test), | |
"RF"=1-specificity(predictions$test$RF,target$test) | |
) | |
#improvised ROC plot, need to adjust the scales, otherwise not useful | |
plot(x=falsePositiveRates, | |
y=truePositiveRates, | |
xlim=c(0,1), | |
ylim=c(0,1), | |
main="Model accuracy plot", | |
xlab= "False Negative Rate \n (% failed to classified as SME when SME)", | |
ylab= "True Positive Rate \n (% correctly classified from all cases)", pch = 19, cex = 1) | |
abline(0,1) | |
text(x=falsePositiveRates+0.05,y=truePositiveRates,names(truePositiveRates),cex=0.5) | |
############## | |
# EXAMINE MISSCLASSIFIED | |
############## | |
residuals <- data.frame( | |
"prob" = predict(models$RF,data$validation,type="prob"), | |
"true" = data$validation$target | |
) | |
residuals$dev <- residuals$true - residuals$prob.SME | |
residuals$fneg <- residuals$dev > 0 | |
residuals$fneg_strong <- residuals$dev > 0.4 | |
table(residuals$fneg_strong) | |
hist(residuals$dev) | |
misfits <- data$validation[residuals$fneg_strong,] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment