# dim(preds) vs. dim(predict(...)): what the hell is LightGBM doing?
library(lightgbm)
library(tidyverse)

rm(list = ls())
# We load the default iris dataset shipped with R, recode Species to {0, 1, 2},
# keep the first two species, and take two rows per species
data(iris)
iris = as_data_frame(iris) %>%
  mutate(Species = as.numeric(factor(Species)) - 1) %>%
  filter(Species < 2) %>%
  group_by(Species) %>%
  dplyr::slice(1:2) %>%
  ungroup()

x = as.matrix(iris %>% select(-Species))
y = iris %>% pull(Species)

dtrain <- lgb.Dataset(data = x, label = y)
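# Quick sanity check: after the filtering above, x is a 4 x 4 feature matrix
# and y holds the labels 0, 0, 1, 1
stopifnot(nrow(x) == 4, ncol(x) == 4, all(y == c(0, 0, 1, 1)))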
custom_multiclass_obj = function(preds, dtrain) {
  labels = getinfo(dtrain, "label")
  # print preds as they arrive from LightGBM
  print(data_frame(preds))
  # preds arrives as a flat vector; reshape it into a matrix with rows
  # corresponding to samples and columns corresponding to classes
  preds = matrix(preds, nrow = length(labels))
  # to prevent overflow, normalize preds by row
  preds = preds - apply(preds, 1, max)
  prob = exp(preds) / rowSums(exp(preds))
  # compute gradient: softmax probability minus one-hot label indicator
  grad = prob
  grad[cbind(1:length(labels), labels + 1)] = grad[cbind(1:length(labels), labels + 1)] - 1
  # compute hessian (approximation)
  hess = 2 * prob * (1 - prob)
  return(list(grad = grad, hess = hess))
}
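# Aside on the reshape above: R's matrix() fills column-major, so
# matrix(preds, nrow = length(labels)) assumes the flat preds vector is grouped
# by class (all class-0 scores first, then all class-1 scores). A toy
# illustration with made-up scores for two samples and two classes:
matrix(c(1, 2, 10, 20), nrow = 2)
#      [,1] [,2]
# [1,]    1   10   <- sample 1: class-0 score, class-1 score
# [2,]    2   20   <- sample 2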
# define custom metric (multiclass log loss)
custom_multiclass_metric = function(preds, dtrain) {
  labels = getinfo(dtrain, "label")
  preds = matrix(preds, nrow = length(labels))
  preds = preds - apply(preds, 1, max)
  prob = exp(preds) / rowSums(exp(preds))
  return(list(name = "error",
              value = -mean(log(prob[cbind(1:length(labels), labels + 1)])),
              higher_better = FALSE))
}
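# Note that subtracting the row-wise max does not change the probabilities
# (softmax is shift-invariant); it only guards against overflow in exp().
# A quick numeric check on an arbitrary score vector:
z = c(2, -1, 0.5)
all.equal(exp(z) / sum(exp(z)), exp(z - max(z)) / sum(exp(z - max(z))))
# should be TRUE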
# init_score needs length num_samples * num_class = 4 * 2 = 8
setinfo(dtrain, "init_score", rep(0, 8))
# Estimate model with nrounds = 2 and check out the predictions at the beginning
# of round 2 (the objective is also called at the beginning of round 1, where
# preds equals the zero init_score; that first print is omitted below):
model1 <- lgb.train(list(),
                    dtrain,
                    nrounds = 2,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
# # A tibble: 8 x 1
#    preds
#    <dbl>
# 1  0.333
# 2  0.333
# 3 -1.000
# 4  0.333
# 5 -0.333
# 6 -0.333
# 7  1.000
# 8 -0.333
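# Reading the tibble above (values copied from the printed output): the vector
# is grouped by class, not by sample. Entries 1-4 are the class-0 scores for
# samples 1-4 and entries 5-8 the class-1 scores:
preds_round2 = c(0.333, 0.333, -1.000, 0.333, -0.333, -0.333, 1.000, -0.333)
matrix(preds_round2, nrow = 4)  # rows = samples, columns = classes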
# Estimate model with nrounds = 1 and check out the final predictions; because
# the learning rate is equal to one, these should be identical to the
# predictions from above
model2 <- lgb.train(list(),
                    dtrain,
                    nrounds = 1,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
print(data_frame(predict(model2, x)))
# # A tibble: 8 x 1
#   `predict(model2, x)`
#   <dbl>
# 1  0.333
# 2 -0.333
# 3  0.333
# 4 -0.333
# 5 -1.000
# 6  1.000
# 7  0.333
# 8 -0.333
# Note that the order is wrong: it does not match the preds vector printed
# inside the custom objective. We need reshape = TRUE:
print(data_frame(reshape = predict(model2, x, reshape = TRUE) %>% as.vector()))
# # A tibble: 8 x 1
#   reshape
#   <dbl>
# 1  0.333
# 2  0.333
# 3 -1.000
# 4  0.333
# 5 -0.333
# 6 -0.333
# 7  1.000
# 8 -0.333
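# The two orderings are related by a transpose: predict() without reshape
# returns a sample-major vector (the class index varies fastest), while the
# flattened reshape = TRUE matrix, like the preds vector inside the custom
# objective, is class-major. A sketch, assuming num_class = 2 as above:
num_class = 2
v = predict(model2, x)  # s1_c0, s1_c1, s2_c0, s2_c1, ...
all.equal(as.vector(t(matrix(v, nrow = num_class))),
          predict(model2, x, reshape = TRUE) %>% as.vector())
# should be TRUE, given the two orderings printed above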
# Conclusion: the predictions that we obtain from R's predict function and the
# vector called `preds` inside the custom objective function are stored in
# different orders!
# This also affects how we deal with base margins! Base margins follow the
# logic of R's predict function.
# All of this is due to the lovely helper function RowFunctionFromDenseMatrix
# in the C API, which appears to be applied inconsistently.
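# A hypothetical convenience helper (not part of LightGBM) that converts the
# sample-major vector returned by predict() into the class-major order used by
# preds inside the custom objective:
to_class_major = function(pred_vec, num_class) {
  as.vector(t(matrix(pred_vec, nrow = num_class)))
}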
# Check base margins
# Estimate model with nrounds = 2 and check out the predictions at the beginning of round 2:
model1 <- lgb.train(list(),
                    dtrain,
                    nrounds = 2,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
predict(model1, data = x, num_iteration = 1, reshape = TRUE) %>% as.vector()
# [1] 0.3333333 0.3333333 -1.0000000 0.3333333 -0.3333333 -0.3333333 1.0000000 -0.3333333
# These are the round-1 predictions from above, rounded to three decimals and
# kept in the reshape = TRUE order:
base_margin = c(0.333, 0.333, -1.000, 0.333, -0.333, -0.333, 1.000, -0.333)
setinfo(dtrain, "init_score", base_margin)
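# Equivalently (up to the rounding above), the init_score could be taken
# straight from predict() with reshape = TRUE; left commented out so the
# rounded base_margin above is what actually gets used:
# setinfo(dtrain, "init_score",
#         predict(model1, data = x, num_iteration = 1, reshape = TRUE) %>% as.vector())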
# Estimate model with nrounds = 1 and check out the final predictions; because
# the learning rate is equal to one, these should be identical to the
# predictions from above, up to the rounding in base_margin
model2 <- lgb.train(list(),
                    dtrain,
                    nrounds = 1,
                    min_data = 1,
                    learning_rate = 1,
                    num_leaves = 2,
                    objective = custom_multiclass_obj,
                    eval = custom_multiclass_metric,
                    num_class = 2)
print(data_frame(predict(model1, x, reshape = TRUE) %>% as.vector()))
print(data_frame(predict(model2, x, reshape = TRUE) %>% as.vector() + base_margin))
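# Up to the rounding in base_margin, the two printed columns should agree; a
# loose tolerance absorbs the rounding error:
all.equal(predict(model1, x, reshape = TRUE) %>% as.vector(),
          predict(model2, x, reshape = TRUE) %>% as.vector() + base_margin,
          tolerance = 1e-2)
# should be TRUE if the base-margin logic described above is right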