IyarLin · April 12, 2021 10:23
diff --git a/dtplyr_speed_benchmarking.R b/dtplyr_speed_benchmarking.R
 require(data.table)
 require(dtplyr)
 require(dplyr)

 N <- 1e8
 K <- 100
 set.seed(1)
 DT <- data.table(
  id1 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char)
  id2 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char)
  id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), # small groups (char)
  id4 = sample(K, N, TRUE), # large groups (int)
  id5 = sample(K, N, TRUE), # large groups (int)
  id6 = sample(N / K, N, TRUE), # small groups (int)
  v1 = sample(5, N, TRUE), # int in range [1,5]
  v2 = sample(5, N, TRUE), # int in range [1,5]
  v3 = sample(round(runif(100, max = 100), 4), N, TRUE) # numeric e.g. 23.5749
 )

 q1a <- system.time(DT[, sum(v1), keyby = id1])[3]
 q1b <- system.time(DT[, sum(v1), keyby = id1])[3]
 q2a <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3]
 q2b <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3]
 q3a <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3]
 q3b <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3]
 q4a <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3]
 q4b <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3]
 q5a <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3]
 q5b <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3]
 data_table_results <- list(
  q1a = q1a, q1b = q1b,
  q2a = q2a, q2b = q2b,
  q3a = q3a, q3b = q3b,
  q4a = q4a, q4b = q4b,
  q5a = q5a, q5b = q5b
 )

 DF <- data.frame(
  stringsAsFactors = FALSE,
  id1 = sample(sprintf("id%03d", 1:K), N, TRUE),
  id2 = sample(sprintf("id%03d", 1:K), N, TRUE),
  id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),
  id4 = sample(K, N, TRUE),
  id5 = sample(K, N, TRUE),
  id6 = sample(N / K, N, TRUE),
  v1 = sample(5, N, TRUE),
  v2 = sample(5, N, TRUE),
  v3 = sample(round(runif(100, max = 100), 4), N, TRUE)
 )

 q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
 q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>%
  summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
 q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
  summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
 q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
  summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
 q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
  summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]
 q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
  summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]

 dtplyr_results <- list(
  q1a = q1a, q1b = q1b,
  q2a = q2a, q2b = q2b,
  q3a = q3a, q3b = q3b,
  q4a = q4a, q4b = q4b,
  q5a = q5a, q5b = q5b
 )

 DF <- data.frame(
  stringsAsFactors = FALSE,
  id1 = sample(sprintf("id%03d", 1:K), N, TRUE),
  id2 = sample(sprintf("id%03d", 1:K), N, TRUE),
  id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),
  id4 = sample(K, N, TRUE),
  id5 = sample(K, N, TRUE),
  id6 = sample(N / K, N, TRUE),
  v1 = sample(5, N, TRUE),
  v2 = sample(5, N, TRUE),
  v3 = sample(round(runif(100, max = 100), 4), N, TRUE)
 )

 DF <- as.data.table(DF)

 q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
  summarise(sum(v1)) %>% as_tibble())[3]
 q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
 q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>%
  summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
 q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
  summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
 q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
  summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
 q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
  summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]
 q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
  summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]

 dt_dtplyr_results <- list(
  q1a = q1a, q1b = q1b,
  q2a = q2a, q2b = q2b,
  q3a = q3a, q3b = q3b,
  q4a = q4a, q4b = q4b,
  q5a = q5a, q5b = q5b
 )

 DF <- data.frame(
  stringsAsFactors = FALSE,
  id1 = sample(sprintf("id%03d", 1:K), N, TRUE),
  id2 = sample(sprintf("id%03d", 1:K), N, TRUE),
  id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),
  id4 = sample(K, N, TRUE),
  id5 = sample(K, N, TRUE),
  id6 = sample(N / K, N, TRUE),
  v1 = sample(5, N, TRUE),
  v2 = sample(5, N, TRUE),
  v3 = sample(round(runif(100, max = 100), 4), N, TRUE)
 )

 q1a <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3]
 q1b <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3]
 q2a <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3]
 q2b <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3]
 q3a <- system.time(DF %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
 q3b <- system.time(DF %>% group_by(id3) %>%
  summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
 q4a <- system.time(DF %>% group_by(id4) %>%
  summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
 q4b <- system.time(DF %>% group_by(id4) %>%
  summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
 q5a <- system.time(DF %>% group_by(id6) %>%
  summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]
 q5b <- system.time(DF %>% group_by(id6) %>%
  summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]

 dplyr_results <- list(
  q1a = q1a, q1b = q1b,
  q2a = q2a, q2b = q2b,
  q3a = q3a, q3b = q3b,
  q4a = q4a, q4b = q4b,
  q5a = q5a, q5b = q5b
 )

 results <- data.frame(
  package = rep(c("data.table", "dtplyr", "dt_dtplyr", "dplyr"), each = 10),
  query = rep(rep(c(
    "Test 1: 100 ad hoc groups of 1,000,000 rows; result 100 X 2",
    "Test 2: 10,000 ad hoc groups of 10,000 rows; result 10,000 X 3",
    "Test 3: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 3",
    "Test 4: 100 ad hoc groups of 1,000,000 rows; result 100 X 4",
    "Test 5: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 4"
  ),
  each = 2
  ), 4),
  run = rep(rep(1:2, 5), 4),
  time = c(
    unlist(data_table_results), unlist(dtplyr_results),
    unlist(dt_dtplyr_results), unlist(dplyr_results)
  )
 )

 first_run <- results %>% filter(run == 1)
 second_run <- results %>% filter(run == 2)

 first_run %>% ggplot(aes(time, package, fill = package)) +
  geom_col(width = 0.5) +
  geom_col(data = second_run, alpha = 0.5) +
  facet_wrap(facets = vars(query), strip.position = "top", ncol = 1) +
  theme_bw() + xlab("Time elapsed (seconds)") +
  theme(
    axis.text.y = element_blank(),
    strip.text = element_text(size = 6),
    text = element_text(size = 20),
    axis.title.y = element_blank(),
    plot.title = element_text(size = 10),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank()
  ) +
  ggtitle("Input table: 100,000,000 rows X 9 columns (4.9 Gb) - Random Order") +
  scale_x_continuous(expand = c(0, 1.05))

 sessionInfo()
	require(data.table)
	require(dtplyr)
	require(dplyr)

	N <- 1e8
	K <- 100
	set.seed(1)
	DT <- data.table(
	id1 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char)
	id2 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char)
	id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), # small groups (char)
	id4 = sample(K, N, TRUE), # large groups (int)
	id5 = sample(K, N, TRUE), # large groups (int)
	id6 = sample(N / K, N, TRUE), # small groups (int)
	v1 = sample(5, N, TRUE), # int in range [1,5]
	v2 = sample(5, N, TRUE), # int in range [1,5]
	v3 = sample(round(runif(100, max = 100), 4), N, TRUE) # numeric e.g. 23.5749
	)

	q1a <- system.time(DT[, sum(v1), keyby = id1])[3]
	q1b <- system.time(DT[, sum(v1), keyby = id1])[3]
	q2a <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3]
	q2b <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3]
	q3a <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3]
	q3b <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3]
	q4a <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3]
	q4b <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3]
	q5a <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3]
	q5b <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3]
	data_table_results <- list(
	q1a = q1a, q1b = q1b,
	q2a = q2a, q2b = q2b,
	q3a = q3a, q3b = q3b,
	q4a = q4a, q4b = q4b,
	q5a = q5a, q5b = q5b
	)

	DF <- data.frame(
	stringsAsFactors = FALSE,
	id1 = sample(sprintf("id%03d", 1:K), N, TRUE),
	id2 = sample(sprintf("id%03d", 1:K), N, TRUE),
	id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),
	id4 = sample(K, N, TRUE),
	id5 = sample(K, N, TRUE),
	id6 = sample(N / K, N, TRUE),
	v1 = sample(5, N, TRUE),
	v2 = sample(5, N, TRUE),
	v3 = sample(round(runif(100, max = 100), 4), N, TRUE)
	)

	q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
	q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>%
	summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
	q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
	summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
	q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
	summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
	q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
	summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]
	q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
	summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]

	dtplyr_results <- list(
	q1a = q1a, q1b = q1b,
	q2a = q2a, q2b = q2b,
	q3a = q3a, q3b = q3b,
	q4a = q4a, q4b = q4b,
	q5a = q5a, q5b = q5b
	)

	DF <- data.frame(
	stringsAsFactors = FALSE,
	id1 = sample(sprintf("id%03d", 1:K), N, TRUE),
	id2 = sample(sprintf("id%03d", 1:K), N, TRUE),
	id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),
	id4 = sample(K, N, TRUE),
	id5 = sample(K, N, TRUE),
	id6 = sample(N / K, N, TRUE),
	v1 = sample(5, N, TRUE),
	v2 = sample(5, N, TRUE),
	v3 = sample(round(runif(100, max = 100), 4), N, TRUE)
	)

	DF <- as.data.table(DF)

	q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>%
	summarise(sum(v1)) %>% as_tibble())[3]
	q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
	q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>%
	summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
	q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
	summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
	q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>%
	summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
	q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
	summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]
	q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>%
	summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]

	dt_dtplyr_results <- list(
	q1a = q1a, q1b = q1b,
	q2a = q2a, q2b = q2b,
	q3a = q3a, q3b = q3b,
	q4a = q4a, q4b = q4b,
	q5a = q5a, q5b = q5b
	)

	DF <- data.frame(
	stringsAsFactors = FALSE,
	id1 = sample(sprintf("id%03d", 1:K), N, TRUE),
	id2 = sample(sprintf("id%03d", 1:K), N, TRUE),
	id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),
	id4 = sample(K, N, TRUE),
	id5 = sample(K, N, TRUE),
	id6 = sample(N / K, N, TRUE),
	v1 = sample(5, N, TRUE),
	v2 = sample(5, N, TRUE),
	v3 = sample(round(runif(100, max = 100), 4), N, TRUE)
	)

	q1a <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3]
	q1b <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3]
	q2a <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3]
	q2b <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3]
	q3a <- system.time(DF %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
	q3b <- system.time(DF %>% group_by(id3) %>%
	summarise(sum(v1), mean(v3)) %>% as_tibble())[3]
	q4a <- system.time(DF %>% group_by(id4) %>%
	summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
	q4b <- system.time(DF %>% group_by(id4) %>%
	summarise_at(vars(v1:v3), mean) %>% as_tibble())[3]
	q5a <- system.time(DF %>% group_by(id6) %>%
	summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]
	q5b <- system.time(DF %>% group_by(id6) %>%
	summarise_at(vars(v1:v3), sum) %>% as_tibble())[3]

	dplyr_results <- list(
	q1a = q1a, q1b = q1b,
	q2a = q2a, q2b = q2b,
	q3a = q3a, q3b = q3b,
	q4a = q4a, q4b = q4b,
	q5a = q5a, q5b = q5b
	)

	results <- data.frame(
	package = rep(c("data.table", "dtplyr", "dt_dtplyr", "dplyr"), each = 10),
	query = rep(rep(c(
	"Test 1: 100 ad hoc groups of 1,000,000 rows; result 100 X 2",
	"Test 2: 10,000 ad hoc groups of 10,000 rows; result 10,000 X 3",
	"Test 3: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 3",
	"Test 4: 100 ad hoc groups of 1,000,000 rows; result 100 X 4",
	"Test 5: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 4"
	),
	each = 2
	), 4),
	run = rep(rep(1:2, 5), 4),
	time = c(
	unlist(data_table_results), unlist(dtplyr_results),
	unlist(dt_dtplyr_results), unlist(dplyr_results)
	)
	)

	first_run <- results %>% filter(run == 1)
	second_run <- results %>% filter(run == 2)

	first_run %>% ggplot(aes(time, package, fill = package)) +
	geom_col(width = 0.5) +
	geom_col(data = second_run, alpha = 0.5) +
	facet_wrap(facets = vars(query), strip.position = "top", ncol = 1) +
	theme_bw() + xlab("Time elapsed (seconds)") +
	theme(
	axis.text.y = element_blank(),
	strip.text = element_text(size = 6),
	text = element_text(size = 20),
	axis.title.y = element_blank(),
	plot.title = element_text(size = 10),
	axis.ticks.y = element_blank(),
	panel.grid.major.y = element_blank()
	) +
	ggtitle("Input table: 100,000,000 rows X 9 columns (4.9 Gb) - Random Order") +
	scale_x_continuous(expand = c(0, 1.05))

	sessionInfo()