Skip to content

Instantly share code, notes, and snippets.

@IyarLin
Created April 12, 2021 10:23
Show Gist options
  • Save IyarLin/df8cdc6b277d957ea3df7ab9c8d88de4 to your computer and use it in GitHub Desktop.
Save IyarLin/df8cdc6b277d957ea3df7ab9c8d88de4 to your computer and use it in GitHub Desktop.
dtplyr speed benchmarking
require(data.table)
require(dtplyr)
require(dplyr)
# ggplot2 is needed by the plotting code at the end of the script; library()
# stops immediately if it is missing, before the long benchmark has been run.
library(ggplot2)
# Benchmark dimensions: N rows in total; K is the number of distinct values
# in the "large group" key columns.
N <- 1e8
K <- 100
# Seed the RNG so the generated table is reproducible.
set.seed(1)
# Input table: six grouping keys (id1-id6) and three value columns (v1-v3).
# Every column is drawn with replacement.
DT <- data.table(
  # large groups (char)
  id1 = sample(sprintf("id%03d", seq_len(K)), size = N, replace = TRUE),
  # large groups (char)
  id2 = sample(sprintf("id%03d", seq_len(K)), size = N, replace = TRUE),
  # small groups (char)
  id3 = sample(sprintf("id%010d", seq_len(N / K)), size = N, replace = TRUE),
  # large groups (int)
  id4 = sample(K, size = N, replace = TRUE),
  # large groups (int)
  id5 = sample(K, size = N, replace = TRUE),
  # small groups (int)
  id6 = sample(N / K, size = N, replace = TRUE),
  # int in range [1,5]
  v1 = sample(5, size = N, replace = TRUE),
  # int in range [1,5]
  v2 = sample(5, size = N, replace = TRUE),
  # numeric e.g. 23.5749, drawn from a pool of 100 rounded uniform values
  v3 = sample(
    round(runif(n = 100, max = 100), digits = 4),
    size = N,
    replace = TRUE
  )
)
# Time five grouped aggregations on the data.table `DT`. Each query is run
# twice back to back: suffix "a" is the first run, "b" the second.
# Element [3] of system.time() is the elapsed (wall-clock) time in seconds.
#
# The timings are written straight into the result list instead of being
# staged in q1a..q5b globals that every benchmark section reuses, so a partial
# re-run or a failed query cannot silently record a stale timing left behind
# by another package. list() evaluates its arguments in order, so the queries
# still run in the sequence q1a, q1b, ..., q5b.
data_table_results <- list(
  # Query 1: sum of v1 by id1.
  q1a = system.time(DT[, sum(v1), keyby = id1])[3],
  q1b = system.time(DT[, sum(v1), keyby = id1])[3],
  # Query 2: sum of v1 by id1 and id2.
  q2a = system.time(DT[, sum(v1), keyby = .(id1, id2)])[3],
  q2b = system.time(DT[, sum(v1), keyby = .(id1, id2)])[3],
  # Query 3: sum of v1 and mean of v3 by id3.
  q3a = system.time(DT[, .(sum(v1), mean(v3)), keyby = id3])[3],
  q3b = system.time(DT[, .(sum(v1), mean(v3)), keyby = id3])[3],
  # Query 4: mean of each value column by id4. The columns are selected by
  # name rather than by position (7:9) so the query does not depend on the
  # column order of `DT`.
  q4a = system.time(
    DT[, lapply(.SD, mean), keyby = id4, .SDcols = c("v1", "v2", "v3")]
  )[3],
  q4b = system.time(
    DT[, lapply(.SD, mean), keyby = id4, .SDcols = c("v1", "v2", "v3")]
  )[3],
  # Query 5: sum of each value column by id6.
  q5a = system.time(
    DT[, lapply(.SD, sum), keyby = id6, .SDcols = c("v1", "v2", "v3")]
  )[3],
  q5b = system.time(
    DT[, lapply(.SD, sum), keyby = id6, .SDcols = c("v1", "v2", "v3")]
  )[3]
)
# Plain data.frame input for the dtplyr benchmark.
# The RNG is re-seeded with the seed used when `DT` was built, and the columns
# are drawn in the same order, so this section is timed on the same values as
# the data.table section instead of on a different random draw. It also makes
# the section reproducible when it is run on its own.
set.seed(1)
DF <- data.frame(
  # Keep the id columns as character vectors (matters on R < 4.0).
  stringsAsFactors = FALSE,
  id1 = sample(sprintf("id%03d", 1:K), size = N, replace = TRUE),
  id2 = sample(sprintf("id%03d", 1:K), size = N, replace = TRUE),
  id3 = sample(sprintf("id%010d", 1:(N / K)), size = N, replace = TRUE),
  id4 = sample(K, size = N, replace = TRUE),
  id5 = sample(K, size = N, replace = TRUE),
  id6 = sample(N / K, size = N, replace = TRUE),
  v1 = sample(5, size = N, replace = TRUE),
  v2 = sample(5, size = N, replace = TRUE),
  v3 = sample(round(runif(100, max = 100), 4), size = N, replace = TRUE)
)
# Time the five grouped aggregations through dtplyr, starting from the plain
# data.frame `DF`. lazy_dt() is called inside each timed expression, so the
# conversion of `DF` is part of the measured time, and as_tibble() at the end
# of each pipeline forces the lazy query to be computed.
# Each query is run twice back to back ("a" = first run, "b" = second run);
# element [3] of system.time() is the elapsed time in seconds.
#
# The timings go straight into the result list rather than through q1a..q5b
# globals shared with the other sections, so a partial re-run or a failed
# query cannot record a stale timing from another package.
# summarise_at() is kept as in the original benchmark so the timed expression
# itself is unchanged.
dtplyr_results <- list(
  # Query 1: sum of v1 by id1.
  q1a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  q1b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  # Query 2: sum of v1 by id1 and id2.
  q2a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1, id2) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  q2b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1, id2) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  # Query 3: sum of v1 and mean of v3 by id3.
  q3a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id3) %>%
      summarise(sum(v1), mean(v3)) %>%
      as_tibble()
  )[3],
  q3b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id3) %>%
      summarise(sum(v1), mean(v3)) %>%
      as_tibble()
  )[3],
  # Query 4: mean of v1, v2 and v3 by id4.
  q4a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id4) %>%
      summarise_at(vars(v1:v3), mean) %>%
      as_tibble()
  )[3],
  q4b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id4) %>%
      summarise_at(vars(v1:v3), mean) %>%
      as_tibble()
  )[3],
  # Query 5: sum of v1, v2 and v3 by id6.
  q5a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id6) %>%
      summarise_at(vars(v1:v3), sum) %>%
      as_tibble()
  )[3],
  q5b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id6) %>%
      summarise_at(vars(v1:v3), sum) %>%
      as_tibble()
  )[3]
)
# Input for the "dt_dtplyr" benchmark: the same columns as before, but handed
# to dtplyr as a data.table so the timed queries do not include a
# data.frame -> data.table conversion.
# The RNG is re-seeded with the seed used when `DT` was built, and the columns
# are drawn in the same order, so this section is timed on the same values
# instead of on a different random draw.
set.seed(1)
DF <- data.frame(
  # Keep the id columns as character vectors (matters on R < 4.0).
  stringsAsFactors = FALSE,
  id1 = sample(sprintf("id%03d", 1:K), size = N, replace = TRUE),
  id2 = sample(sprintf("id%03d", 1:K), size = N, replace = TRUE),
  id3 = sample(sprintf("id%010d", 1:(N / K)), size = N, replace = TRUE),
  id4 = sample(K, size = N, replace = TRUE),
  id5 = sample(K, size = N, replace = TRUE),
  id6 = sample(N / K, size = N, replace = TRUE),
  v1 = sample(5, size = N, replace = TRUE),
  v2 = sample(5, size = N, replace = TRUE),
  v3 = sample(round(runif(100, max = 100), 4), size = N, replace = TRUE)
)
# setDT() converts `DF` to a data.table by reference, avoiding the additional
# full copy of the table that as.data.table() would allocate.
setDT(DF)
# Time the five grouped aggregations through dtplyr when the input `DF` is
# already a data.table (it is converted just before this section).
# as_tibble() at the end of each pipeline forces the lazy query to be
# computed. Each query is run twice back to back ("a" = first run,
# "b" = second run); element [3] of system.time() is the elapsed time in
# seconds.
#
# The timings go straight into the result list rather than through q1a..q5b
# globals shared with the other sections, so a partial re-run or a failed
# query cannot record a stale timing from another package.
# summarise_at() is kept as in the original benchmark so the timed expression
# itself is unchanged.
dt_dtplyr_results <- list(
  # Query 1: sum of v1 by id1.
  q1a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  q1b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  # Query 2: sum of v1 by id1 and id2.
  q2a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1, id2) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  q2b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id1, id2) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  # Query 3: sum of v1 and mean of v3 by id3.
  q3a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id3) %>%
      summarise(sum(v1), mean(v3)) %>%
      as_tibble()
  )[3],
  q3b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id3) %>%
      summarise(sum(v1), mean(v3)) %>%
      as_tibble()
  )[3],
  # Query 4: mean of v1, v2 and v3 by id4.
  q4a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id4) %>%
      summarise_at(vars(v1:v3), mean) %>%
      as_tibble()
  )[3],
  q4b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id4) %>%
      summarise_at(vars(v1:v3), mean) %>%
      as_tibble()
  )[3],
  # Query 5: sum of v1, v2 and v3 by id6.
  q5a = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id6) %>%
      summarise_at(vars(v1:v3), sum) %>%
      as_tibble()
  )[3],
  q5b = system.time(
    DF %>%
      lazy_dt() %>%
      group_by(id6) %>%
      summarise_at(vars(v1:v3), sum) %>%
      as_tibble()
  )[3]
)
# Plain data.frame input for the pure-dplyr benchmark.
# The RNG is re-seeded with the seed used when `DT` was built, and the columns
# are drawn in the same order, so this section is timed on the same values as
# the data.table section instead of on a different random draw. It also makes
# the section reproducible when it is run on its own.
set.seed(1)
DF <- data.frame(
  # Keep the id columns as character vectors (matters on R < 4.0).
  stringsAsFactors = FALSE,
  id1 = sample(sprintf("id%03d", 1:K), size = N, replace = TRUE),
  id2 = sample(sprintf("id%03d", 1:K), size = N, replace = TRUE),
  id3 = sample(sprintf("id%010d", 1:(N / K)), size = N, replace = TRUE),
  id4 = sample(K, size = N, replace = TRUE),
  id5 = sample(K, size = N, replace = TRUE),
  id6 = sample(N / K, size = N, replace = TRUE),
  v1 = sample(5, size = N, replace = TRUE),
  v2 = sample(5, size = N, replace = TRUE),
  v3 = sample(round(runif(100, max = 100), 4), size = N, replace = TRUE)
)
# Time the five grouped aggregations with plain dplyr on the data.frame `DF`
# (no lazy_dt() step). Each query is run twice back to back ("a" = first run,
# "b" = second run); element [3] of system.time() is the elapsed time in
# seconds.
#
# The timings go straight into the result list rather than through q1a..q5b
# globals shared with the other sections, so a partial re-run or a failed
# query cannot record a stale timing from another package.
# summarise_at() and the trailing as_tibble() are kept as in the original
# benchmark so the timed expressions themselves are unchanged.
dplyr_results <- list(
  # Query 1: sum of v1 by id1.
  q1a = system.time(
    DF %>%
      group_by(id1) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  q1b = system.time(
    DF %>%
      group_by(id1) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  # Query 2: sum of v1 by id1 and id2.
  q2a = system.time(
    DF %>%
      group_by(id1, id2) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  q2b = system.time(
    DF %>%
      group_by(id1, id2) %>%
      summarise(sum(v1)) %>%
      as_tibble()
  )[3],
  # Query 3: sum of v1 and mean of v3 by id3.
  q3a = system.time(
    DF %>%
      group_by(id3) %>%
      summarise(sum(v1), mean(v3)) %>%
      as_tibble()
  )[3],
  q3b = system.time(
    DF %>%
      group_by(id3) %>%
      summarise(sum(v1), mean(v3)) %>%
      as_tibble()
  )[3],
  # Query 4: mean of v1, v2 and v3 by id4.
  q4a = system.time(
    DF %>%
      group_by(id4) %>%
      summarise_at(vars(v1:v3), mean) %>%
      as_tibble()
  )[3],
  q4b = system.time(
    DF %>%
      group_by(id4) %>%
      summarise_at(vars(v1:v3), mean) %>%
      as_tibble()
  )[3],
  # Query 5: sum of v1, v2 and v3 by id6.
  q5a = system.time(
    DF %>%
      group_by(id6) %>%
      summarise_at(vars(v1:v3), sum) %>%
      as_tibble()
  )[3],
  q5b = system.time(
    DF %>%
      group_by(id6) %>%
      summarise_at(vars(v1:v3), sum) %>%
      as_tibble()
  )[3]
)
# Collect all timings into one long data frame: one row per
# (package, query, run). Rows are ordered package by package, and within a
# package by query with the two runs adjacent, which matches the order of the
# entries in the four *_results lists.
package_names <- c("data.table", "dtplyr", "dt_dtplyr", "dplyr")
query_labels <- c(
  "Test 1: 100 ad hoc groups of 1,000,000 rows; result 100 X 2",
  "Test 2: 10,000 ad hoc groups of 10,000 rows; result 10,000 X 3",
  "Test 3: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 3",
  "Test 4: 100 ad hoc groups of 1,000,000 rows; result 100 X 4",
  "Test 5: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 4"
)
run_ids <- 1:2
# Every package contributes length(query_labels) * length(run_ids) timings.
rows_per_package <- length(query_labels) * length(run_ids)
results <- data.frame(
  package = rep(package_names, each = rows_per_package),
  query = rep(
    rep(query_labels, each = length(run_ids)),
    times = length(package_names)
  ),
  run = rep(
    rep(run_ids, times = length(query_labels)),
    times = length(package_names)
  ),
  time = unlist(c(
    data_table_results, dtplyr_results,
    dt_dtplyr_results, dplyr_results
  ))
)
# Plot the elapsed time per package, one facet per query. The first run is
# drawn as a narrow opaque bar and the second run as a wider, semi-transparent
# bar on top of it.
first_run <- results %>% filter(run == 1)
second_run <- results %>% filter(run == 2)
timing_plot <- ggplot(first_run, aes(time, package, fill = package)) +
  geom_col(width = 0.5) +
  geom_col(data = second_run, alpha = 0.5) +
  facet_wrap(facets = vars(query), strip.position = "top", ncol = 1) +
  theme_bw() +
  xlab("Time elapsed (seconds)") +
  theme(
    # Packages are identified by fill colour, so the y axis carries no labels.
    axis.text.y = element_blank(),
    strip.text = element_text(size = 6),
    text = element_text(size = 20),
    axis.title.y = element_blank(),
    plot.title = element_text(size = 10),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank()
  ) +
  ggtitle(
    "Input table: 100,000,000 rows X 9 columns (4.9 Gb) - Random Order"
  ) +
  scale_x_continuous(expand = c(0, 1.05))
# Print explicitly: a top-level ggplot object is only drawn through
# auto-printing, which does not happen when the script is run via source().
print(timing_plot)
# Record the R and package versions used for the benchmark; printed explicitly
# for the same reason as the plot.
print(sessionInfo())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment