Created
April 12, 2021 10:23
-
-
Save IyarLin/df8cdc6b277d957ea3df7ab9c8d88de4 to your computer and use it in GitHub Desktop.
dtplyr speed benchmarking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(data.table) | |
require(dtplyr) | |
require(dplyr) | |
N <- 1e8 | |
K <- 100 | |
set.seed(1) | |
DT <- data.table( | |
id1 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char) | |
id2 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char) | |
id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), # small groups (char) | |
id4 = sample(K, N, TRUE), # large groups (int) | |
id5 = sample(K, N, TRUE), # large groups (int) | |
id6 = sample(N / K, N, TRUE), # small groups (int) | |
v1 = sample(5, N, TRUE), # int in range [1,5] | |
v2 = sample(5, N, TRUE), # int in range [1,5] | |
v3 = sample(round(runif(100, max = 100), 4), N, TRUE) # numeric e.g. 23.5749 | |
) | |
q1a <- system.time(DT[, sum(v1), keyby = id1])[3] | |
q1b <- system.time(DT[, sum(v1), keyby = id1])[3] | |
q2a <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3] | |
q2b <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3] | |
q3a <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3] | |
q3b <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3] | |
q4a <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3] | |
q4b <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3] | |
q5a <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3] | |
q5b <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3] | |
data_table_results <- list( | |
q1a = q1a, q1b = q1b, | |
q2a = q2a, q2b = q2b, | |
q3a = q3a, q3b = q3b, | |
q4a = q4a, q4b = q4b, | |
q5a = q5a, q5b = q5b | |
) | |
DF <- data.frame( | |
stringsAsFactors = FALSE, | |
id1 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
id2 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), | |
id4 = sample(K, N, TRUE), | |
id5 = sample(K, N, TRUE), | |
id6 = sample(N / K, N, TRUE), | |
v1 = sample(5, N, TRUE), | |
v2 = sample(5, N, TRUE), | |
v3 = sample(round(runif(100, max = 100), 4), N, TRUE) | |
) | |
q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% | |
summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
dtplyr_results <- list( | |
q1a = q1a, q1b = q1b, | |
q2a = q2a, q2b = q2b, | |
q3a = q3a, q3b = q3b, | |
q4a = q4a, q4b = q4b, | |
q5a = q5a, q5b = q5b | |
) | |
DF <- data.frame( | |
stringsAsFactors = FALSE, | |
id1 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
id2 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), | |
id4 = sample(K, N, TRUE), | |
id5 = sample(K, N, TRUE), | |
id6 = sample(N / K, N, TRUE), | |
v1 = sample(5, N, TRUE), | |
v2 = sample(5, N, TRUE), | |
v3 = sample(round(runif(100, max = 100), 4), N, TRUE) | |
) | |
DF <- as.data.table(DF) | |
q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
summarise(sum(v1)) %>% as_tibble())[3] | |
q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% | |
summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
dt_dtplyr_results <- list( | |
q1a = q1a, q1b = q1b, | |
q2a = q2a, q2b = q2b, | |
q3a = q3a, q3b = q3b, | |
q4a = q4a, q4b = q4b, | |
q5a = q5a, q5b = q5b | |
) | |
DF <- data.frame( | |
stringsAsFactors = FALSE, | |
id1 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
id2 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), | |
id4 = sample(K, N, TRUE), | |
id5 = sample(K, N, TRUE), | |
id6 = sample(N / K, N, TRUE), | |
v1 = sample(5, N, TRUE), | |
v2 = sample(5, N, TRUE), | |
v3 = sample(round(runif(100, max = 100), 4), N, TRUE) | |
) | |
q1a <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
q1b <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
q2a <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
q2b <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
q3a <- system.time(DF %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
q3b <- system.time(DF %>% group_by(id3) %>% | |
summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
q4a <- system.time(DF %>% group_by(id4) %>% | |
summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
q4b <- system.time(DF %>% group_by(id4) %>% | |
summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
q5a <- system.time(DF %>% group_by(id6) %>% | |
summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
q5b <- system.time(DF %>% group_by(id6) %>% | |
summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
dplyr_results <- list( | |
q1a = q1a, q1b = q1b, | |
q2a = q2a, q2b = q2b, | |
q3a = q3a, q3b = q3b, | |
q4a = q4a, q4b = q4b, | |
q5a = q5a, q5b = q5b | |
) | |
results <- data.frame( | |
package = rep(c("data.table", "dtplyr", "dt_dtplyr", "dplyr"), each = 10), | |
query = rep(rep(c( | |
"Test 1: 100 ad hoc groups of 1,000,000 rows; result 100 X 2", | |
"Test 2: 10,000 ad hoc groups of 10,000 rows; result 10,000 X 3", | |
"Test 3: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 3", | |
"Test 4: 100 ad hoc groups of 1,000,000 rows; result 100 X 4", | |
"Test 5: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 4" | |
), | |
each = 2 | |
), 4), | |
run = rep(rep(1:2, 5), 4), | |
time = c( | |
unlist(data_table_results), unlist(dtplyr_results), | |
unlist(dt_dtplyr_results), unlist(dplyr_results) | |
) | |
) | |
first_run <- results %>% filter(run == 1) | |
second_run <- results %>% filter(run == 2) | |
first_run %>% ggplot(aes(time, package, fill = package)) + | |
geom_col(width = 0.5) + | |
geom_col(data = second_run, alpha = 0.5) + | |
facet_wrap(facets = vars(query), strip.position = "top", ncol = 1) + | |
theme_bw() + xlab("Time elapsed (seconds)") + | |
theme( | |
axis.text.y = element_blank(), | |
strip.text = element_text(size = 6), | |
text = element_text(size = 20), | |
axis.title.y = element_blank(), | |
plot.title = element_text(size = 10), | |
axis.ticks.y = element_blank(), | |
panel.grid.major.y = element_blank() | |
) + | |
ggtitle("Input table: 100,000,000 rows X 9 columns (4.9 Gb) - Random Order") + | |
scale_x_continuous(expand = c(0, 1.05)) | |
sessionInfo() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment