Last active
October 27, 2020 13:13
-
-
Save noamross/1347b90da474be0b7e68308ea7ee7d10 to your computer and use it in GitHub Desktop.
A quick greedy algorithm to partition unequal-sized groups into near-equal shards
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Function to partition unequal sized groups into shards of similar size. | |
#' | |
#' Based on the greedy algorithm described in [The Wikipedia article on the | |
#' the partitioning problem](https://en.wikipedia.org/wiki/Partition_problem#The_greedy_algorithm) | |
#' | |
#' @param groups_vector An integer vector of group ids, such as a group ID | |
#' column in a data frame | |
#' @param n_shards The number of shards to split groups up into | |
#' @examples | |
#' n_groups <- 200 | |
#' groups_vector <- sample.int(n_groups, size = 3000000, | |
#' replace = TRUE, prob = runif(n_groups)) | |
#' time <- system.time(p <- partition(groups_vector, n_shards = 10)) | |
#' p$shard_sums | |
#' p$assigned_shards | |
#' time | |
#' \dontrun{\donttest{ | |
#' # Add a shard column to a data frame with a group integer column with | |
#' dplyr::mutate(df, shard = partition(group)$assigned_shards[group]) | |
#' }} | |
partition <- function(groups_vector, n_shards) { | |
stopifnot(is.integer(groups_vector)) | |
group_sizes <- sort(table(groups_vector), decreasing = TRUE) | |
n_groups <- length(group_sizes) | |
stopifnot(n_groups > n_shards) | |
assigned_shards <- rep(NA_integer_, length(group_sizes)) | |
assigned_shards[1:n_shards] <- 1:n_shards | |
shard_sums <- as.vector(group_sizes[1:n_shards]) #drop names to avoid confusion | |
for (i in (n_shards + 1):n_groups) { | |
z <- which.min(shard_sums) | |
shard_sums[z] <- shard_sums[z] + group_sizes[i] | |
assigned_shards[i] <- z | |
} | |
return(list( | |
shard_sums = shard_sums, | |
assigned_shards = assigned_shards | |
)) | |
} | |
n_groups <- 200 | |
groups_vector <- sample.int(n_groups, size = 3000000, | |
replace = TRUE, prob = runif(n_groups)) | |
time <- system.time(p <- partition(groups_vector, n_shards = 10)) | |
p$shard_sums | |
#> [1] 300429 299933 299636 299789 299899 299760 299843 300145 300460 300106 | |
p$assigned_shards | |
#> [1] 1 2 3 4 5 6 7 8 9 10 10 9 8 7 6 5 4 3 2 1 3 4 5 6 2 | |
#> [26] 10 7 1 9 8 8 3 9 1 4 5 7 2 10 6 6 10 2 7 5 4 8 1 9 3 | |
#> [51] 3 9 1 4 8 7 5 6 2 10 10 2 6 1 5 7 9 8 4 3 3 4 8 9 10 | |
#> [76] 2 7 1 6 5 3 4 8 9 6 1 7 10 5 2 2 5 10 7 1 6 9 8 3 4 | |
#> [101] 1 7 4 10 5 2 6 9 3 8 8 3 9 6 2 5 10 4 7 1 6 4 3 10 7 | |
#> [126] 2 9 5 1 8 8 5 1 7 2 9 10 3 4 6 6 4 3 10 9 2 8 7 5 1 | |
#> [151] 10 9 1 5 7 6 4 3 2 8 3 4 6 8 7 2 5 1 9 10 9 10 1 5 2 | |
#> [176] 8 7 6 3 4 4 3 6 7 8 2 5 1 9 10 9 1 8 10 2 5 7 4 6 3 | |
time | |
#> user system elapsed | |
#> 0.431 0.079 0.510 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment