Created
July 22, 2021 14:10
-
-
Save marpontes/c58ccbd0b542ab874e55891085c42f90 to your computer and use it in GitHub Desktop.
Bash function that overcomes the limit of 32 files that `gsutil compose` can handle.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Based on:
# - https://stackoverflow.com/questions/66482517/how-to-append-more-than-33-files-in-a-gcloud-bucket#answer-66484002

# Command (including its sub-command) that squish runs to merge objects.
# It is invoked as: <command> <source>... <destination>
GSUTIL="gsutil compose"
# Maximum number of source objects handed to a single compose invocation.
BATCH_SIZE="32"
# Prefix under which squish writes its intermediate composite objects.
BUCKET_BASE="gs://my-shiny-bucket/path/to/intermediate/composites"
# Will compose files in groups of maximum 32 | |
# Ordered so that the last composite will contain all the previous files | |
# and will output to the last original parameter | |
function squish() { | |
LST=("$@") | |
LEN=${#LST[@]} | |
if [ "${LEN}" -le "1" ]; then | |
# Empty array; nothing to do | |
return 0 | |
fi | |
# Only unique for this configuration; be careful | |
COMPOSITE=$(printf "${BUCKET_BASE}/composite-%04d" ${LEN}) | |
if [ "${LEN}" -le "${BATCH_SIZE}" ]; then | |
# Batch can be composed with one command | |
echo "${GSUTIL} ${LST[@]}" | |
${GSUTIL} ${LST[@]} | |
return 0 | |
fi | |
# Compose 1st batch of files | |
# NB Provide start:size | |
echo "${GSUTIL} ${LST[@]:0:${BATCH_SIZE}} ${COMPOSITE}" | |
${GSUTIL} ${LST[@]:0:${BATCH_SIZE}} ${COMPOSITE} | |
# Remove batch from LST | |
# NB Provide start (to end is implied) | |
REM=${LST[@]:${BATCH_SIZE}} | |
# Prepend composite from above batch to the next run | |
NXT=(${COMPOSITE} ${REM[@]}) | |
squish "${NXT[@]}" | |
} | |
# Example run. The "..." is a placeholder for the objects in between; replace
# it with the real object names, since it is passed to squish literally.
# The last argument is the destination object.
squish gs://bucket/key/file1.csv ... gs://bucket/key/file50.csv gs://bucket/key/merged.csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Using a specific filename to produce the final output, one could simply run a script like this to remove the intermediate files: