s5cmd sync script for very large buckets with little to no RAM

Low-Memory S3 Backup Script

Why I wrote this

I needed to back up a large S3-compatible bucket on a small VPS without running out of memory.

While tools like s5cmd sync work very well in many cases, they can struggle with extremely large buckets when memory is limited. I wanted something that:

  • stays within a predictable memory limit
  • makes its decisions on disk rather than in RAM
  • can be inspected and resumed at any stage

This script is the result of that work.

Who this is for

This script is intended for:

  • large S3-compatible buckets
  • low-resource servers (VPS, small instances)
  • people who prefer transparent, step-by-step tooling

What it does (high level)

At a high level, the script:

  1. Lists every file in the bucket
  2. Writes that listing to disk
  3. Builds a clean list of remote files and sizes
  4. Builds a clean list of local files and sizes
  5. Compares the two lists
  6. Decides which files need copying
  7. Creates only the required directories
  8. Downloads only missing or changed files

All comparison work happens on disk using plain text files.
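
The core of steps 1 to 7, reduced to standard tools, is a sort and a join over two tab-separated manifests. Here is a minimal sketch, assuming a bucket called my-bucket backed up under $HOME; the full script below adds progress output, regeneration flags and error handling:

# 1-2. List the bucket and stream the raw JSON listing to disk
s5cmd --json ls "s3://my-bucket/*" > listing.json

# 3. Remote manifest: relative key and size, sorted for join
jq -r 'select(.type=="file") | (.key | sub("^s3://my-bucket/"; "")) + "\t" + (.size|tostring)' listing.json \
  | LC_ALL=C sort -t $'\t' -k1,1 > remote.tsv

# 4. Local manifest: relative path and size, sorted for join
(cd "$HOME/my-bucket" && find . -type f -printf '%P\t%s\n') | LC_ALL=C sort -t $'\t' -k1,1 > local.tsv

# 5. Left join: every remote file, with its local size if present
join -t $'\t' -a 1 -e '' -o 1.1,1.2,2.2 remote.tsv local.tsv > joined.tsv

Every remote object ends up in joined.tsv with its remote size and, where the file already exists, its local size; anything missing or mismatched becomes a copy command.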

Why not just use s5cmd sync

s5cmd sync is excellent and should be your first choice when it works.

However, on very large buckets and small machines, it can consume large amounts of memory while building and holding object state. In my case, that made it unusable.

In one reported case, a user observed s5cmd sync growing to over 76 GB of RAM after running for several hours on a very large bucket. On a small VPS, that is not survivable.

This script avoids that problem by:

  • never holding the full object list in memory
  • streaming listings to disk
  • using disk-backed sort and join operations

The tradeoff is speed and complexity, but the memory usage remains bounded.
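
The heavy lifting is delegated to GNU sort and join, which operate on files rather than in-memory structures. If the intermediate files get very large, sort's buffer size and temporary directory can also be pinned down explicitly; this is an optional tweak and not something the script itself does:

# Cap sort's memory at 256 MB and spill to /var/tmp (hypothetical values)
LC_ALL=C sort -S 256M -T /var/tmp -t $'\t' -k1,1 my-bucket.remote.tsv -o my-bucket.remote.tsv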

What this script does not do

  • It does not delete local files when they are removed from S3
  • It does not perform checksum verification
  • It does not attempt to be a perfect mirror

It is intentionally conservative and copy-only.

Tested environment

This script has been tested on:

  • 2 core VPS
  • 2 GB RAM

Against a bucket with:

  • approximately 1.5 TB of data
  • approximately 2.3 million files

Observed behaviour:

  • peak memory usage of roughly 530 MB
  • around 6 minutes to build listings and copy decisions before downloads began (steps 1 to 7)

Requirements

  • Linux
  • bash
  • s5cmd (with JSON output support)
  • jq
  • sufficient disk space for intermediate files
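
A quick way to confirm the tools are available before the first run (a convenience check, not part of the script):

# Report any missing tool by name
for tool in s5cmd jq awk sort join; do
  command -v "$tool" >/dev/null || echo "missing: $tool"
done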

How to use this script

Basic usage

Edit the script first and set your AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION (for S3-compatible providers such as DigitalOcean Spaces, AWS_REGION holds the endpoint host used to build the endpoint URL).

Run the script and pass one or more bucket names. Multiple buckets can be provided in a single run.

./backup.sh my-bucket
./backup.sh bucket-one bucket-two bucket-three

Each bucket is processed independently and backed up into its own directory.


Destination directory

By default, files are downloaded into:

$HOME/<bucket-name>/

You can override this by setting DEST_ROOT:

DEST_ROOT=/mnt/backups ./backup.sh my-bucket

All buckets will be downloaded under this directory, each in its own subdirectory.
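
For example, with DEST_ROOT=/mnt/backups and two buckets, the resulting layout is:

/mnt/backups/bucket-one/...
/mnt/backups/bucket-two/...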


Configuration method

All runtime options (destination, regeneration flags, concurrency) are controlled via environment variables.
Apart from the credentials block at the top of the file, the script itself does not need to be edited.


Regeneration flags

These flags control which steps of the pipeline are regenerated.
All flags default to 1 (enabled).

Variable               Description
REGEN_LISTING          Re-download the bucket listing from S3 (JSON format)
REGEN_REMOTE_MANIFEST  Rebuild the remote file list (path and size)
REGEN_LOCAL_MANIFEST   Re-scan the local filesystem
REGEN_JOINED           Rebuild the remote vs local comparison
REGEN_OUTPUTS          Regenerate the copy commands and directory list

Examples

Reuse an existing bucket listing:

REGEN_LISTING=0 ./backup.sh my-bucket

Reuse listing and remote manifest, but re-scan local files:

REGEN_LISTING=0 REGEN_REMOTE_MANIFEST=0 ./backup.sh my-bucket

Only rerun the download step using existing command files:

REGEN_LISTING=0 REGEN_REMOTE_MANIFEST=0 REGEN_LOCAL_MANIFEST=0 REGEN_JOINED=0 REGEN_OUTPUTS=0 ./backup.sh my-bucket

Download concurrency

Control how many parallel downloads s5cmd performs:

S5_NUMWORKERS=2 ./backup.sh my-bucket

Lower values reduce memory usage and system pressure.
Higher values increase download speed but use more resources.

The default is set to 6.
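
Internally the value is passed straight to s5cmd's --numworkers flag when the generated command file is executed:

s5cmd --numworkers "$S5_NUMWORKERS" run my-bucket.commands.txt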


Inspecting what will happen

Before any files are downloaded, the script generates:

  • <bucket>.commands.txt – exact copy commands that will be executed
  • <bucket>.dirs.txt – directories that will be created

These files make the process predictable and auditable.
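
Each line in <bucket>.commands.txt is a plain s5cmd cp command and each line in <bucket>.dirs.txt is a destination directory. The entries look roughly like this (bucket name and paths are illustrative):

head -n 1 my-bucket.commands.txt
cp "s3://my-bucket/photos/2024/img_0001.jpg" "/home/user/my-bucket/photos/2024/img_0001.jpg"

head -n 1 my-bucket.dirs.txt
/home/user/my-bucket/photos/2024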


Resuming and retrying

The script is safe to rerun.

  • It will not re-download files that already exist with the correct size
  • You can reuse previous data using regeneration flags
  • Local files are never deleted

Notes

  • The script never deletes local files
  • Comparison is size-based only (comparing modification times was far too taxing)
  • Deleted or renamed remote files are ignored on subsequent runs

The goal is safe, low-memory, copy-only backups with full visibility.

#!/usr/bin/env bash
set -euo pipefail
# Credentials and endpoint. For S3-compatible providers, AWS_REGION holds the
# endpoint host and is used to build the endpoint URLs below.
export AWS_ACCESS_KEY_ID=''
export AWS_SECRET_ACCESS_KEY=''
export AWS_PROFILE='default'
export AWS_REGION='lon1.digitaloceanspaces.com'
export AWS_ENDPOINT_URL="https://${AWS_REGION}"
export S3_ENDPOINT_URL="$AWS_ENDPOINT_URL"
DEST_ROOT="${DEST_ROOT:-$HOME}"
mkdir -p "$DEST_ROOT"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Regeneration switches (set to 0 to reuse existing files)
REGEN_LISTING="${REGEN_LISTING:-1}"
REGEN_REMOTE_MANIFEST="${REGEN_REMOTE_MANIFEST:-1}"
REGEN_LOCAL_MANIFEST="${REGEN_LOCAL_MANIFEST:-1}"
REGEN_JOINED="${REGEN_JOINED:-1}"
REGEN_OUTPUTS="${REGEN_OUTPUTS:-1}"
if [ "$#" -eq 0 ]; then
echo "Usage: $0 bucket1 bucket2 ..."
exit 1
fi
buckets=()
for b in "$@"; do
buckets+=("s3://$b")
done
S5_NUMWORKERS="${S5_NUMWORKERS:-6}"
failed=()
for bucket in "${buckets[@]}"; do
  bucket_name="${bucket#s3://}"
  bucket_dir="$DEST_ROOT/$bucket_name"
  mkdir -p "$bucket_dir"
  listing_json="$SCRIPT_DIR/${bucket_name}.listing.json"
  remote_tsv="$SCRIPT_DIR/${bucket_name}.remote.tsv"
  local_tsv="$SCRIPT_DIR/${bucket_name}.local.tsv"
  joined_tsv="$SCRIPT_DIR/${bucket_name}.joined.tsv"
  dirs_file="$SCRIPT_DIR/${bucket_name}.dirs.txt"
  commands_file="$SCRIPT_DIR/${bucket_name}.commands.txt"
  # ---- listing (JSON) with live counter ----
  if [ "$REGEN_LISTING" -eq 1 ]; then
    echo "Fetching JSON listing for $bucket"
    : > "$listing_json"
    s5cmd --json ls "$bucket/*" | awk '
      BEGIN { start = systime(); count = 0 }
      {
        count++
        print $0
        if (count % 1000 == 0) {
          now = systime()
          elapsed = now - start
          if (elapsed > 0) {
            rate = count / elapsed
            printf "\rDownloaded %d JSON lines (%.0f/sec)", count, rate > "/dev/stderr"
          } else {
            printf "\rDownloaded %d JSON lines", count > "/dev/stderr"
          }
        }
      }
      END {
        if (count > 0) {
          printf "\rDownloaded %d JSON lines\n", count > "/dev/stderr"
        }
      }
    ' >> "$listing_json"
  else
    echo "Reusing existing JSON listing: $listing_json"
    if [ ! -s "$listing_json" ]; then
      echo "Listing JSON missing or empty: $listing_json"
      failed+=("$bucket")
      continue
    fi
  fi
  # ---- remote manifest (exact key, size) ----
  if [ "$REGEN_REMOTE_MANIFEST" -eq 1 ]; then
    echo "Building remote manifest"
    : > "$remote_tsv"
    jq -r --arg b "$bucket_name" '
      select(.type=="file") |
      (.key
        | sub("^s3://"; "")
        | sub("^" + $b + "/"; "")
      ) + "\t" + (.size|tostring)
    ' "$listing_json" > "$remote_tsv"
  else
    echo "Reusing remote manifest: $remote_tsv"
    if [ ! -s "$remote_tsv" ]; then
      echo "Remote manifest missing or empty: $remote_tsv"
      failed+=("$bucket")
      continue
    fi
  fi
  # ---- local manifest ----
  if [ "$REGEN_LOCAL_MANIFEST" -eq 1 ]; then
    echo "Building local manifest"
    : > "$local_tsv"
    if [ -d "$bucket_dir" ]; then
      (cd "$bucket_dir" && find . -type f -printf '%P\t%s\n') > "$local_tsv" || :
    fi
  else
    echo "Reusing local manifest: $local_tsv"
    if [ ! -f "$local_tsv" ]; then
      echo "Local manifest missing: $local_tsv"
      failed+=("$bucket")
      continue
    fi
  fi
  # ---- sort manifests (required for join) ----
  echo "Sorting manifests"
  LC_ALL=C sort -t $'\t' -k1,1 "$remote_tsv" -o "$remote_tsv"
  LC_ALL=C sort -t $'\t' -k1,1 "$local_tsv" -o "$local_tsv"
  # ---- join manifests ----
  if [ "$REGEN_JOINED" -eq 1 ]; then
    echo "Joining manifests"
    : > "$joined_tsv"
    join -t $'\t' -a 1 -e '' -o 1.1,1.2,2.2 "$remote_tsv" "$local_tsv" > "$joined_tsv"
  else
    echo "Reusing joined file: $joined_tsv"
    if [ ! -s "$joined_tsv" ]; then
      echo "Joined file missing or empty: $joined_tsv"
      failed+=("$bucket")
      continue
    fi
  fi
  # ---- build copy list (size-only); reuse existing outputs if requested ----
  if [ "$REGEN_OUTPUTS" -eq 1 ]; then
    : > "$dirs_file"
    : > "$commands_file"
    total_files=$(wc -l < "$joined_tsv" | tr -d ' ')
    echo "Total objects to scan: $total_files"
    echo "Building copy list for $bucket (size-only)"
    count=0
    start_ts=$(date +%s)
    while IFS=$'\t' read -r rel remote_size local_size; do
      count=$((count + 1))
      if (( count % 1000 == 0 )); then
        now=$(date +%s)
        elapsed=$((now - start_ts))
        if (( elapsed > 0 )); then
          rate=$((count / elapsed))
          if (( rate > 0 )); then
            remaining=$((total_files - count))
            eta=$((remaining / rate))
            printf '\rProcessed %d / %d | ETA %02d:%02d:%02d' \
              "$count" "$total_files" \
              $((eta/3600)) $(((eta%3600)/60)) $((eta%60))
          else
            printf '\rProcessed %d / %d | ETA --:--:--' "$count" "$total_files"
          fi
        else
          printf '\rProcessed %d / %d | ETA --:--:--' "$count" "$total_files"
        fi
      fi
      # Copy if missing locally or size differs
      if [ -z "${local_size:-}" ] || [ "$local_size" -ne "$remote_size" ]; then
        dest="$bucket_dir/$rel"
        printf '%s\n' "$(dirname "$dest")" >> "$dirs_file"
        printf 'cp "%s/%s" "%s"\n' "$bucket" "$rel" "$dest" >> "$commands_file"
      fi
    done < "$joined_tsv"
    echo
  else
    # Skip the rebuild so existing command and directory files are not duplicated
    echo "Reusing outputs: $dirs_file and $commands_file"
  fi
  files_to_copy=$(wc -l < "$commands_file" | tr -d ' ')
  echo "Files to copy: $files_to_copy"
  dirs_to_create=$(sort -u "$dirs_file" | wc -l | tr -d ' ')
  echo "Directories to create: $dirs_to_create"
  if [ "$files_to_copy" -eq 0 ]; then
    echo "Nothing to copy for $bucket"
    continue
  fi
  echo "Creating directories for $bucket"
  # -d '\n' keeps directory paths containing spaces intact
  sort -u "$dirs_file" | xargs -r -d '\n' mkdir -p
  echo "Copying files for $bucket (workers: $S5_NUMWORKERS)"
  if ! s5cmd --numworkers "$S5_NUMWORKERS" run "$commands_file"; then
    echo "FAILED: $bucket"
    failed+=("$bucket")
  else
    echo "Done: $bucket"
  fi
done
if [ "${#failed[@]}" -ne 0 ]; then
echo "Some buckets failed:"
printf ' - %s\n' "${failed[@]}"
exit 2
fi
echo "All buckets have been downloaded to $DEST_ROOT"