Created
December 11, 2025 03:14
-
-
Save pszemraj/ef8afb146c1a4d95405fa1506ad77dec to your computer and use it in GitHub Desktop.
Ghostscript (gs) based PDF compressor; the default preset applies heavy compression intended for VLM input.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Modern PDF compressor with LLM-optimized defaults.
# Requires: ghostscript (gs)

set -euo pipefail

readonly VERSION="1.0.0"
SCRIPT_NAME=$(basename "$0")
readonly SCRIPT_NAME

# Defaults (LLM preset). These globals are overwritten by apply_preset and by
# the command-line options parsed in main.
DPI=80
JPEG_QUALITY=55
GRAYSCALE=false
STRIP_METADATA=true
# CPU count: nproc where available, getconf as a fallback for systems without
# nproc, and finally a fixed value of 4.
PARALLEL_JOBS=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)
INTERACTIVE=false
PRESET="llm"
SUFFIX="_compressed"
OUTPUT_PATH=""
#######################################
# Print the help text and terminate the script.
# Globals:
#   SCRIPT_NAME (read), PARALLEL_JOBS (read)
# Arguments:
#   $1 - exit status (optional, default 0). With a non-zero status the help
#        text goes to stderr, so error paths do not pollute stdout.
# Outputs:
#   Help text on stdout (status 0) or stderr (any other status).
# Returns:
#   Never returns; exits with the requested status.
#######################################
usage() {
  local status="${1:-0}"
  local fd=1
  [[ "$status" == 0 ]] || fd=2

  cat >&"$fd" <<EOF
Usage: $SCRIPT_NAME [OPTIONS] <INPUT> [OUTPUT]
Compress PDFs with sensible defaults for LLM/VLM consumption.
Arguments:
INPUT PDF file or directory containing PDFs
OUTPUT Output file (for single PDF) or directory (optional)
Presets:
--llm 80 DPI, JPEG q55, strip metadata (default)
--ebook 150 DPI, JPEG q75, keep metadata
--print 300 DPI, JPEG q90, keep metadata
--screen 72 DPI, JPEG q50, strip metadata
Options:
-d, --dpi NUM Image resolution (default: 80)
-q, --quality NUM JPEG quality 1-100 (default: 55)
-g, --grayscale Convert to grayscale
-m, --keep-metadata Preserve document metadata
-j, --jobs NUM Parallel jobs for directories (default: $PARALLEL_JOBS)
-s, --suffix STR Output suffix (default: "_compressed")
-o, --output PATH Explicit output path
-i, --interactive Interactive mode (legacy menu)
-n, --dry-run Show what would be done
-v, --verbose Verbose output
-h, --help Show this help
--version Show version
Examples:
$SCRIPT_NAME document.pdf # LLM preset, outputs document_compressed.pdf
$SCRIPT_NAME --ebook book.pdf book_small.pdf # ebook preset, explicit output
$SCRIPT_NAME ./papers/ # Compress all PDFs in directory
$SCRIPT_NAME -d 100 -q 70 scan.pdf # Custom settings
EOF
  exit "$status"
}
# Print a red "ERROR:" line with the given message on stderr, then exit 1.
die() {
  local message="$1"
  printf '\033[0;31mERROR:\033[0m %s\n' "$message" >&2
  exit 1
}
# Print a yellow "WARN:" line with the given message on stderr.
warn() {
  local message="$1"
  printf '\033[0;33mWARN:\033[0m %s\n' "$message" >&2
}
# Print a cyan "INFO:" line with the given message on stdout.
info() {
  local message="$1"
  printf '\033[0;36mINFO:\033[0m %s\n' "$message"
}
# Print a green check mark followed by the given message on stdout.
success() {
  local message="$1"
  printf '\033[0;32m✓\033[0m %s\n' "$message"
}
# Verify that the ghostscript executable (gs) can be found; otherwise abort
# through die with an installation hint.
check_deps() {
  if ! command -v gs &>/dev/null; then
    die "ghostscript (gs) not found. Install with: apt install ghostscript"
  fi
}
#######################################
# Load a named bundle of compression settings into the global options.
# Globals:
#   DPI, JPEG_QUALITY, STRIP_METADATA (written)
# Arguments:
#   $1 - preset name: llm, ebook, print or screen
# Returns:
#   0 on success; any other name is reported through die.
#######################################
apply_preset() {
  local name="$1"

  if [[ "$name" == llm ]]; then
    DPI=80
    JPEG_QUALITY=55
    STRIP_METADATA=true
  elif [[ "$name" == ebook ]]; then
    DPI=150
    JPEG_QUALITY=75
    STRIP_METADATA=false
  elif [[ "$name" == print ]]; then
    DPI=300
    JPEG_QUALITY=90
    STRIP_METADATA=false
  elif [[ "$name" == screen ]]; then
    DPI=72
    JPEG_QUALITY=50
    STRIP_METADATA=true
  else
    die "Unknown preset: $name"
  fi
}
#######################################
# Report the size of a compressed file relative to its source.
# Arguments:
#   $1 - original file
#   $2 - compressed file
# Outputs:
#   One colored line "<orig> → <new> (<percent>%)" on stdout: red when the
#   result is larger than the original, yellow above 80%, green otherwise.
# Returns:
#   0 after printing the report; 1 if the output file is missing or empty
#   (an empty output file is deleted).
#######################################
bytes_saved() {
  local input="$1" output="$2"
  [[ -f "$output" ]] || return 1

  # GNU stat first, BSD/macOS stat as the fallback.
  local orgsize optsize percent
  orgsize=$(stat -c "%s" "$input" 2>/dev/null || stat -f "%z" "$input")
  optsize=$(stat -c "%s" "$output" 2>/dev/null || stat -f "%z" "$output")

  if [[ "$optsize" -eq 0 ]]; then
    rm -f -- "$output"
    warn "Output was 0 bytes, deleted."
    return 1
  fi

  # numfmt is optional; fall back to a plain byte count.
  local org_human opt_human
  org_human=$(numfmt --to=iec "$orgsize" 2>/dev/null || echo "${orgsize}B")
  opt_human=$(numfmt --to=iec "$optsize" 2>/dev/null || echo "${optsize}B")

  # A 0-byte original has no meaningful ratio and would divide by zero below;
  # the (non-empty) output is necessarily larger.
  if [[ "$orgsize" -eq 0 ]]; then
    printf '\033[0;31m%s → %s (larger!)\033[0m\n' "$org_human" "$opt_human"
    return 0
  fi

  percent=$((optsize * 100 / orgsize))
  if [[ "$percent" -gt 100 ]]; then
    printf '\033[0;31m%s → %s (%d%%, larger!)\033[0m\n' "$org_human" "$opt_human" "$percent"
  elif [[ "$percent" -gt 80 ]]; then
    printf '\033[1;33m%s → %s (%d%%)\033[0m\n' "$org_human" "$opt_human" "$percent"
  else
    printf '\033[1;32m%s → %s (%d%%)\033[0m\n' "$org_human" "$opt_human" "$percent"
  fi
}
#######################################
# Run ghostscript's pdfwrite device on one file with the current settings.
# Globals:
#   DPI, JPEG_QUALITY, GRAYSCALE, STRIP_METADATA (read)
# Arguments:
#   $1 - input PDF
#   $2 - output PDF
# Returns:
#   The exit status of gs.
#######################################
compress_pdf() {
  local input="$1" output="$2"

  # The whole command line is collected in one array so gs is invoked from a
  # single place regardless of which optional parts are enabled.
  # NOTE(review): confirm against the Ghostscript documentation that -dJPEGQ
  # and the metadata keys passed to setdistillerparams below are honored by
  # the pdfwrite device.
  local -a cmd=(
    gs
    -dBATCH -dNOPAUSE -dSAFER -dQUIET
    -sDEVICE=pdfwrite
    -dCompatibilityLevel=1.5
    -dAutoRotatePages=/None
    -dDownsampleColorImages=true
    -dDownsampleGrayImages=true
    -dDownsampleMonoImages=true
    -dColorImageDownsampleType=/Bicubic
    -dGrayImageDownsampleType=/Bicubic
    -dMonoImageDownsampleType=/Subsample
    -dColorImageResolution="$DPI"
    -dGrayImageResolution="$DPI"
    -dMonoImageResolution="$DPI"
    -dColorImageDownsampleThreshold=1.0
    -dGrayImageDownsampleThreshold=1.0
    -dJPEGQ="$JPEG_QUALITY"
    -dEmbedAllFonts=true
    -dSubsetFonts=true
  )

  if [[ "$GRAYSCALE" == true ]]; then
    cmd+=(-sColorConversionStrategy=Gray)
    cmd+=(-sColorConversionStrategyForImages=Gray)
    cmd+=(-dProcessColorModel=/DeviceGray)
  fi

  cmd+=(-sOutputFile="$output")

  if [[ "$STRIP_METADATA" == true ]]; then
    # PostScript snippet evaluated before the input file is read.
    cmd+=(-c "<</Creator() /Producer() /Title() /Author() /Subject() /Keywords() /CreationDate() /ModDate()>> setdistillerparams")
  fi

  cmd+=(-f "$input")

  "${cmd[@]}"
}
#######################################
# Compress a single PDF.
# Globals:
#   SUFFIX (read), DRY_RUN (read)
# Arguments:
#   $1 - input PDF
#   $2 - output path (optional; defaults to "<name><SUFFIX>.<ext>" next to
#        the input)
# Outputs:
#   "<name> → <size report>" on stdout, or a dry-run notice.
# Returns:
#   0 on success (or dry run), non-zero if compression failed. Invalid input
#   and an output that would overwrite the input are reported through die.
#######################################
process_single() {
  local input="$1" output="${2:-}"

  [[ -f "$input" ]] || die "File not found: $input"
  # Case-insensitive extension check, so "scan.Pdf" is accepted just as the
  # directory mode's `find -iname '*.pdf'` would accept it.
  [[ "${input,,}" == *.pdf ]] || die "Not a PDF: $input"

  # Default output: same directory, with suffix
  if [[ -z "$output" ]]; then
    local dir base ext
    dir=$(dirname "$input")
    base=$(basename "$input")
    ext="${base##*.}"
    base="${base%.*}"
    output="${dir}/${base}${SUFFIX}.${ext}"
  fi

  # Don't overwrite input. realpath may fail while the output's parent
  # directory does not exist yet; the raw path is compared in that case.
  if [[ "$(realpath "$input")" == "$(realpath "$output" 2>/dev/null || echo "$output")" ]]; then
    die "Output would overwrite input. Use -o to specify different output."
  fi

  printf '\033[1;37m%s\033[0m → ' "$(basename "$input")"

  # A dry run must not touch the filesystem, so it returns before mkdir.
  if [[ "${DRY_RUN:-false}" == true ]]; then
    echo "[dry-run] would compress to $output"
    return 0
  fi

  mkdir -p "$(dirname "$output")"

  if compress_pdf "$input" "$output"; then
    bytes_saved "$input" "$output"
  else
    warn "Failed to compress: $input"
    return 1
  fi
}
#######################################
# Worker for process_directory: compress one file and print its report as a
# single line, so that lines from concurrent workers do not interleave.
# Arguments:
#   $1 - input PDF
#   $2 - output PDF
# Returns:
#   0 on success, 1 if compression failed or produced an unusable file.
#######################################
_process_directory_one() {
  local input="$1" output="$2" report

  if ! compress_pdf "$input" "$output"; then
    warn "Failed: $input"
    return 1
  fi
  if ! report=$(bytes_saved "$input" "$output"); then
    warn "Failed: $input"
    return 1
  fi
  printf '\033[1;37m%-50s\033[0m %s\n' "$(basename "$input")" "$report"
}

#######################################
# Compress every PDF below a directory into a mirrored output tree.
# Globals:
#   SUFFIX (read), PARALLEL_JOBS (read), DRY_RUN (read)
# Arguments:
#   $1 - input directory
#   $2 - output directory (optional; defaults to "<input><SUFFIX>")
# Outputs:
#   Progress and one report line per file on stdout, warnings on stderr.
# Returns:
#   0 if every PDF was processed, 1 if at least one failed. A missing
#   directory, an empty directory, an invalid job count and an output
#   directory identical to the input are reported through die.
#######################################
process_directory() {
  local input_dir="${1%/}"
  local output_dir="${2:-${input_dir}${SUFFIX}}"

  [[ -d "$input_dir" ]] || die "Directory not found: $input_dir"

  local jobs="${PARALLEL_JOBS:-1}"
  [[ "$jobs" =~ ^[1-9][0-9]*$ ]] || die "Invalid number of parallel jobs: $jobs"

  # gs cannot read and write the same file, so writing the mirrored tree on
  # top of the input tree (e.g. with an empty suffix) would destroy the
  # originals. An output directory that does not exist yet cannot collide.
  local in_real out_real
  in_real=$(cd "$input_dir" && pwd -P)
  out_real=$(cd "$output_dir" 2>/dev/null && pwd -P) || out_real=""
  if [[ -n "$out_real" && "$in_real" == "$out_real" ]]; then
    die "Output directory would overwrite input. Use -o to specify different output."
  fi

  # NUL-delimited so that unusual file names survive intact.
  local -a files=()
  local found
  while IFS= read -r -d '' found; do
    files+=("$found")
  done < <(find "$input_dir" -type f -iname '*.pdf' -print0 | sort -z)

  [[ ${#files[@]} -gt 0 ]] || die "No PDFs found in $input_dir"

  info "Found ${#files[@]} PDFs in $input_dir"
  info "Output directory: $output_dir"
  info "Using $jobs parallel jobs"

  local total=${#files[@]} index=0 failed=0
  local input relpath output pid
  local -a pids=()

  for input in "${files[@]}"; do
    index=$((index + 1))
    # Quoted so the directory name is stripped literally, not as a pattern.
    relpath="${input#"$input_dir"/}"
    output="$output_dir/$relpath"

    # A dry run only lists the files; no directories are created.
    if [[ "${DRY_RUN:-false}" == true ]]; then
      printf '\033[1;37m%-50s\033[0m ' "$(basename "$input")"
      echo "[dry-run]"
      continue
    fi

    mkdir -p "$(dirname "$output")"

    # Background subshells inherit the shell's functions and variables.
    _process_directory_one "$input" "$output" &
    pids+=("$!")

    # Work in batches of $jobs: once the batch is full (or the last file has
    # been started) wait for every worker and count the failures.
    if (( ${#pids[@]} >= jobs || index == total )); then
      for pid in "${pids[@]}"; do
        wait "$pid" || failed=$((failed + 1))
      done
      pids=()
    fi
  done

  if [[ $failed -eq 0 ]]; then
    success "All ${#files[@]} PDFs compressed successfully"
  else
    warn "$failed/${#files[@]} PDFs failed"
    return 1
  fi
}
#######################################
# Ask for preset, grayscale and DPI on the terminal (legacy menu).
# Globals:
#   DPI, GRAYSCALE (written); JPEG_QUALITY, STRIP_METADATA (written through
#   apply_preset, read for the summary)
# Arguments:
#   $1 - input path; accepted for interface compatibility, not used
# Outputs:
#   Menu and prompts, followed by a summary of the chosen settings.
# Returns:
#   0; every prompt falls back to its default on empty input or end of input.
#######################################
interactive_mode() {
  # Local so the answers do not leak into the global scope.
  local choice="" gray="" custom_dpi=""

  echo ""
  echo "Select preset:"
  echo " 1) LLM (80 DPI, aggressive) [default]"
  echo " 2) Ebook (150 DPI)"
  echo " 3) Print (300 DPI)"
  echo " 4) Screen (72 DPI)"
  echo ""
  # read fails at end of input; "|| true" keeps `set -e` from aborting and
  # lets the default apply instead.
  read -rp "Choice [1]: " choice || true
  case "${choice:-1}" in
    1) apply_preset llm ;;
    2) apply_preset ebook ;;
    3) apply_preset print ;;
    4) apply_preset screen ;;
    *) apply_preset llm ;;
  esac

  echo ""
  read -rp "Convert to grayscale? [y/N]: " gray || true
  if [[ "$gray" =~ ^[Yy] ]]; then
    GRAYSCALE=true
  fi

  echo ""
  read -rp "Custom DPI (or Enter for $DPI): " custom_dpi || true
  if [[ -n "$custom_dpi" ]]; then
    # Only a positive integer is accepted; anything else keeps the preset.
    if [[ "$custom_dpi" =~ ^[1-9][0-9]*$ ]]; then
      DPI="$custom_dpi"
    else
      warn "Ignoring invalid DPI '$custom_dpi'; keeping $DPI"
    fi
  fi

  echo ""
  info "Settings: DPI=$DPI, Quality=$JPEG_QUALITY, Grayscale=$GRAYSCALE, StripMeta=$STRIP_METADATA"
}
#######################################
# Abort unless an option that takes a value was followed by one.
# Arguments:
#   $1 - option as typed by the user
#   $2 - number of arguments left, counting the option itself
#######################################
_require_value() {
  (( $2 >= 2 )) || die "Option $1 requires an argument"
}

#######################################
# Parse the command line and dispatch to single-file or directory mode.
# Globals:
#   DPI, JPEG_QUALITY, GRAYSCALE, STRIP_METADATA, PARALLEL_JOBS, SUFFIX,
#   OUTPUT_PATH, INTERACTIVE, DRY_RUN, VERBOSE (written)
# Arguments:
#   The script's command-line arguments.
# Returns:
#   The status of the selected mode. Exits with 2 after printing the help
#   text to stderr when no input was given; invalid options or values are
#   reported through die.
#######################################
main() {
  local -a positional=()
  DRY_RUN=false
  VERBOSE=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --llm|--ebook|--print|--screen)
        apply_preset "${1#--}"; shift ;;
      -d|--dpi)
        _require_value "$1" "$#"; DPI="$2"; shift 2 ;;
      -q|--quality)
        _require_value "$1" "$#"; JPEG_QUALITY="$2"; shift 2 ;;
      -g|--grayscale)
        GRAYSCALE=true; shift ;;
      -m|--keep-metadata)
        STRIP_METADATA=false; shift ;;
      -j|--jobs)
        _require_value "$1" "$#"; PARALLEL_JOBS="$2"; shift 2 ;;
      -s|--suffix)
        _require_value "$1" "$#"; SUFFIX="$2"; shift 2 ;;
      -o|--output)
        _require_value "$1" "$#"; OUTPUT_PATH="$2"; shift 2 ;;
      -i|--interactive)
        INTERACTIVE=true; shift ;;
      -n|--dry-run)
        DRY_RUN=true; shift ;;
      -v|--verbose)
        VERBOSE=true; shift ;;
      -h|--help)
        usage ;;
      --version)
        echo "$SCRIPT_NAME $VERSION"; exit 0 ;;
      --)
        shift; positional+=("$@"); break ;;
      -*)
        die "Unknown option: $1" ;;
      *)
        positional+=("$1"); shift ;;
    esac
  done

  if [[ ${#positional[@]} -lt 1 ]]; then
    # usage terminates the shell it runs in, so it is run in a subshell; that
    # way the help text can go to stderr and the script can still fail.
    ( usage ) >&2
    exit 2
  fi

  # Checked only after parsing so --help and --version work without gs.
  check_deps

  local input="${positional[0]}"
  local output="${OUTPUT_PATH:-${positional[1]:-}}"

  if [[ "$INTERACTIVE" == true ]]; then
    interactive_mode "$input"
  fi

  # Validate once, after the command line and the interactive menu have both
  # had their say; these values are handed to gs and to the job scheduler.
  [[ "$DPI" =~ ^[1-9][0-9]*$ ]] \
    || die "DPI must be a positive integer: $DPI"
  if [[ ! "$JPEG_QUALITY" =~ ^[1-9][0-9]{0,2}$ ]] || (( JPEG_QUALITY > 100 )); then
    die "JPEG quality must be an integer from 1 to 100: $JPEG_QUALITY"
  fi
  [[ "$PARALLEL_JOBS" =~ ^[1-9][0-9]*$ ]] \
    || die "Jobs must be a positive integer: $PARALLEL_JOBS"

  if [[ "$VERBOSE" == true ]]; then
    info "DPI=$DPI Q=$JPEG_QUALITY Gray=$GRAYSCALE StripMeta=$STRIP_METADATA"
  fi

  if [[ -d "$input" ]]; then
    process_directory "$input" "$output"
  elif [[ -f "$input" ]]; then
    process_single "$input" "$output"
  else
    die "Not found: $input"
  fi
}
# Script entry point: pass every command-line argument to main unchanged.
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment