Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created December 11, 2025 03:14
Show Gist options
  • Select an option

  • Save pszemraj/ef8afb146c1a4d95405fa1506ad77dec to your computer and use it in GitHub Desktop.

Select an option

Save pszemraj/ef8afb146c1a4d95405fa1506ad77dec to your computer and use it in GitHub Desktop.
gs-based PDF compressor, default is heavy compression for VLM input
#!/usr/bin/env bash
set -euo pipefail

# Modern PDF compressor with LLM-optimized defaults
# Requires: ghostscript (gs)

# --- Script identity --------------------------------------------------------
SCRIPT_NAME=$(basename "$0")
VERSION="1.0.0"

# --- Compression settings ---------------------------------------------------
# The initial values are the same ones the "llm" preset assigns.
PRESET="llm"
DPI=80
JPEG_QUALITY=55
STRIP_METADATA=true
GRAYSCALE=false

# --- Run-time behaviour -----------------------------------------------------
INTERACTIVE=false
OUTPUT_PATH=""
SUFFIX="_compressed"
# CPU count as reported by nproc; 4 is used when nproc is not available.
PARALLEL_JOBS=$(nproc 2>/dev/null || echo 4)
# Print the command-line help to stdout, then terminate the script with
# exit status 0. $SCRIPT_NAME and $PARALLEL_JOBS are interpolated into the
# text, so both must be set before this is called.
usage() {
  cat <<HELP_TEXT
Usage: $SCRIPT_NAME [OPTIONS] <INPUT> [OUTPUT]
Compress PDFs with sensible defaults for LLM/VLM consumption.
Arguments:
INPUT PDF file or directory containing PDFs
OUTPUT Output file (for single PDF) or directory (optional)
Presets:
--llm 80 DPI, JPEG q55, strip metadata (default)
--ebook 150 DPI, JPEG q75, keep metadata
--print 300 DPI, JPEG q90, keep metadata
--screen 72 DPI, JPEG q50, strip metadata
Options:
-d, --dpi NUM Image resolution (default: 80)
-q, --quality NUM JPEG quality 1-100 (default: 55)
-g, --grayscale Convert to grayscale
-m, --keep-metadata Preserve document metadata
-j, --jobs NUM Parallel jobs for directories (default: $PARALLEL_JOBS)
-s, --suffix STR Output suffix (default: "_compressed")
-o, --output PATH Explicit output path
-i, --interactive Interactive mode (legacy menu)
-n, --dry-run Show what would be done
-v, --verbose Verbose output
-h, --help Show this help
--version Show version
Examples:
$SCRIPT_NAME document.pdf # LLM preset, outputs document_compressed.pdf
$SCRIPT_NAME --ebook book.pdf book_small.pdf # ebook preset, explicit output
$SCRIPT_NAME ./papers/ # Compress all PDFs in directory
$SCRIPT_NAME -d 100 -q 70 scan.pdf # Custom settings
HELP_TEXT
  exit 0
}
# Print a red "ERROR:" line carrying message $1 on stderr, then exit the
# script with status 1.
die() {
  printf '\033[0;31mERROR:\033[0m %s\n' "$1" >&2
  exit 1
}
# Print a yellow "WARN:" line carrying message $1 on stderr.
warn() {
  printf '\033[0;33mWARN:\033[0m %s\n' "$1" >&2
}
# Print a cyan "INFO:" line carrying message $1 on stdout.
info() {
  printf '\033[0;36mINFO:\033[0m %s\n' "$1"
}
# Print message $1 on stdout, prefixed with a green check mark.
success() {
  printf '\033[0;32m✓\033[0m %s\n' "$1"
}
# Abort through die() unless a command named "gs" can be resolved.
check_deps() {
  if ! command -v gs >/dev/null 2>&1; then
    die "ghostscript (gs) not found. Install with: apt install ghostscript"
  fi
}
# Load the settings of the preset named by $1 into the globals DPI,
# JPEG_QUALITY and STRIP_METADATA. An unrecognised name aborts through die().
apply_preset() {
  local settings
  # Each entry is "<dpi> <jpeg quality> <strip metadata>".
  case "$1" in
    llm)    settings="80 55 true" ;;
    ebook)  settings="150 75 false" ;;
    print)  settings="300 90 false" ;;
    screen) settings="72 50 true" ;;
    *)      die "Unknown preset: $1" ;;
  esac
  read -r DPI JPEG_QUALITY STRIP_METADATA <<<"$settings"
}
# Report how the size of a compressed file compares with its source.
#
# Arguments: $1 - original file, $2 - compressed file
# Outputs:   one coloured "<orig> → <new> (<pct>%)" line on stdout; green up
#            to 80%, yellow above 80%, red when the output is larger.
# Returns:   0 when a report was printed; 1 when the output file is missing,
#            a size could not be read, or the output was 0 bytes (in which
#            case the output file is deleted and a warning is emitted).
bytes_saved() {
  local input="$1" output="$2"
  [[ -f "$output" ]] || return 1

  local orgsize optsize percent
  # GNU stat uses -c, BSD/macOS stat uses -f; try both.
  orgsize=$(stat -c "%s" "$input" 2>/dev/null || stat -f "%z" "$input") || return 1
  optsize=$(stat -c "%s" "$output" 2>/dev/null || stat -f "%z" "$output") || return 1

  if [[ "$optsize" -eq 0 ]]; then
    rm -f -- "$output"
    warn "Output was 0 bytes, deleted."
    return 1
  fi

  local org_human opt_human
  org_human=$(numfmt --to=iec "$orgsize" 2>/dev/null || echo "${orgsize}B")
  opt_human=$(numfmt --to=iec "$optsize" 2>/dev/null || echo "${optsize}B")

  # A 0-byte input has no meaningful ratio; dividing by it would abort the
  # arithmetic expansion, so report the sizes without a percentage.
  if [[ "$orgsize" -eq 0 ]]; then
    printf '\033[0;31m%s → %s (input was empty)\033[0m\n' "$org_human" "$opt_human"
    return 0
  fi

  percent=$((optsize * 100 / orgsize))
  if [[ "$percent" -gt 100 ]]; then
    printf '\033[0;31m%s → %s (%d%%, larger!)\033[0m\n' "$org_human" "$opt_human" "$percent"
  elif [[ "$percent" -gt 80 ]]; then
    printf '\033[1;33m%s → %s (%d%%)\033[0m\n' "$org_human" "$opt_human" "$percent"
  else
    printf '\033[1;32m%s → %s (%d%%)\033[0m\n' "$org_human" "$opt_human" "$percent"
  fi
}
# Run Ghostscript's pdfwrite device on one file.
#
# Globals:   DPI, JPEG_QUALITY, GRAYSCALE, STRIP_METADATA (read)
# Arguments: $1 - source PDF, $2 - destination path
# Returns:   the exit status of gs
compress_pdf() {
  local src="$1" dest="$2"

  # Base options: batch mode, image downsampling to $DPI, font embedding.
  # NOTE(review): confirm against the Ghostscript docs that the pdfwrite
  # device honours -dJPEGQ; it may only apply to the JPEG output devices.
  local -a argv=(
    -dBATCH -dNOPAUSE -dSAFER -dQUIET
    -sDEVICE=pdfwrite
    -dCompatibilityLevel=1.5
    -dAutoRotatePages=/None
    -dDownsampleColorImages=true
    -dDownsampleGrayImages=true
    -dDownsampleMonoImages=true
    -dColorImageDownsampleType=/Bicubic
    -dGrayImageDownsampleType=/Bicubic
    -dMonoImageDownsampleType=/Subsample
    -dColorImageResolution="$DPI"
    -dGrayImageResolution="$DPI"
    -dMonoImageResolution="$DPI"
    -dColorImageDownsampleThreshold=1.0
    -dGrayImageDownsampleThreshold=1.0
    -dJPEGQ="$JPEG_QUALITY"
    -dEmbedAllFonts=true
    -dSubsetFonts=true
  )

  if [[ "$GRAYSCALE" == true ]]; then
    argv+=(
      -sColorConversionStrategy=Gray
      -sColorConversionStrategyForImages=Gray
      -dProcessColorModel=/DeviceGray
    )
  fi

  argv+=(-sOutputFile="$dest")

  if [[ "$STRIP_METADATA" == true ]]; then
    # NOTE(review): these keys look like document-info fields rather than
    # distiller parameters; verify that setdistillerparams really blanks the
    # metadata (a DOCINFO pdfmark may be what is needed) — TODO confirm.
    argv+=(-c "<</Creator() /Producer() /Title() /Author() /Subject() /Keywords() /CreationDate() /ModDate()>> setdistillerparams")
  fi

  # -f ends the PostScript given with -c and introduces the input file.
  argv+=(-f "$src")

  gs "${argv[@]}"
}
# Compress one PDF file.
#
# Globals:   SUFFIX, DRY_RUN (read)
# Arguments: $1 - input PDF
#            $2 - output path; when empty, "<name><SUFFIX>.<ext>" next to
#                 the input is used
# Outputs:   "<input name> → <size report>" on stdout
# Returns:   0 on success (or dry-run), 1 when compression or the size
#            report fails; aborts through die() on invalid arguments
process_single() {
  local input="$1" output="${2:-}"
  [[ -f "$input" ]] || die "File not found: $input"
  # Case-insensitive extension check, matching the -iname used for directories.
  [[ "${input,,}" == *.pdf ]] || die "Not a PDF: $input"

  # Default output: same directory, with suffix
  if [[ -z "$output" ]]; then
    local dir base ext
    dir=$(dirname "$input")
    base=$(basename "$input")
    ext="${base##*.}"
    base="${base%.*}"
    output="${dir}/${base}${SUFFIX}.${ext}"
  fi

  # Don't overwrite input. -ef compares device and inode, so it catches
  # different spellings of the same path, symlinks and hard links, and it
  # does not depend on an external realpath binary being installed.
  if [[ "$input" -ef "$output" ]]; then
    die "Output would overwrite input. Use -o to specify different output."
  fi

  printf '\033[1;37m%s\033[0m → ' "$(basename "$input")"

  # A dry run must not touch the filesystem, so it returns before mkdir.
  if [[ "${DRY_RUN:-false}" == true ]]; then
    echo "[dry-run] would compress to $output"
    return 0
  fi

  mkdir -p "$(dirname "$output")"

  if ! compress_pdf "$input" "$output"; then
    printf '\n'
    warn "Failed to compress: $input"
    return 1
  fi

  # Handle a failed report explicitly instead of relying on `set -e`.
  if ! bytes_saved "$input" "$output"; then
    printf '\n'
    return 1
  fi
}
# Compress a single file of a directory batch and print one complete
# "<name> <size report>" line, so lines from concurrent jobs stay intact.
# Arguments: $1 - input PDF, $2 - output path. Returns 1 on any failure.
_process_directory_entry() {
  local input="$1" output="$2"
  local label report
  label=$(basename "$input")

  if ! compress_pdf "$input" "$output"; then
    printf '\033[1;37m%-50s\033[0m \n' "$label"
    warn "Failed: $input"
    return 1
  fi
  if ! report=$(bytes_saved "$input" "$output"); then
    printf '\033[1;37m%-50s\033[0m \n' "$label"
    return 1
  fi
  printf '\033[1;37m%-50s\033[0m %s\n' "$label" "$report"
}

# Compress every PDF below a directory, mirroring the directory layout
# under the output directory.
#
# Globals:   SUFFIX, PARALLEL_JOBS, DRY_RUN (read)
# Arguments: $1 - input directory
#            $2 - output directory; defaults to "<input dir><SUFFIX>"
# Returns:   0 when every file was compressed (or on dry-run), 1 when at
#            least one file failed; aborts through die() on bad arguments
process_directory() {
  local input_dir="${1%/}"
  local output_dir="${2:-${input_dir}${SUFFIX}}"
  [[ -d "$input_dir" ]] || die "Directory not found: $input_dir"

  # Writing into the input directory would overwrite the sources while gs
  # is still reading them (-ef: same device and inode).
  if [[ "$input_dir" -ef "$output_dir" ]]; then
    die "Output directory is the input directory; inputs would be overwritten: $output_dir"
  fi

  # NUL-delimited listing keeps file names containing newlines intact.
  local -a files=()
  local found
  while IFS= read -r -d '' found; do
    files+=("$found")
  done < <(find "$input_dir" -type f -iname '*.pdf' -print0 | sort -z)
  [[ ${#files[@]} -gt 0 ]] || die "No PDFs found in $input_dir"

  local jobs="${PARALLEL_JOBS:-1}"
  [[ "$jobs" =~ ^[1-9][0-9]*$ ]] || jobs=1
  local dry_run="${DRY_RUN:-false}"

  info "Found ${#files[@]} PDFs in $input_dir"
  info "Output directory: $output_dir"
  info "Using $jobs parallel jobs"

  local failed=0 input relpath output pid
  local -a pids=()
  for input in "${files[@]}"; do
    # Quote the prefix so glob characters in the directory name are literal.
    relpath="${input#"$input_dir"/}"
    output="$output_dir/$relpath"

    if [[ "$dry_run" == true ]]; then
      # Dry run: report only, never create directories or files.
      printf '\033[1;37m%-50s\033[0m ' "$(basename "$input")"
      echo "[dry-run]"
      continue
    fi

    mkdir -p "$(dirname "$output")"
    _process_directory_entry "$input" "$output" &
    pids+=("$!")

    # Keep at most $jobs workers running: once the window is full, wait for
    # the oldest one before starting another.
    if (( ${#pids[@]} >= jobs )); then
      wait "${pids[0]}" || failed=$((failed + 1))
      pids=("${pids[@]:1}")
    fi
  done

  # Collect the workers that are still running.
  if (( ${#pids[@]} > 0 )); then
    for pid in "${pids[@]}"; do
      wait "$pid" || failed=$((failed + 1))
    done
  fi

  if [[ "$dry_run" == true ]]; then
    info "Dry run: ${#files[@]} PDFs would be compressed"
    return 0
  fi

  if (( failed > 0 )); then
    warn "$failed/${#files[@]} PDFs failed"
    return 1
  fi
  success "All ${#files[@]} PDFs compressed successfully"
}
# Prompt on stdin for a preset, grayscale conversion and an optional DPI
# override, and store the answers in the global settings.
#
# Globals:   DPI, JPEG_QUALITY, STRIP_METADATA (written via apply_preset),
#            GRAYSCALE, DPI (written)
# Arguments: $1 - input path; accepted for call compatibility, not used
# Notes:     an unreadable answer (EOF) counts as an empty answer, and a DPI
#            that is not a positive integer is ignored with a warning.
interactive_mode() {
  # Answers are local so they do not leak into the global namespace.
  local choice="" gray="" custom_dpi=""

  echo ""
  echo "Select preset:"
  echo " 1) LLM (80 DPI, aggressive) [default]"
  echo " 2) Ebook (150 DPI)"
  echo " 3) Print (300 DPI)"
  echo " 4) Screen (72 DPI)"
  echo ""
  # `|| true`: read fails at EOF, which would otherwise abort under set -e.
  read -rp "Choice [1]: " choice || true
  case "${choice:-1}" in
    1) apply_preset llm ;;
    2) apply_preset ebook ;;
    3) apply_preset print ;;
    4) apply_preset screen ;;
    *) apply_preset llm ;;
  esac

  echo ""
  read -rp "Convert to grayscale? [y/N]: " gray || true
  if [[ "$gray" =~ ^[Yy] ]]; then
    GRAYSCALE=true
  fi

  echo ""
  read -rp "Custom DPI (or Enter for $DPI): " custom_dpi || true
  if [[ -n "$custom_dpi" ]]; then
    if [[ "$custom_dpi" =~ ^[1-9][0-9]*$ ]]; then
      DPI="$custom_dpi"
    else
      warn "Ignoring invalid DPI '$custom_dpi', keeping $DPI"
    fi
  fi

  echo ""
  info "Settings: DPI=$DPI, Quality=$JPEG_QUALITY, Grayscale=$GRAYSCALE, StripMeta=$STRIP_METADATA"
}
# Parse the command line, validate the settings and dispatch to
# process_single or process_directory.
#
# Globals:   DPI, JPEG_QUALITY, GRAYSCALE, STRIP_METADATA, PARALLEL_JOBS,
#            SUFFIX, OUTPUT_PATH, INTERACTIVE, DRY_RUN, VERBOSE (written)
# Arguments: the script's command-line arguments
# Returns:   the status of the processing function; exits 1 on usage errors
main() {
  local -a positional=()
  DRY_RUN=false
  VERBOSE=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --llm|--ebook|--print|--screen)
        apply_preset "${1#--}"; shift ;;
      -d|--dpi|-q|--quality|-j|--jobs|-s|--suffix|-o|--output)
        # Without this check a trailing value option dies with an
        # "unbound variable" error under `set -u`.
        [[ $# -ge 2 ]] || die "Option $1 requires an argument"
        case "$1" in
          -d|--dpi) DPI="$2" ;;
          -q|--quality) JPEG_QUALITY="$2" ;;
          -j|--jobs) PARALLEL_JOBS="$2" ;;
          -s|--suffix) SUFFIX="$2" ;;
          -o|--output) OUTPUT_PATH="$2" ;;
        esac
        shift 2 ;;
      -g|--grayscale)
        GRAYSCALE=true; shift ;;
      -m|--keep-metadata)
        STRIP_METADATA=false; shift ;;
      -i|--interactive)
        INTERACTIVE=true; shift ;;
      -n|--dry-run)
        DRY_RUN=true; shift ;;
      -v|--verbose)
        VERBOSE=true; shift ;;
      -h|--help)
        usage ;;
      --version)
        echo "$SCRIPT_NAME $VERSION"; exit 0 ;;
      --)
        shift; positional+=("$@"); break ;;
      -*)
        die "Unknown option: $1" ;;
      *)
        positional+=("$1"); shift ;;
    esac
  done

  # Missing INPUT is an error: show the help on stderr and exit non-zero.
  # usage ends with its own `exit`, so it runs in a subshell here.
  if [[ ${#positional[@]} -lt 1 ]]; then
    ( usage ) >&2 || true
    exit 1
  fi
  # Extra arguments were silently dropped before; with a shell glob such as
  # *.pdf that would treat the second match as OUTPUT and overwrite it.
  if [[ ${#positional[@]} -gt 2 ]]; then
    die "Too many arguments: expected <INPUT> [OUTPUT], got ${#positional[@]}"
  fi

  local input="${positional[0]}"
  local output="${OUTPUT_PATH:-${positional[1]:-}}"

  if [[ "$INTERACTIVE" == true ]]; then
    interactive_mode "$input"
  fi

  # Validate after interactive mode, which may also change the DPI.
  local positive_int_re='^[1-9][0-9]*$'
  local quality_re='^([1-9][0-9]?|100)$'
  [[ "$DPI" =~ $positive_int_re ]] || die "Invalid DPI (positive integer expected): $DPI"
  [[ "$JPEG_QUALITY" =~ $quality_re ]] || die "Invalid JPEG quality (1-100 expected): $JPEG_QUALITY"
  [[ "$PARALLEL_JOBS" =~ $positive_int_re ]] || die "Invalid job count (positive integer expected): $PARALLEL_JOBS"

  if [[ "$VERBOSE" == true ]]; then
    info "DPI=$DPI Q=$JPEG_QUALITY Gray=$GRAYSCALE StripMeta=$STRIP_METADATA"
  fi

  # Checked only once real work is about to start, so --help and --version
  # also work on machines without Ghostscript.
  check_deps

  if [[ -d "$input" ]]; then
    process_directory "$input" "$output"
  elif [[ -f "$input" ]]; then
    process_single "$input" "$output"
  else
    die "Not found: $input"
  fi
}
# Entry point: hand every command-line argument to main unchanged.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment