Skip to content

Instantly share code, notes, and snippets.

@rcky844
Last active March 11, 2025 16:56
Show Gist options
  • Save rcky844/a3d37d802072a28c7c00d5b77522b945 to your computer and use it in GitHub Desktop.
Save rcky844/a3d37d802072a28c7c00d5b77522b945 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# pdfcleanse 2025.03.12
# By Ricky Cheung <[email protected]>
# Variables:
# - COMPRESS_TYPE: bw, posterize, jpeg, webp (default: unset - pages are not converted to images or recompressed)
# - COMPRESS_BW_BRIGHTNESS (int - default: 15X1)
# - COMPRESS_POSTERIZE_LEVEL (int - default: 2)
# - EXTRACT_METHOD: ppm, images, magick (default: ppm)
# - SAVE_METHOD: gs, qpdf (default: gs)
# - REMOVE_WATERMARK (boolean - default: false)
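# - ROTATION (boolean - default: false; when true, prompts for an angle in degrees and rotates all pages)
# - DRY_RUN (boolean - default: false; when true, pauses for Enter before the save step)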
# Tunables: a non-empty value already present in the environment wins,
# otherwise the default on the right-hand side is assigned.
: "${COMPRESS_BW_BRIGHTNESS:=15X1}"
: "${COMPRESS_POSTERIZE_LEVEL:=2}"
: "${EXTRACT_METHOD:=ppm}"
: "${SAVE_METHOD:=gs}"
# Scratch directory and the working copy of the PDF kept inside it.
# NOTE(review): this is a fixed, predictable path under /tmp that the main
# loop removes with rm -rf for every input file.
WORK_DIR="/tmp/pdfcleanse"
WORK_FILE="${WORK_DIR}/temp.pdf"
# Nothing to do without an input path. "$*" joins all arguments into one
# string, so this is true for no arguments at all or a single empty one.
# The usage line goes to stderr and the script stops with a failure status.
if [[ -z "$*" ]]; then
  echo "Usage: pdfcleanse input.pdf" >&2
  exit 1
fi
#######################################
# Rebuild the working PDF from the page images found in the work directory.
# Globals:
#   COMPRESS_TYPE             (read) bw | posterize | jpeg | webp
#   COMPRESS_BW_BRIGHTNESS    (read) argument of -brightness-contrast (bw)
#   COMPRESS_POSTERIZE_LEVEL  (read) argument of -posterize (posterize)
#   FILE_EXT                  (read) extension of the page images
#   WORK_DIR                  (read) directory holding the page images
#   WORK_FILE                 (read) path of the PDF that gets written
# Outputs:
#   Error message on stderr when COMPRESS_TYPE is not recognised.
# Returns:
#   Exits the script with status 1 when COMPRESS_TYPE is not recognised.
#######################################
compress_pdf() {
  local page
  case "$COMPRESS_TYPE" in
    bw)
      # First pass emits a bilevel Group4 TIFF stream, second pass turns
      # that stream into the PDF.
      magick "$WORK_DIR"/*."$FILE_EXT" \
        -brightness-contrast "$COMPRESS_BW_BRIGHTNESS" \
        -compress Group4 -type bilevel TIFF:- \
        | magick - "$WORK_FILE"
      ;;
    posterize)
      magick "$WORK_DIR"/*."$FILE_EXT" \
        -posterize "$COMPRESS_POSTERIZE_LEVEL" "$WORK_FILE"
      ;;
    jpeg)
      # The PDF is then assembled from the *.JPEG files in the work directory.
      magick mogrify -format JPEG -quality 80 "$WORK_DIR"/*."$FILE_EXT"
      magick "$WORK_DIR"/*.JPEG "$WORK_FILE"
      ;;
    webp)
      for page in "$WORK_DIR"/*."$FILE_EXT"; do
        cwebp -q 100 -lossless "$page" -o "${page%."$FILE_EXT"}.webp"
      done
      magick "$WORK_DIR"/*.webp "$WORK_FILE"
      ;;
    *)
      echo "[!] Unknown method for compression!" >&2
      exit 1
      ;;
  esac
}
#######################################
# Pause until the user presses Enter.
# The answer is read from /dev/tty rather than from the script's stdin.
# -r keeps a trailing backslash from being treated as a line continuation,
# which would otherwise make read wait for a second line.
#######################################
continue_prompt() {
  read -r -p "When complete, press enter to continue..." </dev/tty
}
# Main loop: process every regular file found under the paths given on the
# command line and overwrite each one in place with the cleaned result.
# find emits NUL-terminated names and read runs with IFS= and -d '' so that
# names with leading/trailing blanks or embedded newlines arrive unmodified.
while IFS= read -r -d '' i; do
  echo "[!] Processing PDF: $i"
  # Start from an empty scratch directory. The :? expansion aborts instead
  # of running rm -rf when WORK_DIR is unset or empty.
  rm -rf -- "${WORK_DIR:?}"
  mkdir -p -- "$WORK_DIR"

  if [[ "$REMOVE_WATERMARK" == "true" ]]; then
    # Uncompress PDF (qpdf QDF mode) so that sed can edit its content
    echo "[*] Uncompressing PDF file"
    qpdf --qdf --object-streams=disable "$i" "$WORK_FILE"
    # Known watermark
    echo "[*] Removing known watermarks"
    sed -i "s/66F4591A8A6653776B618FCE700F89BD//g" "$WORK_FILE" # dse.pp text
    sed -i "s/\ http:\/\/dsepp.com//g" "$WORK_FILE" # dse.pp link
    sed -i "s/\(Provided by dse.life\|Provided \|by \|dse.life\)//g" "$WORK_FILE" # dse.life watermark
    sed -i "s/\x2E\x31\x36\x31\x20\x30\x20\x6C//g" "$WORK_FILE" # dse.life watermark underline
    # H;1h;$!d;x gathers the whole file into the pattern space first, so the
    # expression below can match across a newline.
    sed -i 'H;1h;$!d;x; s/\x77\n\x30\x20\x2D\x31\x36\x2E\x38\x35\x20\x6D//g' "$WORK_FILE" # dse.life watermark underline
  else
    cp -- "$i" "$WORK_FILE"
  fi

  # Optional image pass: render/extract the pages as images, let the user
  # rearrange them, then rebuild the PDF with compress_pdf.
  if [[ -n "$COMPRESS_TYPE" ]]; then
    echo "[*] Converting to black and white"
    echo "- Converting pages to image files"
    # Validate the method in the main shell: an exit inside the subshell
    # below would only leave the subshell and the script would carry on.
    case "$EXTRACT_METHOD" in
      ppm | images | magick) ;;
      *)
        echo "[!] Unknown method for extraction!" >&2
        exit 1
        ;;
    esac
    (
      # Subshell keeps the directory change local to the extraction step.
      cd "$WORK_DIR" || exit 1
      case "$EXTRACT_METHOD" in
        ppm) pdftoppm temp.pdf pages -r 300 -scale-to 6000 ;;
        images) pdfimages temp.pdf pages ;;
        magick) magick -density 300 temp.pdf pages-%d.ppm ;;
      esac
    )
    echo "Rearrange the files in $WORK_DIR if necessary."
    continue_prompt
    echo "- Compressing into PDF file"
    # Use .ppm pages when any exist, otherwise fall back to .pbm.
    FILE_EXT="ppm"
    compgen -G "$WORK_DIR/*.ppm" > /dev/null || FILE_EXT="pbm"
    compress_pdf
    # Let the user inspect the result and retry after editing the images.
    while true; do
      read -r -p "Is the output PDF file ($WORK_FILE) of satisfactory quality? [y/n] " yn </dev/tty
      case "$yn" in
        [Yy]*)
          break
          ;;
        [Nn]*)
          echo "Please modify the ppm files in $WORK_DIR accordingly."
          continue_prompt
          compress_pdf
          ;;
        *)
          echo "Please answer yes or no."
          ;;
      esac
    done
  fi

  # Extra questions
  # ROTATION is an on/off switch; the angle itself is asked interactively
  # and applied relative to the current rotation of every page.
  if [[ "$ROTATION" == true ]]; then
    read -r -p "Please input the value (in degrees) you want all the PDF pages to rotate: " deg </dev/tty
    qpdf "--rotate=+$deg" --replace-input "$WORK_FILE"
  fi
  # DRY_RUN only pauses here; the save below still runs after Enter.
  if [[ "$DRY_RUN" == "true" ]]; then
    echo "[*] Holding save (dry run is enabled)"
    continue_prompt
  fi

  # Save: write the result back over the input file.
  echo "[*] Saving file"
  case "$SAVE_METHOD" in
    gs)
      # gs streams the rewritten PDF to stdout; sponge lets WORK_FILE be
      # both the input and the destination of that pipeline.
      gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/default \
        -dNOPAUSE -dQUIET -dBATCH -sOutputFile=- -f "$WORK_FILE" \
        -sPAPERSIZE=a4 -c "[ /Title () /DOCINFO pdfmark" \
        | sponge "$WORK_FILE"
      pdftocairo -r 300 -pdf "$WORK_FILE" "$i"
      ;;
    qpdf)
      qpdf "$WORK_FILE" "$i"
      ;;
    *)
      echo "[!] Unknown method for saving!" >&2
      exit 1
      ;;
  esac

  echo "[*] Cleaning up"
  rm -rf -- "${WORK_DIR:?}"
done < <(find "$@" -type f -print0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment