Last active
March 11, 2025 16:56
-
-
Save rcky844/a3d37d802072a28c7c00d5b77522b945 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# pdfcleanse 2025.03.12 | |
# By Ricky Cheung <[email protected]> | |
# Variables: | |
# - COMPRESS_TYPE: bw, posterize, jpeg, webp | |
# - COMPRESS_BW_BRIGHTNESS (brightness[xcontrast] passed to magick -brightness-contrast - default: 15X1)
# - COMPRESS_POSTERIZE_LEVEL (int - default: 2) | |
# - EXTRACT_METHOD: ppm, images, magick (default: ppm) | |
# - SAVE_METHOD: gs, qpdf (default: gs) | |
# - REMOVE_WATERMARK (boolean - default: false) | |
# - ROTATION (int - default: 0) | |
# - DRY_RUN (boolean - default: false) | |
# Apply defaults to the tunables documented in the header. A value that is
# already set (and non-empty) in the environment is kept.
: "${COMPRESS_BW_BRIGHTNESS:=15X1}"
: "${COMPRESS_POSTERIZE_LEVEL:=2}"
: "${EXTRACT_METHOD:=ppm}"
: "${SAVE_METHOD:=gs}"

# Scratch directory and the intermediate PDF every processing step works on.
readonly WORK_DIR="/tmp/pdfcleanse"
readonly WORK_FILE="$WORK_DIR/temp.pdf"
# Require at least one input path. Counting the arguments avoids the implicit
# concatenation that "$@" undergoes inside [[ ]]; a usage error is reported on
# stderr with a non-zero status.
if (( $# == 0 )); then
  echo "Usage: pdfcleanse input.pdf" >&2
  exit 1
fi
#######################################
# Rebuild $WORK_FILE from the page images found in $WORK_DIR.
# Globals:
#   COMPRESS_TYPE            (read) bw | posterize | jpeg | webp
#   COMPRESS_BW_BRIGHTNESS   (read) argument of -brightness-contrast (bw)
#   COMPRESS_POSTERIZE_LEVEL (read) argument of -posterize (posterize)
#   FILE_EXT                 (read) extension of the extracted page images
#   WORK_DIR, WORK_FILE      (read) scratch directory and output PDF
# Returns:
#   Status of the last conversion command; 1 when no page image matches.
#   Exits the script with status 1 on an unknown COMPRESS_TYPE.
#######################################
compress_pdf() {
  local -a pages
  local page

  case "$COMPRESS_TYPE" in
    bw | posterize | jpeg | webp) ;;
    *)
      echo "[!] Unknown method for compression!" >&2
      exit 1
      ;;
  esac

  # An unmatched glob is left as the literal pattern, so test that the first
  # entry really exists before handing the list to the converters.
  pages=("$WORK_DIR"/*."$FILE_EXT")
  if [[ ! -e "${pages[0]}" ]]; then
    echo "[!] No .$FILE_EXT page images found in $WORK_DIR" >&2
    return 1
  fi

  case "$COMPRESS_TYPE" in
    bw)
      # Stream the pages as bilevel Group4 TIFF into a second magick call
      # that writes the PDF.
      magick "${pages[@]}" -brightness-contrast "$COMPRESS_BW_BRIGHTNESS" \
        -compress Group4 -type bilevel TIFF:- \
        | magick - "$WORK_FILE"
      ;;
    posterize)
      magick "${pages[@]}" -posterize "$COMPRESS_POSTERIZE_LEVEL" "$WORK_FILE"
      ;;
    jpeg)
      magick mogrify -format JPEG -quality 80 "${pages[@]}"
      magick "$WORK_DIR"/*.JPEG "$WORK_FILE"
      ;;
    webp)
      for page in "${pages[@]}"; do
        cwebp -q 100 -lossless "$page" -o "${page%."$FILE_EXT"}.webp"
      done
      magick "$WORK_DIR"/*.webp "$WORK_FILE"
      ;;
  esac
}
# Block until the user presses enter. Input comes from /dev/tty because the
# main loop's stdin is the file list produced by find. -r keeps a typed
# backslash from being treated as a line continuation.
continue_prompt() {
  read -r -p "When complete, press enter to continue..." 0</dev/tty
}
# Main loop: process every regular file found under the paths given on the
# command line, one at a time, overwriting each input in place. File names are
# read NUL-delimited with IFS cleared so whitespace, backslashes and newlines
# in paths survive unchanged.
while IFS= read -r -d '' i; do
  echo "[!] Processing PDF: $i"
  # Start from an empty scratch directory; :? refuses to run rm on an empty path.
  rm -rf -- "${WORK_DIR:?}"
  mkdir -p -- "$WORK_DIR"

  if [[ "$REMOVE_WATERMARK" == "true" ]]; then
    # Uncompress PDF so the content streams can be edited as text
    echo "[*] Uncompressing PDF file"
    qpdf --qdf --object-streams=disable "$i" "$WORK_FILE"
    # Known watermark
    echo "[*] Removing known watermarks"
    sed -i "s/66F4591A8A6653776B618FCE700F89BD//g" "$WORK_FILE" # dse.pp text
    sed -i "s/\ http:\/\/dsepp.com//g" "$WORK_FILE" # dse.pp link
    sed -i "s/\(Provided by dse.life\|Provided \|by \|dse.life\)//g" "$WORK_FILE" # dse.life watermark
    # Hex spells ".161 0 l"
    sed -i "s/\x2E\x31\x36\x31\x20\x30\x20\x6C//g" "$WORK_FILE" # dse.life watermark underline
    # Hex spells "w<newline>0 -16.85 m"; H;1h;$!d;x loads the whole file into
    # the pattern space so the match can span the line break.
    sed -i 'H;1h;$!d;x; s/\x77\n\x30\x20\x2D\x31\x36\x2E\x38\x35\x20\x6D//g' "$WORK_FILE" # dse.life watermark underline
  else
    cp -- "$i" "$WORK_FILE"
  fi

  # Optional recompression: rasterize the pages, let the user rearrange or
  # edit them, then rebuild the PDF with compress_pdf.
  if [[ -n "$COMPRESS_TYPE" ]]; then
    echo "[*] Converting to black and white"
    echo "- Converting pages to image files"
    # Each extractor runs in a subshell so the cd does not leak. The
    # unknown-method branch runs in the main shell so that exit really
    # aborts the script.
    case "$EXTRACT_METHOD" in
      ppm)
        (cd "$WORK_DIR" && pdftoppm temp.pdf pages -r 300 -scale-to 6000)
        ;;
      images)
        (cd "$WORK_DIR" && pdfimages temp.pdf pages)
        ;;
      magick)
        (cd "$WORK_DIR" && magick -density 300 temp.pdf pages-%d.ppm)
        ;;
      *)
        echo "[!] Unknown method for extraction!" >&2
        exit 1
        ;;
    esac
    echo "Rearrange the files in $WORK_DIR if necessary."
    continue_prompt
    echo "- Compressing into PDF file"
    # Prefer ppm pages; fall back to pbm when no ppm file was produced.
    FILE_EXT="ppm"
    compgen -G "$WORK_DIR/*.ppm" > /dev/null || FILE_EXT="pbm"
    compress_pdf
    # Let the user tweak the page images and rebuild until satisfied.
    while true; do
      read -r -p "Is the output PDF file ($WORK_FILE) of satisfactory quality? [y/n] " yn 0</dev/tty
      case "$yn" in
        [Yy]*)
          break
          ;;
        [Nn]*)
          echo "Please modify the ppm files in $WORK_DIR accordingly."
          continue_prompt
          compress_pdf
          ;;
        *)
          echo "Please answer yes or no."
          ;;
      esac
    done
  fi

  # Optional rotation. ROTATION=true asks for the angle interactively; a
  # non-zero integer (the form documented in the header) is used directly.
  deg=""
  if [[ "$ROTATION" == "true" ]]; then
    read -r -p "Please input the value (in degrees) you want all the PDF pages to rotate: " deg 0</dev/tty
  elif [[ "$ROTATION" =~ ^[+-]?[0-9]+$ && ! "$ROTATION" =~ ^[+-]?0+$ ]]; then
    deg=$ROTATION
  fi
  if [[ -n "$deg" ]]; then
    # qpdf takes a relative angle as +N or -N: keep an explicit sign when one
    # was given, otherwise rotate by +N.
    [[ "$deg" == [+-]* ]] || deg="+$deg"
    qpdf "--rotate=$deg" --replace-input "$WORK_FILE"
  fi

  # DRY_RUN pauses before the input file is overwritten so the intermediate
  # result can be inspected; the save below still runs once the user continues.
  if [[ "$DRY_RUN" == "true" ]]; then
    echo "[*] Holding save (dry run is enabled)"
    continue_prompt
  fi

  # Save
  echo "[*] Saving file"
  case "$SAVE_METHOD" in
    gs)
      # sponge buffers all of gs's output before rewriting the file gs is
      # still reading from.
      gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/default \
        -dNOPAUSE -dQUIET -dBATCH -sOutputFile=- -f "$WORK_FILE" \
        -sPAPERSIZE=a4 -c "[ /Title () /DOCINFO pdfmark" \
        | sponge "$WORK_FILE"
      pdftocairo -r 300 -pdf "$WORK_FILE" "$i"
      ;;
    qpdf)
      qpdf "$WORK_FILE" "$i"
      ;;
    *)
      echo "[!] Unknown method for saving!" >&2
      exit 1
      ;;
  esac

  echo "[*] Cleaning up"
  rm -rf -- "${WORK_DIR:?}"
done < <(find "$@" -type f -print0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment