Last active
February 2, 2024 13:33
-
-
Save brozkeff/76a8c537b15458a58355949c4c70da08 to your computer and use it in GitHub Desktop.
OCR PDF document using Tesseract and compress using JBIG2 lossy compression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Script identity. These values are only used for the start-up banner below.
SCRIPT_NAME="OCR and jbig2 compress document using Tesseract"
VERSION_NUMBER="v2024-02-02"
AUTHOR="Martin Brozkeff Malec - https://brozkeff.net"
LICENSE="MIT (https://opensource.org/license/mit/)"
# Shared on https://gist.github.com/brozkeff/76a8c537b15458a58355949c4c70da08

# Print the banner (name + version, author, license) on every run.
printf '%s %s\n' "$SCRIPT_NAME" "$VERSION_NUMBER"
printf 'Author: %s\n' "$AUTHOR"
printf 'License: %s\n' "$LICENSE"
# This script performs OCR on a PDF file, optionally compresses it with lossy
# jbig2, and saves the result next to the input with a '-OCR' suffix.
#
# Requirements (Debian/Ubuntu package names):
#   tesseract-ocr               main Tesseract package
#   tesseract-ocr-ces           Czech
#   tesseract-ocr-eng           English
#   tesseract-ocr-script-tibt   Tibetan
#   jbig2enc                    needed for jbig2 lossy compression
# NOTE: the script also calls the 'ocrmypdf' command, which is not listed in
# REQUIRED_PKGS; it has to be installed separately.
#
# Configuration:
#   LANGS          Languages passed to OCR, joined with '+'. When adding or
#                  removing a language, update REQUIRED_PKGS accordingly.
#   REQUIRED_PKGS  Packages verified (and, where possible, installed) at start.
#   OPT_LEVEL      Optimization level: default is 1. Options are 0-3.
#                  For lossy compression, 2-3 is required.
#   JBIG2_LOSSY    1 adds --jbig2-lossy to the ocrmypdf options.
#   REDO_OCR       1 adds --redo-ocr to the ocrmypdf options, forcing a new OCR.
LANGS="ces+eng+Tibetan"
REQUIRED_PKGS=(
  "tesseract-ocr"
  "tesseract-ocr-ces"
  "tesseract-ocr-eng"
  "tesseract-ocr-script-tibt"
  "jbig2enc"
)
OPT_LEVEL="3"
JBIG2_LOSSY="1"
REDO_OCR="1"
echo "Script to process PDF file with Tesseract OCR and compress using jbig2."

#######################################
# Report whether a Debian package is fully installed.
# Queries the exact package name, so 'tesseract-ocr' is not satisfied by
# 'tesseract-ocr-eng', and packages that were removed but still have config
# files left behind are not counted as installed.
# Arguments: $1 - package name
# Returns:   0 if dpkg reports the package as installed; non-zero otherwise
#            (unknown package, not installed, or dpkg-query unavailable)
#######################################
is_pkg_installed() {
  local status
  status=$(dpkg-query -W -f='${Status}' "$1" 2>/dev/null) || return 1
  # Status is "<want> ok installed" (want may be 'install' or 'hold').
  [[ "$status" == *"ok installed"* ]]
}

# Collect every required package that is not installed.
MISSING_PKGS=()
if command -v dpkg-query >/dev/null 2>&1; then
  for pkg in "${REQUIRED_PKGS[@]}"; do
    if ! is_pkg_installed "$pkg"; then
      MISSING_PKGS+=("$pkg")
    fi
  done
else
  echo "Warning: dpkg-query not found; skipping the package check." >&2
fi

# jbig2enc only gets a warning; everything else is queued for one apt run.
APT_PKGS=()
for missing_pkg in "${MISSING_PKGS[@]}"; do
  if [[ "$missing_pkg" == "jbig2enc" ]]; then
    echo "Warning: 'jbig2enc' package is missing. It is not available in default Debian/Ubuntu repositories." >&2
    echo "You may add the 'https://notesalexp.org/' repository to your APT sources to install 'jbig2enc'." >&2
    echo "Note: The absence of 'jbig2enc' might affect lossy compression capabilities." >&2
  else
    echo "Missing package: $missing_pkg"
    APT_PKGS+=("$missing_pkg")
  fi
done

if (( ${#APT_PKGS[@]} > 0 )); then
  echo "Attempting to install missing Tesseract OCR packages. Please provide your password if prompted."
  # Refresh the index once, then install everything in a single transaction.
  # Installation stays best-effort: failures are reported but do not abort.
  sudo apt-get update \
    || echo "Warning: 'apt-get update' failed; trying to install anyway." >&2
  sudo apt-get install -y "${APT_PKGS[@]}" \
    || echo "Warning: could not install: ${APT_PKGS[*]}. OCR may fail." >&2
fi
# Check if a filename is provided; diagnostics go to stderr with a usage hint.
if [[ -z "${1:-}" ]]; then
  echo "No filename provided. Exiting..." >&2
  echo "Usage: ${0##*/} FILE.pdf" >&2
  exit 1
fi
# Fail early with a clear message rather than after the OCR tool has started.
if [[ ! -e "$1" ]]; then
  echo "Input file '$1' does not exist. Exiting..." >&2
  exit 1
fi
#######################################
# Build the array of ocrmypdf options from the configuration variables.
# Globals:   LANGS, OPT_LEVEL, JBIG2_LOSSY, REDO_OCR (read)
#            OCR_OPTIONS (written)
# Arguments: none
# Returns:   0
#######################################
build_ocr_options() {
  OCR_OPTIONS=(-l "$LANGS" "-O${OPT_LEVEL}")
  # An unset or empty toggle counts as 0 instead of raising a test error.
  if [[ "${JBIG2_LOSSY:-0}" -eq 1 ]]; then
    OCR_OPTIONS+=("--jbig2-lossy")
  fi
  if [[ "${REDO_OCR:-0}" -eq 1 ]]; then
    OCR_OPTIONS+=("--redo-ocr")
  fi
  return 0
}
build_ocr_options
# Perform OCR using ocrmypdf.
# A missing binary is reported explicitly so it is not mistaken for a bad input.
if ! command -v ocrmypdf >/dev/null 2>&1; then
  echo "ocrmypdf is not installed or not in PATH. Please install it and try again." >&2
  exit 127
fi

# Output goes next to the input: strip a trailing .pdf (any letter case) and
# append the '-OCR.pdf' suffix.
INPUT_PDF="$1"
OUTPUT_PDF="${INPUT_PDF%.[Pp][Dd][Ff]}-OCR.pdf"

if ocrmypdf "${OCR_OPTIONS[@]}" "$INPUT_PDF" "$OUTPUT_PDF"; then
  echo "OCR completed successfully. Output saved to '${OUTPUT_PDF}'."
else
  # In the else branch $? still holds ocrmypdf's exit status.
  ERROR_CODE=$?
  echo "OCR failed with error code $ERROR_CODE. Please check the input file and try again." >&2
  # Propagate the failure so callers and automation can detect it.
  exit "$ERROR_CODE"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment