brozkeff · February 2, 2024 13:33
diff --git a/pdf-ocr-compress.sh b/pdf-ocr-compress.sh
 #!/bin/bash
 # Identity strings for this script; all four appear in the start-up banner.
 # Shared on https://gist.github.com/brozkeff/76a8c537b15458a58355949c4c70da08
 SCRIPT_NAME='OCR and jbig2 compress document using Tesseract'
 VERSION_NUMBER='v2024-02-02'
 AUTHOR='Martin Brozkeff Malec - https://brozkeff.net'
 LICENSE='MIT (https://opensource.org/license/mit/)'

 # Emit the three banner lines (name + version, author, license) in one call.
 printf '%s\n' \
   "${SCRIPT_NAME} ${VERSION_NUMBER}" \
   "Author: ${AUTHOR}" \
   "License: ${LICENSE}"

 # What this script does: runs OCR on a PDF file and saves the result with a
 # '-OCR' suffix, optionally compressing it with lossy jbig2.
 #
 # Packages expected on the system before running:
 #   tesseract-ocr              - the main Tesseract package
 #   tesseract-ocr-ces          - Czech
 #   tesseract-ocr-eng          - English
 #   tesseract-ocr-script-tibt  - Tibetan
 #   jbig2enc                   - required for jbig2 lossy compression
 #
 # Settings:
 #   LANGS          '+'-separated languages to process; when adding or removing
 #                  a language, adjust REQUIRED_PKGS to match.
 #   REQUIRED_PKGS  packages that are looked up before OCR starts.
 #   OPT_LEVEL      optimization level, 0-3 (default is 1); lossy compression
 #                  requires 2 or 3.
 #   JBIG2_LOSSY    1 enables jbig2 lossy compression.
 #   REDO_OCR       1 forces making new OCR.
 LANGS='ces+eng+Tibetan'
 REQUIRED_PKGS=(
   'tesseract-ocr'
   'tesseract-ocr-ces'
   'tesseract-ocr-eng'
   'tesseract-ocr-script-tibt'
   'jbig2enc'
 )
 OPT_LEVEL='3'
 JBIG2_LOSSY='1'
 REDO_OCR='1'

 printf '%s\n' "Script to process PDF file with Tesseract OCR and compress using jbig2."

 # Report whether Debian package $1 is currently installed.
 # Asks dpkg-query for the package's own status instead of word-grepping the
 # whole `dpkg -l` listing: a word match for "tesseract-ocr" also hits the
 # "tesseract-ocr-eng" line, and hits removed packages that left config files.
 # Arguments: $1 - package name
 # Returns:   0 when installed, non-zero otherwise (including unknown packages)
 is_pkg_installed() {
   local status
   # Single quotes are intentional: ${Status} is a dpkg-query format field.
   # shellcheck disable=SC2016
   status=$(dpkg-query -W -f='${Status}' "$1" 2>/dev/null) || return 1
   [[ "$status" == *"ok installed"* ]]
 }

 # Collect every required package that is not installed.
 MISSING_PKGS=()

 for pkg in "${REQUIRED_PKGS[@]}"; do
   if ! is_pkg_installed "$pkg"; then
     MISSING_PKGS+=("$pkg")
   fi
 done

 # Split the missing packages: jbig2enc only gets a warning (it is not in the
 # default repositories), everything else is queued for installation.
 INSTALLABLE_PKGS=()

 for missing_pkg in "${MISSING_PKGS[@]}"; do
   if [ "$missing_pkg" == "jbig2enc" ]; then
     echo "Warning: 'jbig2enc' package is missing. It is not available in default Debian/Ubuntu repositories."
     echo "You may add the 'https://notesalexp.org/' repository to your APT sources to install 'jbig2enc'."
     echo "Note: The absence of 'jbig2enc' might affect lossy compression capabilities."
   else
     echo "Missing package: $missing_pkg"
     INSTALLABLE_PKGS+=("$missing_pkg")
   fi
 done

 # Refresh the package index once and install everything in a single
 # transaction, rather than repeating `apt-get update` per missing package.
 if [ ${#INSTALLABLE_PKGS[@]} -ne 0 ]; then
   echo "Attempting to install missing Tesseract OCR packages. Please provide your password if prompted."
   sudo apt-get update
   sudo apt-get install -y "${INSTALLABLE_PKGS[@]}" ||
     echo "Warning: installing ${INSTALLABLE_PKGS[*]} failed; OCR may not work." >&2
 fi

 # Check if a filename is provided.
 # The input PDF is taken from the first positional argument; without it there
 # is nothing to process, so report on stderr (diagnostics do not belong on
 # stdout) and stop with status 1.
 if [ -z "$1" ]; then
   echo "No filename provided. Exiting..." >&2
   echo "Usage: $0 <file.pdf>" >&2
   exit 1
 fi

 # Assemble the ocrmypdf arguments as an array so every option stays one word.
 # Language list and optimization level are always passed.
 OCR_OPTIONS=(-l "$LANGS" "-O${OPT_LEVEL}")
 # The optional switches are appended only when their toggle equals 1.
 [ "$JBIG2_LOSSY" -eq 1 ] && OCR_OPTIONS+=(--jbig2-lossy)
 [ "$REDO_OCR" -eq 1 ] && OCR_OPTIONS+=(--redo-ocr)

 # Perform OCR using ocrmypdf.
 # The output path is the input path with a trailing '.pdf' (if any) replaced
 # by '-OCR.pdf'. On failure the message goes to stderr and the script exits
 # with ocrmypdf's own status, so callers and batch loops can detect the error
 # instead of always seeing exit status 0.
 OUTPUT_FILE="${1%.pdf}-OCR.pdf"
 if ocrmypdf "${OCR_OPTIONS[@]}" "$1" "$OUTPUT_FILE"; then
   echo "OCR completed successfully. Output saved to '$OUTPUT_FILE'."
 else
   # In the else branch $? still holds the status of the ocrmypdf call.
   ERROR_CODE=$?
   echo "OCR failed with error code $ERROR_CODE. Please check the input file and try again." >&2
   exit "$ERROR_CODE"
 fi
	#!/bin/bash
	# Identity strings for this script; all four appear in the start-up banner.
	# Shared on https://gist.github.com/brozkeff/76a8c537b15458a58355949c4c70da08
	SCRIPT_NAME='OCR and jbig2 compress document using Tesseract'
	VERSION_NUMBER='v2024-02-02'
	AUTHOR='Martin Brozkeff Malec - https://brozkeff.net'
	LICENSE='MIT (https://opensource.org/license/mit/)'

	# Emit the three banner lines (name + version, author, license) in one call.
	printf '%s\n' \
		"${SCRIPT_NAME} ${VERSION_NUMBER}" \
		"Author: ${AUTHOR}" \
		"License: ${LICENSE}"

	# What this script does: runs OCR on a PDF file and saves the result with a
	# '-OCR' suffix, optionally compressing it with lossy jbig2.
	#
	# Packages expected on the system before running:
	#   tesseract-ocr              - the main Tesseract package
	#   tesseract-ocr-ces          - Czech
	#   tesseract-ocr-eng          - English
	#   tesseract-ocr-script-tibt  - Tibetan
	#   jbig2enc                   - required for jbig2 lossy compression
	#
	# Settings:
	#   LANGS          '+'-separated languages to process; when adding or removing
	#                  a language, adjust REQUIRED_PKGS to match.
	#   REQUIRED_PKGS  packages that are looked up before OCR starts.
	#   OPT_LEVEL      optimization level, 0-3 (default is 1); lossy compression
	#                  requires 2 or 3.
	#   JBIG2_LOSSY    1 enables jbig2 lossy compression.
	#   REDO_OCR       1 forces making new OCR.
	LANGS='ces+eng+Tibetan'
	REQUIRED_PKGS=(
		'tesseract-ocr'
		'tesseract-ocr-ces'
		'tesseract-ocr-eng'
		'tesseract-ocr-script-tibt'
		'jbig2enc'
	)
	OPT_LEVEL='3'
	JBIG2_LOSSY='1'
	REDO_OCR='1'

	printf '%s\n' "Script to process PDF file with Tesseract OCR and compress using jbig2."

	# Report whether Debian package $1 is currently installed.
	# Asks dpkg-query for the package's own status. The previous check piped
	# `dpkg -l` into grep through an escaped `\|`, which hands "|", "grep" and
	# "-qw" to dpkg as arguments, so every package was reported missing; even
	# with a real pipe, a word match for "tesseract-ocr" also hits the
	# "tesseract-ocr-eng" line and removed packages that left config files.
	# Arguments: $1 - package name
	# Returns:   0 when installed, non-zero otherwise (including unknown packages)
	is_pkg_installed() {
		local status
		# Single quotes are intentional: ${Status} is a dpkg-query format field.
		# shellcheck disable=SC2016
		status=$(dpkg-query -W -f='${Status}' "$1" 2>/dev/null) || return 1
		[[ "$status" == *"ok installed"* ]]
	}

	# Collect every required package that is not installed.
	MISSING_PKGS=()

	for pkg in "${REQUIRED_PKGS[@]}"; do
		if ! is_pkg_installed "$pkg"; then
			MISSING_PKGS+=("$pkg")
		fi
	done

	# Split the missing packages: jbig2enc only gets a warning (it is not in the
	# default repositories), everything else is queued for installation.
	INSTALLABLE_PKGS=()

	for missing_pkg in "${MISSING_PKGS[@]}"; do
		if [ "$missing_pkg" == "jbig2enc" ]; then
			echo "Warning: 'jbig2enc' package is missing. It is not available in default Debian/Ubuntu repositories."
			echo "You may add the 'https://notesalexp.org/' repository to your APT sources to install 'jbig2enc'."
			echo "Note: The absence of 'jbig2enc' might affect lossy compression capabilities."
		else
			echo "Missing package: $missing_pkg"
			INSTALLABLE_PKGS+=("$missing_pkg")
		fi
	done

	# Refresh the package index once and install everything in a single
	# transaction, rather than repeating `apt-get update` per missing package.
	if [ ${#INSTALLABLE_PKGS[@]} -ne 0 ]; then
		echo "Attempting to install missing Tesseract OCR packages. Please provide your password if prompted."
		sudo apt-get update
		sudo apt-get install -y "${INSTALLABLE_PKGS[@]}" ||
			echo "Warning: installing ${INSTALLABLE_PKGS[*]} failed; OCR may not work." >&2
	fi

	# Check if a filename is provided.
	# The input PDF is taken from the first positional argument; without it there
	# is nothing to process, so report on stderr (diagnostics do not belong on
	# stdout) and stop with status 1.
	if [ -z "$1" ]; then
		echo "No filename provided. Exiting..." >&2
		echo "Usage: $0 <file.pdf>" >&2
		exit 1
	fi

	# Assemble the ocrmypdf arguments as an array so every option stays one word.
	# Language list and optimization level are always passed.
	OCR_OPTIONS=(-l "$LANGS" "-O${OPT_LEVEL}")
	# The optional switches are appended only when their toggle equals 1.
	[ "$JBIG2_LOSSY" -eq 1 ] && OCR_OPTIONS+=(--jbig2-lossy)
	[ "$REDO_OCR" -eq 1 ] && OCR_OPTIONS+=(--redo-ocr)

	# Perform OCR using ocrmypdf.
	# The output path is the input path with a trailing '.pdf' (if any) replaced
	# by '-OCR.pdf'. On failure the message goes to stderr and the script exits
	# with ocrmypdf's own status, so callers and batch loops can detect the error
	# instead of always seeing exit status 0.
	OUTPUT_FILE="${1%.pdf}-OCR.pdf"
	if ocrmypdf "${OCR_OPTIONS[@]}" "$1" "$OUTPUT_FILE"; then
		echo "OCR completed successfully. Output saved to '$OUTPUT_FILE'."
	else
		# In the else branch $? still holds the status of the ocrmypdf call.
		ERROR_CODE=$?
		echo "OCR failed with error code $ERROR_CODE. Please check the input file and try again." >&2
		exit "$ERROR_CODE"
	fi