Skip to content

Instantly share code, notes, and snippets.

@raczben
Last active March 27, 2025 16:18
Show Gist options
  • Save raczben/577c393586e2b9e7bc7a190746a6ed90 to your computer and use it in GitHub Desktop.
Save raczben/577c393586e2b9e7bc7a190746a6ed90 to your computer and use it in GitHub Desktop.
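A simple Python script that automates the steps around Scan Tailor Advanced and OCRmyPDF: it splits a PDF into page images, waits for Scan Tailor Advanced to process them, then builds an OCRed PDF.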
#!/usr/bin/env python3
import os
import sys
import subprocess
from PIL import Image # Convert tif to pdf
import os
import math
import glob
def convert_tif_to_pdf(input_folder, output_pdf):
    """Combine the TIFF images found in a folder into one multi-page PDF.

    Files directly inside ``input_folder`` whose names end in ``.tif`` or
    ``.tiff`` (case-insensitive) become the pages, in sorted filename order.
    When no such file exists a message is printed and nothing is written.

    Args:
        input_folder: Directory that holds the TIFF pages.
        output_pdf: Path of the PDF file to create.

    Returns:
        None.
    """
    tif_files = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith(('.tif', '.tiff'))
    )
    if not tif_files:
        print("No .tif files found in the directory.")
        return

    images = []
    try:
        for name in tif_files:
            # Deliberately no .convert("RGB"): the resulting file would be large.
            images.append(Image.open(os.path.join(input_folder, name)))
        # The first image is saved and the remaining ones are appended as pages.
        images[0].save(output_pdf, save_all=True, append_images=images[1:])
    finally:
        # Image.open is lazy and keeps each file open, so release the handles
        # explicitly once the PDF is written (or writing failed).
        for image in images:
            image.close()
def _count_pdf_pages(pdf_file):
    """Return the number of pages in ``pdf_file``, or None if it cannot be read."""
    try:
        import PyPDF2
    except ImportError:
        print('PyPDF2 is not installed (use pip install PyPDF2) --> one thread')
        return None
    try:
        # The reader needs the file while pages are counted, so count inside
        # the ``with`` block and let it close the handle afterwards.
        with open(pdf_file, 'rb') as handle:
            return len(PyPDF2.PdfReader(handle).pages)
    except Exception as ex:
        # Best effort only: any reader failure falls back to one process.
        print(f'{ex} Error calling PdfReader --> one thread')
        return None


def _page_ranges(page_count, workers):
    """Split pages 1..page_count into contiguous (first, last) ranges, one per worker."""
    per_worker = math.ceil(page_count / workers)
    ranges = []
    for index in range(workers):
        first = index * per_worker + 1
        last = min((index + 1) * per_worker, page_count)
        if first <= last:
            ranges.append((first, last))
    return ranges


def _rasterize_pdf(pdf_file, page_count):
    """Render ``pdf_file`` to ``out-*.png`` with pdftoppm; return True on success."""
    base_command = ["pdftoppm", pdf_file, "out", "-png"]
    if page_count:
        # Use several parallel pdftoppm processes unless the document is small.
        workers = 4 if page_count > 20 else 1
        print(f'NoFPages: {page_count} threads: {workers}')
        commands = [
            base_command + ["-f", str(first), "-l", str(last)]
            for first, last in _page_ranges(page_count, workers)
        ]
    else:
        # Page count unknown: give no page range so pdftoppm renders every page.
        commands = [base_command]

    processes = [subprocess.Popen(command) for command in commands]
    # Wait for every process before judging the result.
    return_codes = [process.wait() for process in processes]
    return all(code == 0 for code in return_codes)


def _scale_page_images(percent):
    """Resize every ``out-*.png`` in the current directory to ``percent`` % of its size."""
    for img_path in glob.glob('out-*.png'):
        with Image.open(img_path) as img:
            width, height = img.size
            new_size = (
                max(1, int(width * percent / 100)),
                max(1, int(height * percent / 100)),
            )
            resized = img.resize(new_size)
        # The source file is closed at this point, so it can be overwritten.
        resized.save(img_path)


def main(input_pdf_file):
    """Run the PDF -> page images -> ScanTailor -> OCR PDF workflow.

    Steps, in order:
      1. Create ``<cwd>/<pdf name without extension>``, copy the PDF into it
         and change the current directory to it.
      2. Render the pages to ``out-*.png`` with ``pdftoppm``.
      3. Optionally scale the rendered images down (asked interactively).
      4. Wait for the user to confirm that ScanTailor Advanced has finished.
      5. Merge the TIFF files in ``out`` into ``<name>-c.pdf`` and pipe that
         file through the ``ocrmypdf-165-ita`` docker image to produce
         ``<name>-ocr.pdf``.

    Args:
        input_pdf_file: Path of the PDF to process.

    Raises:
        SystemExit: With code 1 when the input file does not exist, pdftoppm
            fails, the scaling answer is invalid, or ScanTailor produced no
            usable output.
        ValueError: When the OCR command exits with a non-zero status.
    """
    import shutil  # only needed here; kept local like the PyPDF2 import

    if not os.path.isfile(input_pdf_file):
        print("Error: File does not exist.")
        sys.exit(1)

    pdf_file = os.path.basename(input_pdf_file)
    filename = os.path.splitext(pdf_file)[0]

    # Create a working directory based on the filename
    work_dir = os.path.join(os.getcwd(), filename)
    print(f'Create a working directory: {work_dir}')
    os.makedirs(work_dir, exist_ok=True)

    # Copy the PDF to the working directory
    print('Copy the PDF to the working directory')
    pdf_path = os.path.join(work_dir, pdf_file)
    # Skip the copy when the input already is the working copy (e.g. a re-run
    # on <name>/<name>.pdf); copying a file onto itself would fail.
    already_in_place = (
        os.path.exists(pdf_path) and os.path.samefile(input_pdf_file, pdf_path)
    )
    if not already_in_place:
        shutil.copy(input_pdf_file, pdf_path)

    # Everything below uses paths relative to the working directory.
    os.chdir(work_dir)

    print('Check pdf size...')
    page_count = _count_pdf_pages(pdf_file)
    if not _rasterize_pdf(pdf_file, page_count):
        print('Error: pdftoppm failed to render the PDF pages')
        sys.exit(1)

    print('The max recommended resolution is 1000x2000 (for single pages) and 2000x4000 for double pages.')
    answer = input('Do you want to reduce the images? [n, percent-of-original-size] ').strip()
    if answer.lower() != 'n':
        try:
            percent = int(answer)
        except ValueError:
            print(f'Error: expected "n" or a whole number, got {answer!r}')
            sys.exit(1)
        if percent < 10 or percent > 100:
            print('Error: the percent must be between 10 and 100')
            sys.exit(1)
        print(f'Scale the images to {percent}% of their original size')
        _scale_page_images(percent)

    input("If ScanTailor Advanced is finisched the process press enter...")

    print("convert_tif_to_pdf")
    pdfc = f"{filename}-c.pdf"
    pdfocr = f"{filename}-ocr.pdf"
    if not os.path.isdir('out'):
        print("Error: the ScanTailor output directory 'out' does not exist.")
        sys.exit(1)
    convert_tif_to_pdf('out', pdfc)
    if not os.path.isfile(pdfc):
        # convert_tif_to_pdf writes nothing when 'out' holds no TIFF files.
        print(f"Error: {pdfc} was not created from the files in 'out'.")
        sys.exit(1)

    print('ocrmypdf...')
    # "-" "-" makes the container read the PDF from stdin and write to stdout.
    command = [
        "docker", "run", "-i", "ocrmypdf-165-ita", "-l", "ita", "-", "-"
    ]
    with open(pdfc, "rb") as infile, open(pdfocr, "wb") as outfile:
        process = subprocess.run(command, stdin=infile, stdout=outfile)
    if process.returncode != 0:
        raise ValueError(f"ocrmypdf failed with exit code {process.returncode}")
    print(f"OCR processed successfully. Output saved to {pdfocr}")

    print("Processing completed. Output files are in:", work_dir)
if __name__ == "__main__":
    # Exactly one positional argument is expected: the PDF to process.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python script.py <pdf_file>")
        sys.exit(1)
    main(cli_args[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment