Last active
March 27, 2025 16:18
-
-
Save raczben/577c393586e2b9e7bc7a190746a6ed90 to your computer and use it in GitHub Desktop.
A simple Python script to help with the Scan Tailor Advanced and OCRmyPDF workflow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import sys | |
import subprocess | |
from PIL import Image # Convert tif to pdf | |
import os | |
import math | |
import glob | |
def convert_tif_to_pdf(input_folder, output_pdf):
    """Combine every ``.tif`` image in *input_folder* into one multi-page PDF.

    Files are matched case-insensitively on the ``.tif`` suffix and added to
    the PDF in sorted filename order.

    Args:
        input_folder: Directory that is scanned (non-recursively) for images.
        output_pdf: Path of the PDF file to write.

    Returns:
        None. When the folder holds no ``.tif`` file a message is printed and
        no output file is written.

    Raises:
        OSError: If *input_folder* cannot be listed or an image cannot be
            opened.
    """
    tif_files = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith('.tif')
    )
    if not tif_files:
        print("No .tif files found in the directory.")
        return

    images = []
    try:
        for name in tif_files:
            # The images are deliberately NOT converted to RGB: doing so
            # makes the resulting file large.
            images.append(Image.open(os.path.join(input_folder, name)))

        # Save images as a single PDF. The format is given explicitly so the
        # result does not depend on the extension of ``output_pdf``.
        images[0].save(
            output_pdf,
            format="PDF",
            save_all=True,
            append_images=images[1:],
        )
    finally:
        # Every page keeps its file open until the PDF has been written;
        # release the handles even if opening or saving failed part-way.
        for img in images:
            img.close()
# Page count used when the real one cannot be determined; it is passed to
# pdftoppm as the last page of the single conversion job.
# NOTE(review): relies on pdftoppm clamping an oversized last page -- confirm.
_UNKNOWN_PAGE_COUNT = 9999999


def _count_pdf_pages(pdf_file):
    """Return the number of pages in *pdf_file*, or None if it is unknown."""
    try:
        import PyPDF2
    except ImportError:
        print('PyPDF2 is not installed (use pip install PyPDF2) --> one thread')
        return None
    try:
        # The context manager closes the file once the pages are counted.
        with open(pdf_file, 'rb') as pdf_handle:
            return len(PyPDF2.PdfReader(pdf_handle).pages)
    except Exception as ex:
        # Best effort only: any reader problem falls back to a single job.
        print(f'{ex} Error calling PdfReader --> one thread')
        return None


def _split_pdf_to_png(pdf_file, page_count, workers):
    """Run *workers* parallel pdftoppm jobs and return their exit codes."""
    pages_per_worker = math.ceil(page_count / workers)
    processes = []
    for index in range(workers):
        first_page = index * pages_per_worker + 1
        last_page = min((index + 1) * pages_per_worker, page_count)
        command = ["pdftoppm", pdf_file, "out", "-png",
                   "-f", str(first_page),
                   "-l", str(last_page)]
        processes.append(subprocess.Popen(command))
    # Wait for all processes to complete.
    return [process.wait() for process in processes]


def _ask_scale_percent():
    """Ask for a scale percentage; return None when no scaling is wanted."""
    print('The max recommended resolution is 1000x2000 (for single pages) and 2000x4000 for double pages.')
    answer = input('Do you want to reduce the images? [n, target size in percent (10-100)] ').strip()
    if answer.lower() in ('', 'n', 'no'):
        return None
    try:
        percent = int(answer)
    except ValueError:
        print(f"Error: '{answer}' is neither 'n' nor a whole number")
        sys.exit(1)
    if percent < 10 or percent > 100:
        print('Error: the percent must be between 10 and 100')
        sys.exit(1)
    return percent


def _scale_images(pattern, percent):
    """Resize every image matching *pattern* in place to *percent* percent."""
    for img_path in glob.glob(pattern):
        with Image.open(img_path) as img:
            width, height = img.size
            # Never let a dimension collapse to zero pixels.
            new_size = (max(1, int(width * percent / 100)),
                        max(1, int(height * percent / 100)))
            resized = img.resize(new_size)
        # The source file is closed before it gets overwritten.
        resized.save(img_path)


def main(input_pdf_file):
    """Drive the PDF -> PNG -> (Scan Tailor Advanced) -> PDF -> OCR workflow.

    Steps, in order:
      1. Create a working directory named after the input file (without
         extension) below the current directory, copy the PDF into it and
         change into it.
      2. Split the PDF into ``out-*.png`` pages with ``pdftoppm`` (four
         parallel jobs when the PDF has more than 20 pages, otherwise one).
      3. Optionally scale the PNG files down.
      4. Wait for the user to confirm that Scan Tailor Advanced is done, then
         merge the ``.tif`` files of the ``out`` folder into
         ``<name>-c.pdf``.
      5. Pipe that PDF through the ``ocrmypdf-165-ita`` docker image and
         store the result as ``<name>-ocr.pdf``.

    Args:
        input_pdf_file: Path of the PDF file to process.

    Raises:
        SystemExit: On a missing input file, a failed pdftoppm job, an
            invalid scale answer, or missing Scan Tailor output.
        ValueError: If the OCR container exits with a non-zero status.
    """
    import shutil  # local import: only needed for the copy step below

    if not os.path.isfile(input_pdf_file):
        print("Error: File does not exist.")
        sys.exit(1)

    pdf_file = os.path.basename(input_pdf_file)
    # Get the filename without extension.
    filename = os.path.splitext(pdf_file)[0]

    # Create a working directory based on the filename.
    work_dir = os.path.join(os.getcwd(), filename)
    print(f'Create a working directory: {work_dir}')
    os.makedirs(work_dir, exist_ok=True)

    # Copy the PDF to the working directory (portable replacement for `cp`);
    # skip the copy when the input already is the file inside work_dir.
    print('Copy the PDF to the working directory')
    pdf_path = os.path.join(work_dir, pdf_file)
    already_in_place = (
        os.path.exists(pdf_path)
        and os.path.samefile(input_pdf_file, pdf_path)
    )
    if not already_in_place:
        shutil.copy(input_pdf_file, pdf_path)

    # Change to the working directory: all later paths are relative to it.
    os.chdir(work_dir)

    # Check pdf size to decide how many pdftoppm jobs to run in parallel.
    print('Check pdf size...')
    threads = 1
    page_count = _count_pdf_pages(pdf_file)
    if page_count is None:
        page_count = _UNKNOWN_PAGE_COUNT
    else:
        if page_count > 20:  # do multi thread if it is not too small
            threads = 4
        print(f'NoFPages: {page_count} threads: {threads}')

    exit_codes = _split_pdf_to_png(pdf_file, page_count, threads)
    if any(code != 0 for code in exit_codes):
        # Continuing would hand an incomplete page set to Scan Tailor.
        print(f'Error: pdftoppm failed (exit codes: {exit_codes})')
        sys.exit(1)

    percent = _ask_scale_percent()
    if percent is not None:
        print(f'Scale the images to {percent}% of their original size')
        _scale_images('out-*.png', percent)

    _ = input("If ScanTailor Advanced is finisched the process press enter...")

    print("convert_tif_to_pdf")
    pdfc = f"{filename}-c.pdf"
    pdfocr = f"{filename}-ocr.pdf"
    if not os.path.isdir('out'):
        print("Error: the 'out' folder with the Scan Tailor output does not exist.")
        sys.exit(1)
    convert_tif_to_pdf('out', pdfc)
    if not os.path.isfile(pdfc):
        # convert_tif_to_pdf writes nothing when it finds no .tif files.
        print(f'Error: {pdfc} was not created.')
        sys.exit(1)

    print('ocrmypdf...')
    command = [
        "docker", "run", "-i", "ocrmypdf-165-ita", "-l", "ita", "-", "-"
    ]
    # The container reads the PDF from stdin and writes the result to stdout.
    with open(pdfc, "rb") as infile, open(pdfocr, "wb") as outfile:
        process = subprocess.run(command, stdin=infile, stdout=outfile)

    if process.returncode == 0:
        print(f"OCR processed successfully. Output saved to {pdfocr}")
    else:
        raise ValueError(f"ocrmypdf failed with exit code {process.returncode}")

    print("Processing completed. Output files are in:", work_dir)
# Script entry point: exactly one command-line argument (the PDF path) is
# expected; anything else prints the usage line and exits with status 1.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(sys.argv) == 2:
        main(cli_args[0])
    else:
        print("Usage: python script.py <pdf_file>")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment