raczben · March 27, 2025 16:18
diff --git a/smartpdf.py b/smartpdf.py
 #!/usr/bin/env python3

 import os
 import sys
 import subprocess
 from PIL import Image # Convert tif to pdf
 import os
 import math
 import glob

 def convert_tif_to_pdf(input_folder, output_pdf):
    """Merge every ``.tif`` file in *input_folder* into one multi-page PDF.

    Files are matched case-insensitively on the ``.tif`` suffix and added to
    the PDF in sorted filename order. If the folder holds no such file, a
    message is printed and nothing is written.

    Args:
        input_folder: Directory to scan (not searched recursively).
        output_pdf: Path of the PDF file to create.
    """
    tif_files = sorted(
        f for f in os.listdir(input_folder) if f.lower().endswith('.tif')
    )

    if not tif_files:
        print("No .tif files found in the directory.")
        return

    images = []
    try:
        for file in tif_files:
            # Deliberately not converted to RGB: that makes the PDF large.
            images.append(Image.open(os.path.join(input_folder, file)))

        # The first image is written and the others are appended as pages.
        images[0].save(output_pdf, save_all=True, append_images=images[1:])
    finally:
        # Pillow may keep the underlying file open until the image is closed.
        for img in images:
            img.close()

 def main(input_pdf_file):
    """Run the scan clean-up pipeline for one PDF.

    All work happens in a directory named after the PDF, created under the
    current directory; the process changes into it and stays there.

    1. Copy the PDF in and rasterise its pages to ``out-*.png`` with
       ``pdftoppm``, split over four processes when the document has more
       than 20 pages (one process if the page count cannot be read).
    2. Optionally scale the PNG files down to a percentage typed by the user.
    3. Wait until the user confirms ScanTailor Advanced is done, then merge
       the ``.tif`` files found in ``out/`` into ``<name>-c.pdf``.
    4. Pipe that PDF through the ``ocrmypdf-165-ita`` Docker image and save
       the result as ``<name>-ocr.pdf``.

    Args:
        input_pdf_file: Path to the source PDF.

    Raises:
        SystemExit: If the input file is missing, the PDF has no pages, the
            scale answer is invalid, or no ``.tif`` output is available.
        subprocess.CalledProcessError: If copying or rasterising fails.
        ValueError: If the OCR container exits with a non-zero status.
    """
    if not os.path.isfile(input_pdf_file):
        print("Error: File does not exist.")
        sys.exit(1)

    pdf_file = os.path.basename(input_pdf_file)
    filename = os.path.splitext(pdf_file)[0]

    # Create a working directory based on the filename
    work_dir = os.path.join(os.getcwd(), filename)
    print(f'Create a working directory: {work_dir}')
    os.makedirs(work_dir, exist_ok=True)

    # Copy the PDF to the working directory
    print('Copy the PDF to the working directory')
    subprocess.run(
        ["cp", input_pdf_file, os.path.join(work_dir, pdf_file)], check=True
    )

    # Every path used from here on is relative to the working directory.
    os.chdir(work_dir)

    # Count the pages so the rasterising can be split across processes.
    print('Check pdf size...')
    page_count = None  # None: unknown, convert everything in one process
    threads = 1
    try:
        import PyPDF2
        with open(pdf_file, 'rb') as pdf_handle:
            page_count = len(PyPDF2.PdfReader(pdf_handle).pages)
        if page_count > 20:  # do multi thread if it is not too small
            threads = 4
        print(f'NoFPages: {page_count}   threads: {threads}')
    except ImportError:
        print('PyPDF2 is not installed (use pip install PyPDF2) --> one thread')
    except Exception as ex:
        # Best effort: any reader failure only disables the parallel split.
        page_count = None
        threads = 1
        print(f'{ex}   Error calling PdfReader --> one thread')

    if page_count == 0:
        print('Error: the PDF contains no pages.')
        sys.exit(1)

    commands = []
    if page_count is None:
        # Without a page count, let pdftoppm convert the whole document.
        commands.append(["pdftoppm", pdf_file, "out", "-png"])
    else:
        jobs_per_thread = math.ceil(page_count / threads)
        for i in range(threads):
            jobs_from = i * jobs_per_thread + 1
            jobs_to = min((i + 1) * jobs_per_thread, page_count)
            if jobs_from > jobs_to:
                break  # no pages left for this worker
            commands.append([
                "pdftoppm", pdf_file, "out", "-png",
                "-f", str(jobs_from),
                "-l", str(jobs_to),
            ])

    processes = [(cmd, subprocess.Popen(cmd)) for cmd in commands]

    # Wait for every child before reporting, so none is left running.
    failed = None
    for cmd, proc in processes:
        if proc.wait() != 0 and failed is None:
            failed = (proc.returncode, cmd)
    if failed is not None:
        raise subprocess.CalledProcessError(*failed)

    print('The max recommended resolution is 1000x2000 (for single pages) and 2000x4000 for double pages.')
    ans = input('Do you want to reduce the images? [n, percent-of-the-redution] ')
    choice = ans.strip().lower()
    if choice not in ('', 'n'):
        try:
            percent = int(choice)
        except ValueError:
            print(f'Error: expected "n" or a whole number, got {ans!r}')
            sys.exit(1)
        if percent < 10 or percent > 100:
            print('Error: the percent must be between 10 and 100')
            sys.exit(1)
        print(f'Scale the images to {percent}% of their original size')
        for img_path in glob.glob('out-*.png'):
            with Image.open(img_path) as img:
                width, height = img.size
                # Never let a tiny image collapse to a zero-sized dimension.
                new_size = (
                    max(1, int(width * percent / 100)),
                    max(1, int(height * percent / 100)),
                )
                resized = img.resize(new_size)
            # The source file is closed before it is overwritten in place.
            resized.save(img_path)

    _ = input("If ScanTailor Advanced is finisched the process press enter...")

    print("convert_tif_to_pdf")
    pdfc = f"{filename}-c.pdf"
    pdfocr = f"{filename}-ocr.pdf"
    if not os.path.isdir('out'):
        print("Error: directory 'out' with the processed .tif files not found.")
        sys.exit(1)
    # Drop a PDF left by an earlier run so it cannot be mistaken for new output.
    if os.path.exists(pdfc):
        os.remove(pdfc)
    convert_tif_to_pdf('out', pdfc)
    if not os.path.isfile(pdfc):
        print(f"Error: {pdfc} was not created; nothing to OCR.")
        sys.exit(1)

    print('ocrmypdf...')
    # "-" "-" makes the container read the PDF from stdin and write to stdout.
    command = [
        "docker", "run", "-i", "ocrmypdf-165-ita", "-l", "ita", "-", "-"
    ]
    with open(pdfc, "rb") as infile, open(pdfocr, "wb") as outfile:
        process = subprocess.run(command, stdin=infile, stdout=outfile)

    if process.returncode != 0:
        raise ValueError(f"ocrmypdf exited with status {process.returncode}")
    print(f"OCR processed successfully. Output saved to {pdfocr}")
    print("Processing completed. Output files are in:", work_dir)

 if __name__ == "__main__":
    # Script entry point: exactly one argument, the PDF to process.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        main(cli_args[0])
    else:
        print("Usage: python script.py <pdf_file>")
        sys.exit(1)
	#!/usr/bin/env python3

	import os
	import sys
	import subprocess
	from PIL import Image # Convert tif to pdf
	import os
	import math
	import glob

	def convert_tif_to_pdf(input_folder, output_pdf):
	    """Merge every ``.tif`` file in *input_folder* into one multi-page PDF.

	    Files are matched case-insensitively on the ``.tif`` suffix and added to
	    the PDF in sorted filename order. If the folder holds no such file, a
	    message is printed and nothing is written.

	    Args:
	        input_folder: Directory to scan (not searched recursively).
	        output_pdf: Path of the PDF file to create.
	    """
	    tif_files = sorted(
	        f for f in os.listdir(input_folder) if f.lower().endswith('.tif')
	    )

	    if not tif_files:
	        print("No .tif files found in the directory.")
	        return

	    images = []
	    try:
	        for file in tif_files:
	            # Deliberately not converted to RGB: that makes the PDF large.
	            images.append(Image.open(os.path.join(input_folder, file)))

	        # The first image is written and the others are appended as pages.
	        images[0].save(output_pdf, save_all=True, append_images=images[1:])
	    finally:
	        # Pillow may keep the underlying file open until the image is closed.
	        for img in images:
	            img.close()

	def main(input_pdf_file):
	    """Run the scan clean-up pipeline for one PDF.

	    All work happens in a directory named after the PDF, created under the
	    current directory; the process changes into it and stays there.

	    1. Copy the PDF in and rasterise its pages to ``out-*.png`` with
	       ``pdftoppm``, split over four processes when the document has more
	       than 20 pages (one process if the page count cannot be read).
	    2. Optionally scale the PNG files down to a percentage typed by the user.
	    3. Wait until the user confirms ScanTailor Advanced is done, then merge
	       the ``.tif`` files found in ``out/`` into ``<name>-c.pdf``.
	    4. Pipe that PDF through the ``ocrmypdf-165-ita`` Docker image and save
	       the result as ``<name>-ocr.pdf``.

	    Args:
	        input_pdf_file: Path to the source PDF.

	    Raises:
	        SystemExit: If the input file is missing, the PDF has no pages, the
	            scale answer is invalid, or no ``.tif`` output is available.
	        subprocess.CalledProcessError: If copying or rasterising fails.
	        ValueError: If the OCR container exits with a non-zero status.
	    """
	    if not os.path.isfile(input_pdf_file):
	        print("Error: File does not exist.")
	        sys.exit(1)

	    pdf_file = os.path.basename(input_pdf_file)
	    filename = os.path.splitext(pdf_file)[0]

	    # Create a working directory based on the filename
	    work_dir = os.path.join(os.getcwd(), filename)
	    print(f'Create a working directory: {work_dir}')
	    os.makedirs(work_dir, exist_ok=True)

	    # Copy the PDF to the working directory
	    print('Copy the PDF to the working directory')
	    subprocess.run(
	        ["cp", input_pdf_file, os.path.join(work_dir, pdf_file)], check=True
	    )

	    # Every path used from here on is relative to the working directory.
	    os.chdir(work_dir)

	    # Count the pages so the rasterising can be split across processes.
	    print('Check pdf size...')
	    page_count = None  # None: unknown, convert everything in one process
	    threads = 1
	    try:
	        import PyPDF2
	        with open(pdf_file, 'rb') as pdf_handle:
	            page_count = len(PyPDF2.PdfReader(pdf_handle).pages)
	        if page_count > 20:  # do multi thread if it is not too small
	            threads = 4
	        print(f'NoFPages: {page_count} threads: {threads}')
	    except ImportError:
	        print('PyPDF2 is not installed (use pip install PyPDF2) --> one thread')
	    except Exception as ex:
	        # Best effort: any reader failure only disables the parallel split.
	        page_count = None
	        threads = 1
	        print(f'{ex} Error calling PdfReader --> one thread')

	    if page_count == 0:
	        print('Error: the PDF contains no pages.')
	        sys.exit(1)

	    commands = []
	    if page_count is None:
	        # Without a page count, let pdftoppm convert the whole document.
	        commands.append(["pdftoppm", pdf_file, "out", "-png"])
	    else:
	        jobs_per_thread = math.ceil(page_count / threads)
	        for i in range(threads):
	            jobs_from = i * jobs_per_thread + 1
	            jobs_to = min((i + 1) * jobs_per_thread, page_count)
	            if jobs_from > jobs_to:
	                break  # no pages left for this worker
	            commands.append([
	                "pdftoppm", pdf_file, "out", "-png",
	                "-f", str(jobs_from),
	                "-l", str(jobs_to),
	            ])

	    processes = [(cmd, subprocess.Popen(cmd)) for cmd in commands]

	    # Wait for every child before reporting, so none is left running.
	    failed = None
	    for cmd, proc in processes:
	        if proc.wait() != 0 and failed is None:
	            failed = (proc.returncode, cmd)
	    if failed is not None:
	        raise subprocess.CalledProcessError(*failed)

	    print('The max recommended resolution is 1000x2000 (for single pages) and 2000x4000 for double pages.')
	    ans = input('Do you want to reduce the images? [n, percent-of-the-redution] ')
	    choice = ans.strip().lower()
	    if choice not in ('', 'n'):
	        try:
	            percent = int(choice)
	        except ValueError:
	            print(f'Error: expected "n" or a whole number, got {ans!r}')
	            sys.exit(1)
	        if percent < 10 or percent > 100:
	            print('Error: the percent must be between 10 and 100')
	            sys.exit(1)
	        print(f'Scale the images to {percent}% of their original size')
	        for img_path in glob.glob('out-*.png'):
	            with Image.open(img_path) as img:
	                width, height = img.size
	                # Never let a tiny image collapse to a zero-sized dimension.
	                new_size = (
	                    max(1, int(width * percent / 100)),
	                    max(1, int(height * percent / 100)),
	                )
	                resized = img.resize(new_size)
	            # The source file is closed before it is overwritten in place.
	            resized.save(img_path)

	    _ = input("If ScanTailor Advanced is finisched the process press enter...")

	    print("convert_tif_to_pdf")
	    pdfc = f"{filename}-c.pdf"
	    pdfocr = f"{filename}-ocr.pdf"
	    if not os.path.isdir('out'):
	        print("Error: directory 'out' with the processed .tif files not found.")
	        sys.exit(1)
	    # Drop a PDF left by an earlier run so it cannot be mistaken for new output.
	    if os.path.exists(pdfc):
	        os.remove(pdfc)
	    convert_tif_to_pdf('out', pdfc)
	    if not os.path.isfile(pdfc):
	        print(f"Error: {pdfc} was not created; nothing to OCR.")
	        sys.exit(1)

	    print('ocrmypdf...')
	    # "-" "-" makes the container read the PDF from stdin and write to stdout.
	    command = [
	        "docker", "run", "-i", "ocrmypdf-165-ita", "-l", "ita", "-", "-"
	    ]
	    with open(pdfc, "rb") as infile, open(pdfocr, "wb") as outfile:
	        process = subprocess.run(command, stdin=infile, stdout=outfile)

	    if process.returncode != 0:
	        raise ValueError(f"ocrmypdf exited with status {process.returncode}")
	    print(f"OCR processed successfully. Output saved to {pdfocr}")
	    print("Processing completed. Output files are in:", work_dir)

	if __name__ == "__main__":
	    # Script entry point: exactly one argument, the PDF to process.
	    if len(sys.argv) != 2:
	        print("Usage: python script.py <pdf_file>")
	        sys.exit(1)

	    main(sys.argv[1])