Skip to content

Instantly share code, notes, and snippets.

@revilowaldow
Forked from XBigTK13X/pdf-extract-images.py
Last active December 7, 2024 14:19
Show Gist options
  • Save revilowaldow/2d7a551685c5198fea42b285a2e30223 to your computer and use it in GitHub Desktop.
Save revilowaldow/2d7a551685c5198fea42b285a2e30223 to your computer and use it in GitHub Desktop.
A 5etools focused script to automate pdf image extraction and masking. Also generates a cover thumbnail
#! /usr/bin/python3
# IMPORTANT! This script requires a Unix based system. Such as Mac, Ubuntu, or Windows Subsystem for Linux (WSL) running under Windows. You cannot use it in Windows alone.
# The reason for this is that the poppler-utils dependency does not have all the required features on Windows.
# This script requires poppler-utils (containing: pdfimage, pdftoppm), imagemagick (containing: convert), and alive-progress (containing: alive_bar)
# If your pdf file contains jp2 images, you can either install libopenjp2-tools (containing: opj_decompress), or uncomment the line that indicates a large performance hit
# Example usage: python3 pdf-extract-images.py "Players Handbook.pdf" "PHB"
# Raw images will be written to <OUTPUT_DIR>/15-organized
# Attempts at merging masks and images will be output in webp to <OUTPUT_DIR>/30-masked
# Images without corresponding masks will be written in webp to <OUTPUT_DIR>/40-standalone
# A cover image and an associated 5etools thumbnail will be written to <OUTPUT_DIR>
# Re-rewritten from https://gist.github.com/XBigTK13X/4796a0ca7f16e83438914384a57dc46b
# Originally rewritten from https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359
import os
import sys
import subprocess
import shutil
from alive_progress import alive_bar
from multiprocessing import Pool
import time
# --- Tunables (some are overridden below by command-line arguments) ---
QUIET = False # Reduced console logging and no progress bars, note this does not remove the need for the progress bar dependency
QUALITY = 85 # TheGiddyLimit's preferred webp quality level for 5etools
SKIP_EXTRACT = False # Skip the extraction stage and work with files already in the output dir
DEL_EXTRACT = False # After finishing extraction delete all the raw images, you'll have to extract again if you need even one raw image
# List of composition methods to try for image and mask pairs
# (imagemagick -compose operators; replaced by argv[3] when it is not "all")
COMPOSITIONS = [
    "CopyOpacity",
]
# Utility function for console logging
def log(message):
    """Print *message* to the console unless QUIET mode is enabled."""
    # Reading a module-level name needs no `global` declaration; the
    # original `global QUIET` statement was redundant and has been dropped.
    if not QUIET:
        print(message)
# Read in Arguments: pdf-extract-images.py [filename] [output dir] [comma separated list of composition modes or "all"] [# of sample images] [quiet?]
if len(sys.argv) >= 6:
    # Any fifth positional argument (regardless of value) enables quiet mode
    QUIET = True
if len(sys.argv) < 2:
    print("An input PDF file is required")
    sys.exit(1)
if len(sys.argv) < 3:
    print("An output directory is required")
    sys.exit(1)
# NOTE(review): passing "all" currently behaves identically to omitting the
# argument — only CopyOpacity is attempted. Confirm this matches the intent;
# the usage string suggests "all" might have once meant "every compose mode".
if len(sys.argv) < 4 or sys.argv[3] == "all":
    log("Will only attempt CopyOpacity composition")
else:
    log(f"Will attempt [{sys.argv[3]}] compositions")
    COMPOSITIONS = sys.argv[3].split(",")
# Which image number gets an extra copy placed into the 25-samples folder
SAMPLE_IMAGE_NUM = 1
if len(sys.argv) >= 5:
    log(f"Will copy samples using image [{sys.argv[4]}]")
    SAMPLE_IMAGE_NUM = int(sys.argv[4])
INPUT_PDF_FILE = sys.argv[1]
OUTPUT_DIR = sys.argv[2]
# Delete everything in the output folder and overwrite... (not recommended)
# if os.path.exists(OUTPUT_DIR):
#     shutil.rmtree(OUTPUT_DIR) # DANGER! removes everything in the output folder
# Utility function to run system commands with error feedback
def execute(command):
    """Run *command* through the shell, exiting the whole script on failure.

    Returns a dict with:
      result: the process return code (always 0 when this returns)
      stdout/stderr: decoded output split into lists of lines
    """
    # shell=True is required: callers build `&&` chains and quoted paths.
    # Inputs are local filenames chosen by the user, not untrusted data.
    # subprocess.run replaces the original hand-rolled Popen/communicate pair.
    completed = subprocess.run(command, shell=True, capture_output=True)
    if completed.returncode != 0:
        print("An error occurred while running {}".format(command))
        print("stdout: {}".format(completed.stdout))
        print("stderr: {}".format(completed.stderr))
        sys.exit(1)
    return {
        "result": completed.returncode,
        "stdout": completed.stdout.decode("utf-8").split("\n"),
        "stderr": completed.stderr.decode("utf-8").split("\n"),
    }
# Utility function to watch an output directory and generate a progress bar from file count
def prog(max):
    """Drive a manual progress bar by polling EXTRACT_DIR's file count.

    max: the line count from `pdfimages -list`; the first three lines are
    table meta rows, so the expected file count is max - 3.
    Runs in a worker process until extraction has produced every file.
    """
    # Rebind instead of reassigning the parameter, which shadows the
    # builtin `max` (the parameter name is kept for caller compatibility).
    target = max - 3  # Ignore meta items
    with alive_bar(target, manual=True, disable=QUIET) as bar:
        while len(os.listdir(EXTRACT_DIR)) < target:
            time.sleep(.005)  # cheap poll; extraction is far slower than this
            bar(len(os.listdir(EXTRACT_DIR)) / target)
# Returned table of metadata from pdfimages -list
# See https://linuxcommandlibrary.com/man/pdfimages
metadata_parts = [
    "page",
    "num",
    "type",
    "width",
    "height",
    "color",
    "comp",
    "bpc",
    "enc",
    "interop",
    "object",
    "id",
    "x_ppi",
    "y_ppi",
    "size",
    "ratio",
]
# Image data class
class PdfImageMetadata:
    """One row of `pdfimages -list` output, columns mapped to attributes.

    All attributes are strings except `num` and `object`, which are used
    later for dictionary lookups and are converted to int.
    """

    def __init__(self, text):
        # Pair each whitespace-separated column with its name; zip stops
        # cleanly if the row has fewer columns than expected (same behavior
        # as the original break-when-exhausted loop; no `global` needed to
        # read metadata_parts).
        for meta, value in zip(metadata_parts, text.split()):
            setattr(self, meta, value)
        self.num = int(self.num)
        self.object = int(self.object)
# Map pdf object id -> {"image": meta, "smask": meta} for later pairing
pdf_objects = {}
log("Parse PDF image metadata")
# Ask poppler for the per-image metadata table of the whole pdf
list_results = execute(f'pdfimages -list "{INPUT_PDF_FILE}"')
count = 0
for count, line in enumerate(list_results["stdout"], start=1):
    if count < 3:
        continue  # skip the two table-header rows
    if len(line) <= 2:
        continue  # skip blank / terminator rows
    image = PdfImageMetadata(line)
    if "image" not in image.type and "smask" not in image.type:
        continue  # only images and their soft masks are interesting
    pdf_objects.setdefault(image.object, {})[image.type] = image
# Create a folder to extract to
EXTRACT_DIR = os.path.join(OUTPUT_DIR, "10-extract")
if not os.path.exists(EXTRACT_DIR):
    os.makedirs(EXTRACT_DIR, exist_ok=True) # It's fine if there's already one
if not SKIP_EXTRACT: # Skip this step if needed
    # The pool exists only so the progress-bar poller and the blocking
    # extraction command can run concurrently in separate processes.
    pool = Pool() # Create the world's most unnecessary multithreaded pool
    log(f"Extract image data from PDF to [{EXTRACT_DIR}]")
    log(f"This can take a long time. We expect [{count-3}] files in [{EXTRACT_DIR}] before continuing...")
    command = f'pdfimages "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"' # Command to extract raw images
    # command = f'pdfimages -png -tiff -j "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"' # Command that tolerates jp2; a significant performance hit and fidelity loss
    pool.apply_async(prog, [count]) # Start the progress bar
    pool.apply_async(execute, [command]).wait() # Start the extraction and block until done
    time.sleep(.1) # Leave time for the bar to notice the final image
    pool.terminate() # Kill the parallel pool (also stops the bar poller)
else:
    log('Skipped image extraction')
# JP2 conversion if dependency installed
# log(f"Converting JP2 files, if present.")
# for path, dirc, files in os.walk(EXTRACT_DIR):
#     for name in files:
#         if name.endswith(".jp2"):
#             # If this line is giving you an error see intro on JP2 support
#             command = f'opj_decompress -i {EXTRACT_DIR}/{name} -o {EXTRACT_DIR}/{name.rsplit(".", 1)[0]}.ppm'
#             execute(command)
#             os.remove(f"{EXTRACT_DIR}/{name}")
# log(f"Converted JP2 files, if present.")
# Make a big list of the images we've got to work with
log("Gather extracted image paths")
extracted_images = {}
for root, dirs, files in os.walk(EXTRACT_DIR):
    for ff in files:
        # pdfimages names files "image-NNN.ext"; NNN is the image number
        image_num = int(ff.split("-")[1].split(".")[0])
        extracted_images[image_num] = os.path.join(root, ff)
# Create the various folders, and variables to refer to them
STANDALONE_DIR = os.path.join(OUTPUT_DIR, "40-standalone")
os.makedirs(STANDALONE_DIR, exist_ok=True)
MASKED_DIR = os.path.join(OUTPUT_DIR, "30-masked")
os.makedirs(MASKED_DIR, exist_ok=True)
SAMPLE_DIR = os.path.join(OUTPUT_DIR, "25-samples")
os.makedirs(SAMPLE_DIR, exist_ok=True)
ORGANIZE_DIR = os.path.join(OUTPUT_DIR, "15-organized")
os.makedirs(ORGANIZE_DIR, exist_ok=True)
RAW_MASK_DIR = os.path.join(ORGANIZE_DIR, "mask")
os.makedirs(RAW_MASK_DIR, exist_ok=True)
RAW_IMAGE_DIR = os.path.join(ORGANIZE_DIR, "image")
# BUGFIX: exist_ok was missing here (alone of all seven makedirs calls),
# so any re-run against an existing output dir crashed with FileExistsError
os.makedirs(RAW_IMAGE_DIR, exist_ok=True)
# Function to compose images and masks together
def compose(image, mask, destination, mode, imageWidth, imageHeight):
    """Merge an image with its soft mask into <MASKED_DIR>/<mode>/NNNNN.webp.

    destination: image number, used to name the output file.
    mode: an imagemagick -compose operator (e.g. CopyOpacity).
    imageWidth/imageHeight: the image's dimensions; the mask is force-resized
    to match because some pdfs store masks at a lower resolution.
    Also copies the result into SAMPLE_DIR when this is the sample image.
    """
    merged_dir = os.path.join(MASKED_DIR, mode)
    os.makedirs(merged_dir, exist_ok=True)
    merged_file = f"{destination:05d}.webp"  # in webp
    merged_path = os.path.join(merged_dir, merged_file)
    # \\( and \\) pass literal parens to the shell to group the mask
    # operations (BUGFIX: the single-backslash form is an invalid escape
    # sequence in a non-raw string — same runtime bytes, no SyntaxWarning)
    command = (
        f'convert "{image}" \\( "{mask}" -resize {imageWidth}x{imageHeight}! \\) '
        f'-compose {mode} -composite -quality {QUALITY} "{merged_path}"'
    )
    execute(command)
    if destination == SAMPLE_IMAGE_NUM:
        sample_path = os.path.join(SAMPLE_DIR, f"{mode}-{destination:05d}.webp")
        shutil.copy(merged_path, sample_path)  # copy it over if this was a sample image
# Function to convert images without masks
def convert(image, destination):
    """Re-encode a maskless image as <STANDALONE_DIR>/NNNNN.webp at QUALITY."""
    target = os.path.join(STANDALONE_DIR, f"{destination:05d}.webp")
    # Plain imagemagick re-encode with the configured webp quality level
    execute(f'convert "{image}" -quality {QUALITY} "{target}"')
# Work through the list of output files
log("Merging masked images, copying standalone images")
merged_count = 0
standalone_count = 0
images_counted = False  # counts accumulate only on the first composition pass
mode_count = 0
for mode in COMPOSITIONS:
    mode_count += 1
    log(f'Composing images using mode ({mode_count}/{len(COMPOSITIONS)}) [{mode}]')
    with alive_bar(len(pdf_objects), disable=QUIET) as bar:  # With a nice progress bar
        for k, v in pdf_objects.items():
            if "smask" in v and "image" in v:  # If a mask-image pair
                image = extracted_images[v["image"].num]
                mask = extracted_images[v["smask"].num]
                shutil.copy(image, os.path.join(RAW_IMAGE_DIR, f"{v['image'].num}.png"))  # Copy the image
                shutil.copy(mask, os.path.join(RAW_MASK_DIR, f"{v['smask'].num}.png"))  # Copy the mask
                compose(
                    image,
                    mask,
                    v["image"].num,  # need to name it
                    mode,
                    v["image"].width,  # need to resize mask
                    v["image"].height,
                )
                if not images_counted:
                    merged_count += 1
                bar()  # Update progress bar
            elif "image" in v:  # If no mask
                source = extracted_images[v["image"].num]
                shutil.copy(source, os.path.join(RAW_IMAGE_DIR, f"{v['image'].num}.png"))  # copy it
                convert(source, v["image"].num)  # Convert it to webp and save in dir
                # BUGFIX: guard like merged_count above — previously this
                # incremented on every mode pass, over-counting standalone
                # images whenever multiple compositions were requested
                if not images_counted:
                    standalone_count += 1
                bar()  # Update progress bar
    images_counted = True
log(f"Raw images sorted in [{ORGANIZE_DIR}]")
log(f"{merged_count} images masked in [{MASKED_DIR}]")
log(f"{standalone_count} images with no mask converted to webp in [{STANDALONE_DIR}]")
if DEL_EXTRACT:  # If we want raw images deleted (not recommended if issues)
    shutil.rmtree(EXTRACT_DIR)  # Remove the raw extracted files
    log(f"Cleanup action removed raw files in [{EXTRACT_DIR}]")
log(f"Creating cover image in [{OUTPUT_DIR}]")
# Extract the front page of the pdf as an image (writes OUTPUT_DIR/cover.ppm)
execute(f'pdftoppm -singlefile "{INPUT_PDF_FILE}" "{OUTPUT_DIR}/cover"')
# Convert it to webp at quality, then remove the intermediate ppm
# BUGFIX: output paths are now quoted so an OUTPUT_DIR containing spaces works
execute(f'convert "{OUTPUT_DIR}/cover.ppm" -quality {QUALITY} "{OUTPUT_DIR}/cover-full.webp" && rm "{OUTPUT_DIR}/cover.ppm"')
# Create a 300x300px padded transparent thumbnail of the front page at full quality
execute(f'convert "{OUTPUT_DIR}/cover-full.webp" -quality 100 -resize 300x300 -background transparent -gravity center -extent 300x300 "{OUTPUT_DIR}/cover.webp"')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment