Skip to content

Instantly share code, notes, and snippets.

@revilowaldow
Forked from XBigTK13X/pdf-extract-images.py
Last active December 7, 2024 14:19
Show Gist options
  • Save revilowaldow/2d7a551685c5198fea42b285a2e30223 to your computer and use it in GitHub Desktop.
Save revilowaldow/2d7a551685c5198fea42b285a2e30223 to your computer and use it in GitHub Desktop.
A 5etools focused script to automate pdf image extraction and masking. Also generates a cover thumbnail
#! /usr/bin/python3
# IMPORTANT! This script requires a Unix based system. Such as Mac, Ubuntu, or Windows Subsystem for Linux (WSL) running under Windows. You cannot use it in Windows alone.
# The reason for this is that the poppler-utils dependency does not have all the required features on Windows.
# This script requires poppler-utils (containing: pdfimage, pdftoppm), imagemagick (containing: convert), and alive-progress (containing: alive_bar)
# If your pdf file contains jp2 images, you can either install libopenjp2-tools (containing: opj_decompress), or uncomment the line that indicates a large performance hit
# Example usage: python3 pdf-extract-images.py "Players Handbook.pdf" "PHB"
# Raw images will be written to <OUTPUT_DIR>/15-organized
# Attempts at merging masks and images will be output in webp to <OUTPUT_DIR>/30-masked
# Images without corresponding masks will be written in webp to <OUTPUT_DIR>/40-standalone
# A cover image and an associated 5etools thumbnail will be written to <OUTPUT_DIR>
# Re-rewritten from https://gist.github.com/XBigTK13X/4796a0ca7f16e83438914384a57dc46b
# Originally rewritten from https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359
import os
import sys
import subprocess
import shutil
from alive_progress import alive_bar
from multiprocessing import Pool
import time
# --- Tunables (some are overridden below by command-line arguments) ---
QUIET = False # Reduced console logging and no progress bars, note this does not remove the need for the progress bar dependency
QUALITY = 85 # TheGiddyLimit's preferred webp quality level for 5etools
SKIP_EXTRACT = False # Skip the extraction stage and work with files already in the output dir
DEL_EXTRACT = False # After finishing extraction delete all the raw images, you'll have to extract again if you need even one raw image
# List of composition methods to try for image and mask pairs
# (imagemagick -compose operators; replaced by argv[3] when it is not "all")
COMPOSITIONS = [
    "CopyOpacity",
]
# Utility function for console logging
def log(message):
    """Print *message* to the console unless QUIET mode is enabled."""
    # Reading a module-level name needs no `global` declaration; the
    # original `global QUIET` statement was redundant and has been dropped.
    if not QUIET:
        print(message)
# Read in Arguments: pdf-extract-images.py [filename] [output dir] [comma separated list of composition modes or "all"] [# of sample images] [quiet?]
if len(sys.argv) >= 6:
    # Any fifth positional argument (regardless of value) enables quiet mode
    QUIET = True
if len(sys.argv) < 2:
    print("An input PDF file is required")
    sys.exit(1)
if len(sys.argv) < 3:
    print("An output directory is required")
    sys.exit(1)
# NOTE(review): passing "all" currently behaves identically to omitting the
# argument — only CopyOpacity is attempted. Confirm this matches the intent;
# the usage string suggests "all" might have once meant "every compose mode".
if len(sys.argv) < 4 or sys.argv[3] == "all":
    log("Will only attempt CopyOpacity composition")
else:
    log(f"Will attempt [{sys.argv[3]}] compositions")
    COMPOSITIONS = sys.argv[3].split(",")
# Which image number gets an extra copy placed into the 25-samples folder
SAMPLE_IMAGE_NUM = 1
if len(sys.argv) >= 5:
    log(f"Will copy samples using image [{sys.argv[4]}]")
    SAMPLE_IMAGE_NUM = int(sys.argv[4])
INPUT_PDF_FILE = sys.argv[1]
OUTPUT_DIR = sys.argv[2]
# Delete everything in the output folder and overwrite... (not recommended)
# if os.path.exists(OUTPUT_DIR):
#     shutil.rmtree(OUTPUT_DIR) # DANGER! removes everything in the output folder
# Utility function to run system commands with error feedback
def execute(command):
    """Run *command* through the shell, exiting the whole script on failure.

    Returns a dict with:
      result: the process return code (always 0 when this returns)
      stdout/stderr: decoded output split into lists of lines
    """
    # shell=True is required: callers build `&&` chains and quoted paths.
    # Inputs are local filenames chosen by the user, not untrusted data.
    # subprocess.run replaces the original hand-rolled Popen/communicate pair.
    completed = subprocess.run(command, shell=True, capture_output=True)
    if completed.returncode != 0:
        print("An error occurred while running {}".format(command))
        print("stdout: {}".format(completed.stdout))
        print("stderr: {}".format(completed.stderr))
        sys.exit(1)
    return {
        "result": completed.returncode,
        "stdout": completed.stdout.decode("utf-8").split("\n"),
        "stderr": completed.stderr.decode("utf-8").split("\n"),
    }
# Utility function to watch an output directory and generate a progress bar from file count
def prog(max):
    """Drive a manual progress bar by polling EXTRACT_DIR's file count.

    max: the line count from `pdfimages -list`; the first three lines are
    table meta rows, so the expected file count is max - 3.
    Runs in a worker process until extraction has produced every file.
    """
    # Rebind instead of reassigning the parameter, which shadows the
    # builtin `max` (the parameter name is kept for caller compatibility).
    target = max - 3  # Ignore meta items
    with alive_bar(target, manual=True, disable=QUIET) as bar:
        while len(os.listdir(EXTRACT_DIR)) < target:
            time.sleep(.005)  # cheap poll; extraction is far slower than this
            bar(len(os.listdir(EXTRACT_DIR)) / target)
# Returned table of metadata from pdfimages -list
# See https://linuxcommandlibrary.com/man/pdfimages
metadata_parts = [
    "page",
    "num",
    "type",
    "width",
    "height",
    "color",
    "comp",
    "bpc",
    "enc",
    "interop",
    "object",
    "id",
    "x_ppi",
    "y_ppi",
    "size",
    "ratio",
]
# Image data class
class PdfImageMetadata:
    """One row of `pdfimages -list` output, columns mapped to attributes.

    All attributes are strings except `num` and `object`, which are used
    later for dictionary lookups and are converted to int.
    """

    def __init__(self, text):
        # Pair each whitespace-separated column with its name; zip stops
        # cleanly if the row has fewer columns than expected (same behavior
        # as the original break-when-exhausted loop; no `global` needed to
        # read metadata_parts).
        for meta, value in zip(metadata_parts, text.split()):
            setattr(self, meta, value)
        self.num = int(self.num)
        self.object = int(self.object)
# Map pdf object id -> {"image": meta, "smask": meta} for later pairing
pdf_objects = {}
log("Parse PDF image metadata")
# Ask poppler for the per-image metadata table of the whole pdf
list_results = execute(f'pdfimages -list "{INPUT_PDF_FILE}"')
count = 0
for count, line in enumerate(list_results["stdout"], start=1):
    if count < 3:
        continue  # skip the two table-header rows
    if len(line) <= 2:
        continue  # skip blank / terminator rows
    image = PdfImageMetadata(line)
    if "image" not in image.type and "smask" not in image.type:
        continue  # only images and their soft masks are interesting
    pdf_objects.setdefault(image.object, {})[image.type] = image
# Create a folder to extract to
EXTRACT_DIR = os.path.join(OUTPUT_DIR, "10-extract")
if not os.path.exists(EXTRACT_DIR):
    os.makedirs(EXTRACT_DIR, exist_ok=True) # It's fine if there's already one
if not SKIP_EXTRACT: # Skip this step if needed
    # The pool exists only so the progress-bar poller and the blocking
    # extraction command can run concurrently in separate processes.
    pool = Pool() # Create the world's most unnecessary multithreaded pool
    log(f"Extract image data from PDF to [{EXTRACT_DIR}]")
    log(f"This can take a long time. We expect [{count-3}] files in [{EXTRACT_DIR}] before continuing...")
    command = f'pdfimages "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"' # Command to extract raw images
    # command = f'pdfimages -png -tiff -j "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"' # Command that tolerates jp2; a significant performance hit and fidelity loss
    pool.apply_async(prog, [count]) # Start the progress bar
    pool.apply_async(execute, [command]).wait() # Start the extraction and block until done
    time.sleep(.1) # Leave time for the bar to notice the final image
    pool.terminate() # Kill the parallel pool (also stops the bar poller)
else:
    log('Skipped image extraction')
# JP2 conversion if dependency installed
# log(f"Converting JP2 files, if present.")
# for path, dirc, files in os.walk(EXTRACT_DIR):
#     for name in files:
#         if name.endswith(".jp2"):
#             # If this line is giving you an error see intro on JP2 support
#             command = f'opj_decompress -i {EXTRACT_DIR}/{name} -o {EXTRACT_DIR}/{name.rsplit(".", 1)[0]}.ppm'
#             execute(command)
#             os.remove(f"{EXTRACT_DIR}/{name}")
# log(f"Converted JP2 files, if present.")
# Make a big list of the images we've got to work with
log("Gather extracted image paths")
extracted_images = {}
for root, dirs, files in os.walk(EXTRACT_DIR):
    for ff in files:
        # pdfimages names files "image-NNN.ext"; NNN is the image number
        image_num = int(ff.split("-")[1].split(".")[0])
        extracted_images[image_num] = os.path.join(root, ff)
# Create the various folders, and variables to refer to them
STANDALONE_DIR = os.path.join(OUTPUT_DIR, "40-standalone")
os.makedirs(STANDALONE_DIR, exist_ok=True)
MASKED_DIR = os.path.join(OUTPUT_DIR, "30-masked")
os.makedirs(MASKED_DIR, exist_ok=True)
SAMPLE_DIR = os.path.join(OUTPUT_DIR, "25-samples")
os.makedirs(SAMPLE_DIR, exist_ok=True)
ORGANIZE_DIR = os.path.join(OUTPUT_DIR, "15-organized")
os.makedirs(ORGANIZE_DIR, exist_ok=True)
RAW_MASK_DIR = os.path.join(ORGANIZE_DIR, "mask")
os.makedirs(RAW_MASK_DIR, exist_ok=True)
RAW_IMAGE_DIR = os.path.join(ORGANIZE_DIR, "image")
# BUGFIX: exist_ok was missing here (alone of all seven makedirs calls),
# so any re-run against an existing output dir crashed with FileExistsError
os.makedirs(RAW_IMAGE_DIR, exist_ok=True)
# Function to compose images and masks together
def compose(image, mask, destination, mode, imageWidth, imageHeight):
    """Merge an image with its soft mask into <MASKED_DIR>/<mode>/NNNNN.webp.

    destination: image number, used to name the output file.
    mode: an imagemagick -compose operator (e.g. CopyOpacity).
    imageWidth/imageHeight: the image's dimensions; the mask is force-resized
    to match because some pdfs store masks at a lower resolution.
    Also copies the result into SAMPLE_DIR when this is the sample image.
    """
    merged_dir = os.path.join(MASKED_DIR, mode)
    os.makedirs(merged_dir, exist_ok=True)
    merged_file = f"{destination:05d}.webp"  # in webp
    merged_path = os.path.join(merged_dir, merged_file)
    # \\( and \\) pass literal parens to the shell to group the mask
    # operations (BUGFIX: the single-backslash form is an invalid escape
    # sequence in a non-raw string — same runtime bytes, no SyntaxWarning)
    command = (
        f'convert "{image}" \\( "{mask}" -resize {imageWidth}x{imageHeight}! \\) '
        f'-compose {mode} -composite -quality {QUALITY} "{merged_path}"'
    )
    execute(command)
    if destination == SAMPLE_IMAGE_NUM:
        sample_path = os.path.join(SAMPLE_DIR, f"{mode}-{destination:05d}.webp")
        shutil.copy(merged_path, sample_path)  # copy it over if this was a sample image
# Function to convert images without masks
def convert(image, destination):
    """Re-encode a maskless image as <STANDALONE_DIR>/NNNNN.webp at QUALITY."""
    target = os.path.join(STANDALONE_DIR, f"{destination:05d}.webp")
    # Plain imagemagick re-encode with the configured webp quality level
    execute(f'convert "{image}" -quality {QUALITY} "{target}"')
# Work through the list of output files
log("Merging masked images, copying standalone images")
merged_count = 0
standalone_count = 0
images_counted = False  # counts accumulate only on the first composition pass
mode_count = 0
for mode in COMPOSITIONS:
    mode_count += 1
    log(f'Composing images using mode ({mode_count}/{len(COMPOSITIONS)}) [{mode}]')
    with alive_bar(len(pdf_objects), disable=QUIET) as bar:  # With a nice progress bar
        for k, v in pdf_objects.items():
            if "smask" in v and "image" in v:  # If a mask-image pair
                image = extracted_images[v["image"].num]
                mask = extracted_images[v["smask"].num]
                shutil.copy(image, os.path.join(RAW_IMAGE_DIR, f"{v['image'].num}.png"))  # Copy the image
                shutil.copy(mask, os.path.join(RAW_MASK_DIR, f"{v['smask'].num}.png"))  # Copy the mask
                compose(
                    image,
                    mask,
                    v["image"].num,  # need to name it
                    mode,
                    v["image"].width,  # need to resize mask
                    v["image"].height,
                )
                if not images_counted:
                    merged_count += 1
                bar()  # Update progress bar
            elif "image" in v:  # If no mask
                source = extracted_images[v["image"].num]
                shutil.copy(source, os.path.join(RAW_IMAGE_DIR, f"{v['image'].num}.png"))  # copy it
                convert(source, v["image"].num)  # Convert it to webp and save in dir
                # BUGFIX: guard like merged_count above — previously this
                # incremented on every mode pass, over-counting standalone
                # images whenever multiple compositions were requested
                if not images_counted:
                    standalone_count += 1
                bar()  # Update progress bar
    images_counted = True
log(f"Raw images sorted in [{ORGANIZE_DIR}]")
log(f"{merged_count} images masked in [{MASKED_DIR}]")
log(f"{standalone_count} images with no mask converted to webp in [{STANDALONE_DIR}]")
if DEL_EXTRACT:  # If we want raw images deleted (not recommended if issues)
    shutil.rmtree(EXTRACT_DIR)  # Remove the raw extracted files
    log(f"Cleanup action removed raw files in [{EXTRACT_DIR}]")
log(f"Creating cover image in [{OUTPUT_DIR}]")
# Extract the front page of the pdf as an image (writes OUTPUT_DIR/cover.ppm)
execute(f'pdftoppm -singlefile "{INPUT_PDF_FILE}" "{OUTPUT_DIR}/cover"')
# Convert it to webp at quality, then remove the intermediate ppm
# BUGFIX: output paths are now quoted so an OUTPUT_DIR containing spaces works
execute(f'convert "{OUTPUT_DIR}/cover.ppm" -quality {QUALITY} "{OUTPUT_DIR}/cover-full.webp" && rm "{OUTPUT_DIR}/cover.ppm"')
# Create a 300x300px padded transparent thumbnail of the front page at full quality
execute(f'convert "{OUTPUT_DIR}/cover-full.webp" -quality 100 -resize 300x300 -background transparent -gravity center -extent 300x300 "{OUTPUT_DIR}/cover.webp"')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment