A 5etools-focused script to automate PDF image extraction and masking. Also generates a cover thumbnail.
#! /usr/bin/python3
# IMPORTANT! This script requires a Unix-based system, such as macOS, Ubuntu, or Windows Subsystem for Linux (WSL) running under Windows. You cannot use it in Windows alone.
# The reason for this is that the poppler-utils dependency does not have all the required features on Windows.
# This script requires poppler-utils (containing: pdfimages, pdftoppm), imagemagick (containing: convert), and alive-progress (containing: alive_bar)
# If your pdf file contains jp2 images, you can either install libopenjp2-tools (containing: opj_decompress), or uncomment the line that indicates a large performance hit
# Example usage: python3 pdf-extract-images.py "Players Handbook.pdf" "PHB"
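# Illustrative fuller invocation (the mode list, sample image number, and final "quiet" value below are example placeholders; any fifth argument enables quiet mode):
# python3 pdf-extract-images.py "Players Handbook.pdf" "PHB" CopyOpacity,Multiply 12 quiet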
# Raw images will be written to <OUTPUT_DIR>/15-organized
# Attempts at merging masks and images will be written in webp to <OUTPUT_DIR>/30-masked
# Images without corresponding masks will be written in webp to <OUTPUT_DIR>/40-standalone
# A cover image and an associated 5etools thumbnail will be written to <OUTPUT_DIR>
# Re-rewritten from https://gist.github.com/XBigTK13X/4796a0ca7f16e83438914384a57dc46b
# Originally rewritten from https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359
import os
import sys
import subprocess
import shutil
from alive_progress import alive_bar
from multiprocessing import Pool
import time
QUIET = False  # Reduced console logging and no progress bars; note this does not remove the need for the progress bar dependency
QUALITY = 85  # TheGiddyLimit's preferred webp quality level for 5etools
SKIP_EXTRACT = False  # Skip the extraction stage and work with files already in the output dir
DEL_EXTRACT = False  # After finishing extraction, delete all the raw images; you'll have to extract again if you need even one raw image
# List of composition methods to try for image and mask pairs
COMPOSITIONS = [
    "CopyOpacity",
]
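# Illustrative alternatives only: ImageMagick accepts other -compose methods such as "Multiply", "DstIn", or "Over",
# e.g. COMPOSITIONS = ["CopyOpacity", "Multiply", "DstIn"] to compare several merge attempts per mask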
# Utility function for console logging
def log(message):
    global QUIET
    if not QUIET:
        print(message)
# Read in Arguments: pdf-extract-images.py [filename] [output dir] [comma separated list of composition modes or "all"] [# of sample images] [quiet?]
if len(sys.argv) >= 6:
    QUIET = True
if len(sys.argv) < 2:
    print("An input PDF file is required")
    sys.exit(1)
if len(sys.argv) < 3:
    print("An output directory is required")
    sys.exit(1)
if len(sys.argv) < 4 or sys.argv[3] == "all":
    log("Will only attempt CopyOpacity composition")
else:
    log(f"Will attempt [{sys.argv[3]}] compositions")
    COMPOSITIONS = sys.argv[3].split(",")
SAMPLE_IMAGE_NUM = 1
if len(sys.argv) >= 5:
    log(f"Will copy samples using image [{sys.argv[4]}]")
    SAMPLE_IMAGE_NUM = int(sys.argv[4])
INPUT_PDF_FILE = sys.argv[1]
OUTPUT_DIR = sys.argv[2]
# Delete everything in the output folder and overwrite... (not recommended)
# if os.path.exists(OUTPUT_DIR):
#     shutil.rmtree(OUTPUT_DIR)  # DANGER! removes everything in the output folder
# Utility function to run system commands with error feedback
def execute(command):
    process = subprocess.Popen(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    stdout, stderr = process.communicate()
    result = process.returncode
    if result != 0:
        print("An error occurred while running {}".format(command))
        print("stdout: {}".format(stdout))
        print("stderr: {}".format(stderr))
        sys.exit(1)
    return {
        "result": result,
        "stdout": stdout.decode("utf-8").split("\n"),
        "stderr": stderr.decode("utf-8").split("\n"),
    }
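# Illustrative use only (mirrors the call made further down): each entry in "stdout"/"stderr" is one decoded output line, e.g.
#   lines = execute(f'pdfimages -list "{INPUT_PDF_FILE}"')["stdout"]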
# Utility function to watch an output directory and generate a progress bar from file count
def prog(total):
    total = total - 3  # Ignore meta items
    with alive_bar(total, manual=True, disable=QUIET) as bar:
        while len(os.listdir(EXTRACT_DIR)) < total:
            time.sleep(.005)
            bar(len(os.listdir(EXTRACT_DIR)) / total)
# Returned table of metadata from pdfimages -list
# See https://linuxcommandlibrary.com/man/pdfimages
metadata_parts = [
    "page",
    "num",
    "type",
    "width",
    "height",
    "color",
    "comp",
    "bpc",
    "enc",
    "interp",
    "object",
    "id",
    "x_ppi",
    "y_ppi",
    "size",
    "ratio",
]
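# Illustrative only (made-up values): a typical `pdfimages -list` row that these fields map onto:
#   page num  type   width height color comp bpc  enc  interp object ID x-ppi y-ppi size ratio
#      1   0  image   1275  1650  rgb      3   8  jpeg  no        12  0   150   150  245K 6.5%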
# Image data class
class PdfImageMetadata:
    def __init__(self, text):
        global metadata_parts
        parts = text.split()
        for meta in metadata_parts:
            if len(parts) <= 0:
                break
            setattr(self, meta, parts.pop(0))
        self.num = int(self.num)
        self.object = int(self.object)
pdf_objects = {}
log("Parse PDF image metadata")
command = f'pdfimages -list "{INPUT_PDF_FILE}"'
list_results = execute(command)  # Generate the metadata for the pdf file
count = 0
for line in list_results["stdout"]:
    count += 1
    if count < 3:
        continue
    if len(line) <= 2:
        continue
    image = PdfImageMetadata(line)
    if "image" not in image.type and "smask" not in image.type:
        continue
    if image.object not in pdf_objects:
        pdf_objects[image.object] = {}
    pdf_objects[image.object][image.type] = image
# Create a folder to extract to
EXTRACT_DIR = os.path.join(OUTPUT_DIR, "10-extract")
if not os.path.exists(EXTRACT_DIR):
    os.makedirs(EXTRACT_DIR, exist_ok=True)  # It's fine if there's already one
if not SKIP_EXTRACT:  # Skip this step if needed
    pool = Pool()  # Create the world's most unnecessary multiprocessing pool
    log(f"Extract image data from PDF to [{EXTRACT_DIR}]")
    log(f"This can take a long time. We expect [{count-3}] files in [{EXTRACT_DIR}] before continuing...")
    command = f'pdfimages "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"'  # Command to extract raw images
    # command = f'pdfimages -png -tiff -j "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"'  # Command that tolerates jp2; a significant performance hit and fidelity loss
    pool.apply_async(prog, [count])  # Start the progress bar
    pool.apply_async(execute, [command]).wait()  # Start the extraction
    time.sleep(.1)  # Leave time for the bar to notice the final image
    pool.terminate()  # Kill the parallel pool
else:
    log('Skipped image extraction')
# JP2 conversion if dependency installed
# log(f"Converting JP2 files, if present.")
# for path, dirc, files in os.walk(EXTRACT_DIR):
#     for name in files:
#         if name.endswith(".jp2"):
#             # If this line is giving you an error see intro on JP2 support
#             command = f'opj_decompress -i "{EXTRACT_DIR}/{name}" -o "{EXTRACT_DIR}/{name.rsplit(".", 1)[0]}.ppm"'
#             execute(command)
#             os.remove(f"{EXTRACT_DIR}/{name}")
# log(f"Converted JP2 files, if present.")
# Make a big list of the images we've got to work with
log("Gather extracted image paths")
extracted_images = {}
for root, dirs, files in os.walk(EXTRACT_DIR):
    for ff in files:
        image_num = int(ff.split("-")[1].split(".")[0])
        extracted_images[image_num] = os.path.join(root, ff)
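# Illustrative only: pdfimages names its raw output like "image-012.ppm" (or .pbm/.jp2), so the parse above maps
# e.g. <EXTRACT_DIR>/image-012.ppm -> extracted_images[12]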
# Create the various folders, and variables to refer to them
STANDALONE_DIR = os.path.join(OUTPUT_DIR, "40-standalone")
os.makedirs(STANDALONE_DIR, exist_ok=True)
MASKED_DIR = os.path.join(OUTPUT_DIR, "30-masked")
os.makedirs(MASKED_DIR, exist_ok=True)
SAMPLE_DIR = os.path.join(OUTPUT_DIR, "25-samples")
os.makedirs(SAMPLE_DIR, exist_ok=True)
ORGANIZE_DIR = os.path.join(OUTPUT_DIR, "15-organized")
os.makedirs(ORGANIZE_DIR, exist_ok=True)
RAW_MASK_DIR = os.path.join(ORGANIZE_DIR, "mask")
os.makedirs(RAW_MASK_DIR, exist_ok=True)
RAW_IMAGE_DIR = os.path.join(ORGANIZE_DIR, "image")
os.makedirs(RAW_IMAGE_DIR, exist_ok=True)
# Function to compose images and masks together
def compose(image, mask, destination, mode, imageWidth, imageHeight):
    merged_dir = os.path.join(MASKED_DIR, mode)
    os.makedirs(merged_dir, exist_ok=True)
    merged_file = f"{destination:05d}.webp"  # in webp
    merged_path = os.path.join(merged_dir, merged_file)
    # ImageMagick command with quality level; the mask is resized to the image's dimensions (some pdfs store lower resolution mask data) and the composition mode is set
    command = f'convert "{image}" \\( "{mask}" -resize {imageWidth}x{imageHeight}! \\) -compose {mode} -composite -quality {QUALITY} "{merged_path}"'
    execute(command)
    if destination == SAMPLE_IMAGE_NUM:
        sample_path = os.path.join(SAMPLE_DIR, f"{mode}-{destination:05d}.webp")
        shutil.copy(merged_path, sample_path)  # copy it over if this was a sample image
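# Illustrative only (made-up paths and sizes): for image object 12 with mask 13, the command built above expands to roughly
#   convert "PHB/10-extract/image-012.ppm" \( "PHB/10-extract/image-013.ppm" -resize 800x600! \) -compose CopyOpacity -composite -quality 85 "PHB/30-masked/CopyOpacity/00012.webp"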
# Function to convert images without masks
def convert(image, destination):
    merged_file = f"{destination:05d}.webp"  # in webp
    merged_path = os.path.join(STANDALONE_DIR, merged_file)
    command = f'convert "{image}" -quality {QUALITY} "{merged_path}"'  # Imagemagick command with quality level
    execute(command)
# Work through the list of output files
log("Merging masked images, copying standalone images")
merged_count = 0
standalone_count = 0
images_counted = False
mode_count = 0
for mode in COMPOSITIONS:
    mode_count += 1
    log(f'Composing images using mode ({mode_count}/{len(COMPOSITIONS)}) [{mode}]')
    with alive_bar(len(pdf_objects), disable=QUIET) as bar:  # With a nice progress bar
        for k, v in pdf_objects.items():
            if "smask" in v and "image" in v:  # If a mask-image pair
                image = extracted_images[v["image"].num]
                mask = extracted_images[v["smask"].num]
                shutil.copy(image, os.path.join(RAW_IMAGE_DIR, f"{v['image'].num}.png"))  # Copy the image
                shutil.copy(mask, os.path.join(RAW_MASK_DIR, f"{v['smask'].num}.png"))  # Copy the mask
                compose(
                    image,
                    mask,
                    v["image"].num,  # need to name it
                    mode,
                    v["image"].width,  # need to resize mask
                    v["image"].height,
                )
                if not images_counted:
                    merged_count += 1
                bar()  # Update progress bar
            elif "image" in v:  # If no mask
                source = extracted_images[v["image"].num]
                shutil.copy(source, os.path.join(RAW_IMAGE_DIR, f"{v['image'].num}.png"))  # copy it
                convert(source, v["image"].num)  # Convert it to webp and save in dir
                standalone_count += 1
                bar()  # Update progress bar
    images_counted = True
log(f"Raw images sorted in [{ORGANIZE_DIR}]") | |
log(f"{merged_count} images masked in [{MASKED_DIR}]") | |
log(f"{standalone_count} images with no mask converted to webp in [{STANDALONE_DIR}]") | |
if DEL_EXTRACT: # If we want raw images deleted (not recommended if issues) | |
shutil.rmtree(EXTRACT_DIR) # Remove the raw extracted files | |
log(f"Cleanup action removed raw files in [{EXTRACT_DIR}]") | |
log(f"Creating cover image in [{OUTPUT_DIR}]") | |
# Extract the front page of the pdf as an image | |
execute(f'pdftoppm -singlefile "{INPUT_PDF_FILE}" "{OUTPUT_DIR}/cover"') | |
# Covert it to webp at quality | |
execute(f'convert "{OUTPUT_DIR}/cover.ppm" -quality {QUALITY} {OUTPUT_DIR}/cover-full.webp && rm "{OUTPUT_DIR}/cover.ppm"') | |
# Create a 300x300px padded transparent thumbnail of the front page at full quality | |
execute(f'convert "{OUTPUT_DIR}/cover-full.webp" -quality 100 -resize 300x300 -background transparent -gravity center -extent 300x300 {OUTPUT_DIR}/cover.webp') |