Last active
November 15, 2024 02:11
-
-
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Write an outline specified in JSON format into a PDF document using qpdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from __future__ import annotations | |
import argparse | |
import json | |
import sys | |
def main():
    """Parse the command line and write the updated qpdf JSON to stdout.

    The qpdf JSON dump (``--json``) and the outline description
    (``--outline``) are handed to ``build_output_json`` together with the
    page offset; the resulting document is serialized to standard output.
    """
    # Pass the text as ``description``; the first positional parameter of
    # ArgumentParser is ``prog`` and would replace the program name in usage.
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    # NOTE: --input is required on the command line but is not read below.
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    # type=int is needed: without it a value given on the command line is a
    # str and cannot be added to the integer page numbers of the outline.
    parser.add_argument(
        "--offset",
        type=int,
        help="Page offset to add to each target in the outline JSON",
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Load a qpdf JSON dump and attach a new outline tree to it.

    Args:
        json_fname: Path of the JSON document to update. It is read for its
            ``"pages"`` list and its two-element ``"qpdf"`` list.
        outline_fname: Path of a JSON file holding a list of outline items
            (each with ``"title"``, ``"dest"`` and optionally ``"children"``).
        offset: Value added to every ``"dest"`` page number. A numeric string
            is accepted and converted.

    Returns:
        The loaded JSON document with a new ``/Outlines`` object inserted, one
        object per outline item, and the catalog's ``/Outlines`` entry
        pointing at the new root.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(json_fname, encoding="utf-8") as f:
        j = json.load(f)
    with open(outline_fname, encoding="utf-8") as f:
        outline = json.load(f)
    # Tolerate an offset that reached us as text (e.g. straight from argv).
    offset = int(offset)
    pages = [page["object"] for page in j["pages"]]
    # New objects are numbered after the highest id already in the document.
    ids = ObjectIdAllocator(j["qpdf"][0]["maxobjectid"] + 1)
    catalog = get_catalog(j)
    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"
    bookmarks = [
        add_outline_item(j, pages, item, outlines_id, offset, ids)
        for item in outline
    ]
    # Chain the top-level siblings together in document order.
    for (bookmark_id, bookmark), (next_id, next_bookmark) in zip(
            bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{bookmark_id} 0 R"
    catalog["/Outlines"] = f"{outlines_id} 0 R"
    # An empty outline has no first/last item to reference; leaving the keys
    # out avoids indexing into an empty list.
    if bookmarks:
        outlines["/First"] = f"{bookmarks[0][0]} 0 R"
        outlines["/Last"] = f"{bookmarks[-1][0]} 0 R"
    return j
def get_catalog(j):
    """Return the value dictionary of the object whose ``/Type`` is ``/Catalog``.

    Only entries of ``j["qpdf"][1]`` whose key starts with ``"obj:"`` and that
    carry a ``"value"`` are considered. The returned dictionary is the one
    stored in ``j`` (not a copy), so callers can modify the catalog in place.

    Raises:
        Exception: If no such object exists.
    """
    for key, obj in j["qpdf"][1].items():
        if not key.startswith("obj:"):
            continue
        value = obj.get("value") if isinstance(obj, dict) else None
        # An object's value is not necessarily a dictionary (it can be a
        # number, string or list); only dictionaries can carry a /Type, and
        # testing membership on e.g. an int would raise TypeError.
        if isinstance(value, dict) and value.get("/Type") == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Insert one outline item, and recursively its children, into ``j``.

    Args:
        j: The JSON document being updated; new objects are added to it.
        pages: Page object references, indexed by page number.
        item: Outline item with ``"title"``, ``"dest"`` (page number) and an
            optional ``"children"`` list of further items.
        parent_id: Object id of the parent outline node.
        offset: Value added to ``item["dest"]`` before indexing ``pages``.
        ids: Allocator supplying ids for the new objects.

    Returns:
        ``(item_id, bookmark)``: the id allocated for this item and the
        dictionary stored as its object value.

    Raises:
        IndexError: If the offset page number is outside ``pages``.
    """
    item_id = ids.next_id()
    title = item["title"]
    page_index = item["dest"] + offset
    # Reject negative indices explicitly: Python would otherwise count from
    # the end of the list and silently link to the wrong page.
    if not 0 <= page_index < len(pages):
        raise IndexError(
            f"outline item {title!r} targets page {page_index}, "
            f"but the document has {len(pages)} pages")
    page_ref = pages[page_index]
    bookmark = insert_new_object(j, item_id)
    bookmark["/Dest"] = [page_ref, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    # NOTE(review): the "u:" prefix presumably marks a Unicode string in
    # qpdf's JSON encoding -- confirm against the qpdf JSON documentation.
    bookmark["/Title"] = f"u:{title}"
    # A missing or empty "children" list both mean "leaf": there is no
    # first/last child to link, so the keys are simply left out.
    child_items = item.get("children")
    if child_items:
        children = [
            add_outline_item(j, pages, child, item_id, offset, ids)
            for child in child_items
        ]
        # Chain the siblings together in document order.
        for (child_id, bm), (next_child_id, next_bm) in zip(
                children, children[1:]):
            bm["/Next"] = f"{next_child_id} 0 R"
            next_bm["/Prev"] = f"{child_id} 0 R"
        bookmark["/First"] = f"{children[0][0]} 0 R"
        bookmark["/Last"] = f"{children[-1][0]} 0 R"
    return (item_id, bookmark)
def insert_new_object(j, id):
    """Register an empty object under ``obj:<id> 0 R`` and return its value.

    The new entry is stored in ``j["qpdf"][1]`` as ``{"value": {}}``; the
    inner dictionary is returned so the caller can fill it in place.
    """
    payload = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": payload}
    return payload
class ObjectIdAllocator:
    """Hand out consecutive integer ids, starting from a given value."""

    def __init__(self, next_id: int):
        # The id that the next call to next_id() will return.
        self._next_id = next_id

    def next_id(self):
        """Return the current id and advance the counter by one."""
        allocated, self._next_id = self._next_id, self._next_id + 1
        return allocated
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean up PDF for ingestion
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON
vim outline.json
# Or, alternatively, create the outline as an indented text file and convert it to JSON.
# The jq filter must be a closed quoted string, and the result goes to outline.json
# (redirecting to outline.txt would overwrite the text outline being read).
vim outline.txt && ./text_to_json.py --increasing-page-numbers <outline.txt | jq --slurp '.' >outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """Convert the text outline on stdin into JSON entries on stdout.

    ``--increasing-page-numbers`` turns on the check that page numbers never
    decrease from one line to the next.
    """
    cli = argparse.ArgumentParser(description="Convert a text outline to JSON")
    cli.add_argument("--increasing-page-numbers", action="store_true")
    options = cli.parse_args()
    parsed_entries = read_entries(options.increasing_page_numbers)
    render_entries(parsed_entries)
def read_entries(validate_increasing_page_numbers):
    """Yield ``(depth, title, page)`` for every outline line on stdin.

    ``depth`` is the number of indentation levels open at that line (0 for an
    unindented line). Blank lines are skipped.

    Args:
        validate_increasing_page_numbers: When true, a line whose page number
            is lower than the previous line's is rejected.

    Raises:
        Exception: On a decreasing page number (if validation is enabled) or
            on inconsistent indentation. Messages carry the 1-based line
            number.
    """
    prev_page = -1
    indent_stack = initialize_indent_stack()
    # Count from 1 so reported line numbers match what editors display.
    for line_number, line in enumerate(sys.stdin, start=1):
        if not line.strip():
            # A blank line has no title/page pair to split and would make
            # parsing fail; it carries no outline information, so ignore it.
            continue
        space, title, page = parse_line(line)
        if validate_increasing_page_numbers and page < prev_page:
            raise Exception(f"decreasing page number at line {line_number}")
        update_indent_stack(space, indent_stack, line_number)
        yield (len(indent_stack), title, page)
        prev_page = page
# Matches a run of whitespace; used with match() to capture a line's indent.
WHITESPACE = re.compile(r"\s+")


def parse_line(line):
    """Split an outline line into ``(indent, title, page)``.

    The last whitespace-separated token is the page number (converted to
    ``int``); everything before it is the title, from which the leading
    whitespace is removed and returned separately as the indent string.
    """
    head, page_token = line.rstrip().rsplit(maxsplit=1)
    leading = WHITESPACE.match(head)
    indent = leading.group(0) if leading else ""
    # str is indexed by code point, so len(indent) is a valid slice offset.
    return (indent, head[len(indent):], int(page_token))
def initialize_indent_stack():
    """Return a new, empty indentation stack."""
    return list()
def update_indent_stack(space, indent_stack, line_number):
    """Update ``indent_stack`` in place for a line indented by ``space``.

    The stack holds the indentation string of every open level, shallowest
    first. It does not matter which whitespace characters are used to indent
    as long as they are consistent at each level.

    Args:
        space: Leading whitespace of the current line (may be empty).
        indent_stack: List of indentation strings, modified in place.
        line_number: Line number used in error messages.

    Raises:
        Exception: If a deeper indent does not extend the current one, or if
            a de-indent does not exactly match a previously opened level. The
            stack is left untouched in both cases.
    """
    if not space:
        # No indentation: back at the top level.
        indent_stack.clear()
        return
    if not indent_stack:
        # Non-empty leading space with an empty stack: first level.
        indent_stack.append(space)
        return
    last_indent = indent_stack[-1]
    if len(space) > len(last_indent):
        # Deeper than before; only valid if it starts with the previous
        # indentation characters.
        if not space.startswith(last_indent):
            raise Exception(f"invalid indentation at line {line_number}")
        indent_stack.append(space)
        return
    # Same or shallower level: it must be exactly one of the open levels.
    # Looking the level up before popping means an indent that matches none
    # of them is reported as an error instead of emptying the stack and
    # failing with IndexError.
    try:
        level = indent_stack.index(space)
    except ValueError:
        raise Exception(
            f"invalid de-indentation at line {line_number}") from None
    del indent_stack[level + 1:]
def render_entries(entries):
    """Nest the flat ``(depth, title, page)`` entries and render each root.

    Every entry is first paired with the depth of the entry that follows it,
    then the nested structure is built and each top-level entry is rendered.
    """
    with_lookahead = pair_with_next_depth(entries)
    for top_level in build_entries(with_lookahead, -1):
        render_entry(top_level)
def pair_with_next_depth(entries):
    """Yield ``(entry, next_depth)`` for each entry of ``entries``.

    ``next_depth`` is the first element of the entry that follows, or
    ``None`` for the final entry. Nothing is yielded for an empty input.
    """
    stream = iter(entries)
    try:
        current = next(stream)
    except StopIteration:
        return
    # Stay one item ahead so each entry can be emitted with its successor.
    for upcoming in stream:
        yield (current, upcoming[0])
        current = upcoming
    yield (current, None)
def sliding_window(iterator, n):
    """Yield successive overlapping ``n``-tuples taken from ``iterator``.

    For example, ``[1, 2, 3, 4]`` with ``n=2`` gives ``(1, 2)``, ``(2, 3)``,
    ``(3, 4)``. Nothing is yielded when fewer than ``n`` items are available.

    Args:
        iterator: Any iterable; it does not have to be an iterator already.
        n: Window length.
    """
    # Obtain one iterator up front so that priming the window and the loop
    # below consume the same stream. Without this, a re-iterable input such
    # as a list would start over in the loop and produce wrong windows.
    iterator = iter(iterator)
    window = collections.deque(itertools.islice(iterator, n - 1), maxlen=n)
    for x in iterator:
        window.append(x)
        yield tuple(window)
def build_entries(entries, break_depth):
    """Turn flat ``((depth, title, page), next_depth)`` pairs into nested entries.

    Each yielded entry is a dict with ``"title"`` and ``"dest"`` and, when the
    following input line is deeper, a ``"children"`` list built the same way.

    Args:
        entries: Iterable of ``((depth, title, page), next_depth)`` pairs in
            document order, where ``next_depth`` is the depth of the following
            entry or ``None`` for the last one.
        break_depth: Stop after an entry (including its whole subtree) that is
            followed by an entry of this depth or shallower. Pass ``-1`` to
            consume the whole input.
    """
    # All levels of the recursion must advance the same stream.
    entries = iter(entries)
    for item in entries:
        entry, next_depth = _build_entry(item, entries)
        yield entry
        if next_depth is not None and next_depth <= break_depth:
            return


def _build_entry(item, entries):
    """Build one entry plus its subtree; return ``(entry, following_depth)``.

    ``following_depth`` is the depth of the entry that comes after the whole
    subtree (``None`` at end of input). Reporting it back to the caller is
    what lets a drop of several levels at once close every open parent,
    rather than only the innermost one.
    """
    (depth, title, page), next_depth = item
    entry = {"title": title, "dest": page}
    if next_depth is not None and next_depth > depth:
        children = []
        for child_item in entries:
            child, next_depth = _build_entry(child_item, entries)
            children.append(child)
            # What follows is a sibling of this entry (or shallower), or the
            # input has ended: the list of children is complete.
            if next_depth is None or next_depth <= depth:
                break
        entry["children"] = children
    return entry, next_depth
def render_entry(entry):
    """Write ``entry`` to stdout as JSON, without any trailing separator."""
    serialized = json.dumps(entry)
    sys.stdout.write(serialized)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Yes, the above outline should work. I've also just added a script to make it a bit easier to write outlines as plain text files with minimal structure. This should make it easy to directly type up outlines from TOCs, etc.
For example:
Contents 0
1. Chapter 1 1
  1.1. Subchapter 1.1. 2
  1.2 Subchapter 1.2 3
2. Chapter 2 4
  2.1 Subchapter 2.1 5
Index 6
The last token per line is interpreted as the destination page (zero-indexed, as in the standard JSON format).
Since it renders to JSON, you can also do various JSON transformations (e.g., handling conversion of front-matter pagination and main body pagination to different offsets).
There seems to be a problem, though, when the indent depth drops from 2 to 0.
For Example:
Contents 0
1. Chapter 1 1
  1.1. Subchapter 1.1. 2
  1.2 Subchapter 1.2 3
2. Chapter 2 4
  2.1 Subchapter 2.1 5
    2.1.1 Subsubchapter 6
Index 7
Then the depth in the JSON drops only one level. I couldn't figure out where the error is yet.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This works for me: