Skip to content

Instantly share code, notes, and snippets.

@bsidhom
Last active November 15, 2024 02:11
Show Gist options
  • Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Write an outline specified in JSON format into a PDF document using qpdf
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import sys
def main():
    """Parse the command line and write the updated qpdf JSON to stdout.

    The qpdf JSON dump (``--json``) and the new outline (``--outline``) are
    combined by ``build_output_json`` and the result is dumped to standard
    output. ``--input`` is required on the command line but is not read by
    this function.
    """
    parser = argparse.ArgumentParser("Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    parser.add_argument(
        "--offset",
        help="Page offset to add to each target in the outline JSON",
        # Without an explicit type argparse passes the value through as a
        # str, which breaks the integer page arithmetic downstream.
        type=int,
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Load a qpdf JSON dump and attach a new outline tree to it.

    Args:
        json_fname: Path of the JSON file; it must provide ``pages`` (each
            with an ``object`` reference) and a two-element ``qpdf`` list
            whose first element carries ``maxobjectid``.
        outline_fname: Path of the outline JSON file: a list of items with
            ``title``, ``dest`` and optional ``children``.
        offset: Value added to every ``dest`` before it is used as an index
            into the page list.

    Returns:
        The loaded JSON document, mutated in place: new outline objects are
        inserted and the catalog's ``/Outlines`` entry points at the new
        root.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_fname, encoding="utf-8") as f:
        j = json.load(f)
    with open(outline_fname, encoding="utf-8") as f:
        outline = json.load(f)
    pages = [page["object"] for page in j["pages"]]
    # New objects are numbered after the highest id already in the document.
    next_object_id = j["qpdf"][0]["maxobjectid"] + 1
    ids = ObjectIdAllocator(next_object_id)
    catalog = get_catalog(j)
    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"
    bookmarks = [
        add_outline_item(j, pages, item, outlines_id, offset, ids)
        for item in outline
    ]
    # Chain neighbouring top-level items together via /Next and /Prev.
    for ((bookmark_id, bookmark),
         (next_id, next_bookmark)) in zip(bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{bookmark_id} 0 R"
    catalog["/Outlines"] = f"{outlines_id} 0 R"
    # An empty outline has no first/last item to point at; leave the root
    # without /First and /Last instead of failing with an IndexError.
    if bookmarks:
        first_id = bookmarks[0][0]
        outlines["/First"] = f"{first_id} 0 R"
        last_id = bookmarks[-1][0]
        outlines["/Last"] = f"{last_id} 0 R"
    return j
def get_catalog(j):
    """Return the value dictionary of the object whose /Type is /Catalog.

    Scans the entries of ``j["qpdf"][1]`` whose key starts with ``obj:`` and
    returns the first ``value`` that is a dictionary with
    ``"/Type": "/Catalog"``. The returned dict is the one stored in ``j``, so
    mutating it updates the document.

    Raises:
        Exception: if no such object exists.
    """
    objs = j["qpdf"][1]
    for key, obj in objs.items():
        if not key.startswith("obj:"):
            continue
        if not isinstance(obj, dict):
            continue
        # Entries without a "value" key yield None here and are skipped.
        value = obj.get("value")
        # An object's value may be a number, string, list or null; only a
        # dictionary can carry a /Type, and a membership test on e.g. an int
        # would raise TypeError.
        if not isinstance(value, dict):
            continue
        if value.get("/Type") == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Insert one outline item, and recursively its children, into ``j``.

    Args:
        j: The qpdf JSON document being edited (mutated in place).
        pages: Page object references, indexed by page number.
        item: Outline entry with ``title``, ``dest`` (page index) and an
            optional ``children`` list of further entries.
        parent_id: Object id of the parent outline node.
        offset: Value added to ``dest`` before indexing ``pages``.
        ids: Allocator that supplies fresh object ids.

    Returns:
        A ``(object_id, bookmark_dict)`` tuple for the new item.

    Raises:
        IndexError: if ``dest + offset`` is not a valid index into ``pages``.
    """
    item_id = ids.next_id()
    title = item["title"]
    page_index = item["dest"] + offset
    # Validate explicitly: a negative index would otherwise wrap around and
    # silently point the bookmark at a page counted from the end.
    if not 0 <= page_index < len(pages):
        raise IndexError(
            f"outline item {title!r} targets page index {page_index}, "
            f"but the document has {len(pages)} pages")
    page_ref = pages[page_index]
    bookmark = insert_new_object(j, item_id)
    bookmark["/Dest"] = [page_ref, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    bookmark["/Title"] = f"u:{title}"
    children = [
        add_outline_item(j, pages, child, item_id, offset, ids)
        for child in item.get("children") or []
    ]
    # Chain sibling children together via /Next and /Prev.
    for ((child_id, bm), (next_child_id,
                          next_bm)) in zip(children, children[1:]):
        bm["/Next"] = f"{next_child_id} 0 R"
        next_bm["/Prev"] = f"{child_id} 0 R"
    # Only link /First and /Last when there is at least one child; an empty
    # "children" list is treated like a missing one.
    if children:
        first_id = children[0][0]
        bookmark["/First"] = f"{first_id} 0 R"
        last_id = children[-1][0]
        bookmark["/Last"] = f"{last_id} 0 R"
    return (item_id, bookmark)
def insert_new_object(j, id):
    """Register a new, empty object in ``j`` and return its value dict.

    The object is stored in ``j["qpdf"][1]`` under the key
    ``"obj:<id> 0 R"`` as ``{"value": {}}``; the inner dictionary is returned
    so the caller can fill it in.
    """
    payload = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": payload}
    return payload
class ObjectIdAllocator:
    """Hands out consecutive integer object ids, starting at a given value."""

    def __init__(self, next_id: int):
        # The id that the next call to next_id() will return.
        self._next_id = next_id

    def next_id(self):
        """Return the current id and advance the counter by one."""
        allocated = self._next_id
        self._next_id = allocated + 1
        return allocated
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script.
    main()
# Clean up PDF for ingestion
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON
vim outline.json
# Or, alternatively, create the outline as an indented text file and convert it to JSON.
# text_to_json.py prints one JSON object per top-level entry; `jq --slurp` gathers
# them into the single JSON array that rewrite-pdf-outline.py iterates over.
vim outline.txt && ./text_to_json.py --increasing-page-numbers <outline.txt | jq --slurp '.' >outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
#!/usr/bin/env python3
import argparse
import collections
import itertools
import json
import re
import sys
def main():
    """Convert the text outline on stdin to JSON entries on stdout.

    With ``--increasing-page-numbers`` the reader rejects any line whose page
    number is lower than that of the previous line.
    """
    cli = argparse.ArgumentParser(
        description="Convert a text outline to JSON")
    cli.add_argument("--increasing-page-numbers", action="store_true")
    options = cli.parse_args()
    entries = read_entries(options.increasing_page_numbers)
    render_entries(entries)
def read_entries(validate_increasing_page_numbers):
    """Yield ``(depth, title, page)`` for every outline line on stdin.

    ``depth`` is the number of indentation levels currently open (0 for an
    unindented line). Lines consisting only of whitespace are skipped.

    Args:
        validate_increasing_page_numbers: When true, raise if a line's page
            number is lower than the previous line's.

    Raises:
        Exception: on a decreasing page number (when validation is on) or on
            inconsistent indentation; messages use 1-based line numbers.
    """
    prev_page = -1
    indent_stack = initialize_indent_stack()
    # Count from 1 so reported line numbers match what an editor shows.
    for (line_number, line) in enumerate(sys.stdin, start=1):
        # A blank line (e.g. a trailing newline at the end of the file) has
        # no title/page pair to split and would crash parse_line.
        if not line.strip():
            continue
        space, title, page = parse_line(line)
        if validate_increasing_page_numbers and page < prev_page:
            raise Exception(f"decreasing page number at line {line_number}")
        update_indent_stack(space, indent_stack, line_number)
        depth = len(indent_stack)
        yield (depth, title, page)
        prev_page = page
# Matches a run of whitespace; used with .match() to capture a line's indent.
WHITESPACE = re.compile(r"\s+")


def parse_line(line):
    """Split one outline line into ``(indent, title, page)``.

    The last whitespace-separated token is the page number; everything before
    it, minus the leading whitespace, is the title. The leading whitespace is
    returned verbatim so the caller can compare indentation between lines.

    Raises:
        ValueError: if the line has fewer than two tokens or the last token
            is not an integer.
    """
    body, page_token = line.rstrip().rsplit(maxsplit=1)
    leading = WHITESPACE.match(body)
    space = leading.group(0) if leading else ""
    # str is indexed by code point, so len(space) is the right slice offset.
    return (space, body[len(space):], int(page_token))
def initialize_indent_stack():
    """Return a fresh, empty indentation stack (a plain list)."""
    return list()
def update_indent_stack(space, indent_stack, line_number):
    """Update ``indent_stack`` in place for a line indented by ``space``.

    The stack holds the leading-whitespace string of every open indentation
    level, shallowest first. Which characters are used to indent does not
    matter as long as they are consistent at each level.

    Args:
        space: Leading whitespace of the current line (may be empty).
        indent_stack: List of open indentation strings; mutated in place.
        line_number: Only used in error messages.

    Raises:
        Exception: if a deeper indent does not extend the previous one, or if
            a shallower (or equal-width) indent does not exactly match one of
            the open levels. The stack is left untouched in that case.
    """
    if not space:
        # Zero out any existing indentation.
        indent_stack.clear()
        return
    if not indent_stack:
        # Non-empty leading space but an empty stack: this is the first
        # level of indentation.
        indent_stack.append(space)
        return
    last_indent = indent_stack[-1]
    if len(space) > len(last_indent):
        # Deeper level of indentation than before. This is only valid if it
        # starts with the previous indentation characters.
        if not space.startswith(last_indent):
            raise Exception(f"invalid indentation at line {line_number}")
        indent_stack.append(space)
        return
    # Same or shallower level: it must _exactly_ match one of the open
    # levels. (It's not valid to de-indent to a width that was never used.)
    # Checking membership before popping means a mismatch reports a clear
    # error instead of emptying the stack and failing with an IndexError.
    if space not in indent_stack:
        raise Exception(f"invalid de-indentation at line {line_number}")
    # Levels grow strictly in length, so the match is unique; close
    # everything deeper than it.
    del indent_stack[indent_stack.index(space) + 1:]
def render_entries(entries):
    """Nest the flat ``(depth, title, page)`` entries and print each tree.

    Every entry is paired with the depth of its successor, the pairs are
    folded into nested outline dictionaries, and each top-level dictionary is
    rendered in turn.
    """
    paired = pair_with_next_depth(entries)
    for outline_entry in build_entries(paired, -1):
        render_entry(outline_entry)
def pair_with_next_depth(entries):
    """Yield ``(entry, next_depth)`` for each entry.

    ``next_depth`` is the first element of the following entry, or ``None``
    for the final entry.
    """
    # A trailing None sentinel gives the last real entry a window partner.
    terminated = itertools.chain(entries, (None, ))
    for current, upcoming in sliding_window(terminated, 2):
        following_depth = None if upcoming is None else upcoming[0]
        yield (current, following_depth)
def sliding_window(iterator, n):
    """Yield successive overlapping ``n``-tuples taken from ``iterator``.

    ``sliding_window("ABCD", 2)`` yields ``("A", "B")``, ``("B", "C")``,
    ``("C", "D")``. Nothing is yielded if fewer than ``n`` items are
    available.

    Args:
        iterator: Any iterable; it is consumed exactly once.
        n: Window size.
    """
    # Normalise to a single iterator so the priming islice and the loop below
    # share one stream; a list passed directly would be restarted by the
    # loop and produce wrong windows.
    iterator = iter(iterator)
    window = collections.deque(itertools.islice(iterator, n - 1), maxlen=n)
    for x in iterator:
        window.append(x)
        yield tuple(window)
def build_entries(entries, break_depth):
    """Fold flat ``((depth, title, page), next_depth)`` pairs into a tree.

    Yields one dictionary per entry at the current level, of the form
    ``{"title": ..., "dest": ...}``, with a ``"children"`` list when the
    entries that follow are indented more deeply.

    Args:
        entries: Iterable of ``((depth, title, page), next_depth)`` pairs,
            where ``next_depth`` is the depth of the following entry or
            ``None`` for the last one.
        break_depth: Stop after an entry whose successor has a depth less
            than or equal to this value (``-1`` never stops early for
            non-negative depths).
    """
    # One shared iterator: nested levels consume from the same stream.
    entries = iter(entries)
    for ((depth, title, page), next_depth) in entries:
        entry = {"title": title, "dest": page}
        if next_depth is not None and next_depth > depth:
            # Continue with the depth that follows the *last descendant*,
            # not the stale depth of this entry's first child; otherwise a
            # drop of two or more levels only unwinds a single level.
            entry["children"], next_depth = _collect_children(entries, depth)
        yield entry
        if next_depth is not None and next_depth <= break_depth:
            return


def _collect_children(entries, parent_depth):
    """Consume the descendants of an entry at ``parent_depth``.

    Returns ``(children, next_depth)`` where ``next_depth`` is the depth of
    the first entry after the consumed subtree, or ``None`` at end of input.
    """
    children = []
    for ((depth, title, page), next_depth) in entries:
        child = {"title": title, "dest": page}
        if next_depth is not None and next_depth > depth:
            child["children"], next_depth = _collect_children(entries, depth)
        children.append(child)
        if next_depth is None or next_depth <= parent_depth:
            return children, next_depth
    return children, None
def render_entry(entry):
    """Write ``entry`` to standard output as JSON, with no trailing newline."""
    serialized = json.dumps(entry)
    sys.stdout.write(serialized)
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script.
    main()
@foolishgrunt
Copy link

Do you have an example outline.json file you could share?

@qooxzuub
Copy link

Do you have an example outline.json file you could share?

This works for me:

[
  {
    "title": "First chapter",
    "dest": 0,
    "children": [
      {
        "title": "Subsection one point one",
        "dest": 1
      },
      {
        "title": "Subsection one point two",
        "dest": 2
      }
    ]
  },
  {
    "title": "Second chapter",
    "dest": 3
  }
]

@bsidhom
Copy link
Author

bsidhom commented Sep 20, 2024

Yes, the above outline should work. I've also just added a script to make it a bit easier to write outlines as plain text files with minimal structure. This should make it easy to directly type up outlines from TOCs, etc.

For example:

Contents 0
1. Chapter 1 1
  1.1. Subchapter 1.1. 2
  1.2 Subchapter 1.2 3
2. Chapter 2 4
  2.1 Subchapter 2.1 5
Index 6

The last token per line is interpreted as the destination page (zero-indexed, as in the standard JSON format).

Since it renders to JSON, you can also do various JSON transformations (e.g., handling conversion of front-matter pagination and main body pagination to different offsets).

@Nighel123
Copy link

There seems to be a problem, though, when the indent depth drops from 2 to 0

For Example:

Contents 0
1. Chapter 1 1
  1.1. Subchapter 1.1. 2
  1.2 Subchapter 1.2 3
2. Chapter 2 4
  2.1 Subchapter 2.1 5
    2.1.1 Subsubchapter 6
Index 7

Then the depth in the JSON drops by only one level. I couldn't figure out where the error is yet.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment