-
-
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
from __future__ import annotations | |
import argparse | |
import json | |
import sys | |
def main():
    """Parse the command line and write the updated qpdf JSON to stdout.

    The qpdf JSON dump named by ``--json`` and the outline named by
    ``--outline`` are combined by ``build_output_json``; the result is dumped
    to standard output.
    """
    # Pass the text as ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, which would put it in the usage line instead.
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    # Required on the command line, but its value is not read in this function.
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    # ``type=int`` is needed because a value given on the command line would
    # otherwise stay a string and fail when added to the integer page numbers.
    parser.add_argument(
        "--offset",
        help="Page offset to add to each target in the outline JSON",
        type=int,
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Load a qpdf JSON dump and add a new outline tree to it.

    Args:
        json_fname: Path of the JSON document to update. It is expected to
            provide ``pages`` (each with an ``object`` reference) and a
            two-element ``qpdf`` list: a header holding ``maxobjectid`` and
            the object table.
        outline_fname: Path of a JSON file holding a list of outline items
            (see ``add_outline_item`` for the item format).
        offset: Value added to every ``dest`` page index in the outline.

    Returns:
        The loaded JSON document with the new outline objects inserted and
        the catalog's ``/Outlines`` entry pointing at the new outline root.
    """
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(json_fname, encoding="utf-8") as f:
        j = json.load(f)
    with open(outline_fname, encoding="utf-8") as f:
        outline = json.load(f)
    pages = [page["object"] for page in j["pages"]]
    # New objects are numbered after the highest id already in the file.
    ids = ObjectIdAllocator(j["qpdf"][0]["maxobjectid"] + 1)
    catalog = get_catalog(j)
    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"
    bookmarks = [
        add_outline_item(j, pages, item, outlines_id, offset, ids)
        for item in outline
    ]
    # Chain the top-level items together as a doubly linked list.
    for (item_id, bookmark), (next_id,
                              next_bookmark) in zip(bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{item_id} 0 R"
    catalog["/Outlines"] = f"{outlines_id} 0 R"
    # An empty outline has no first/last item; leave the root without them
    # instead of failing on bookmarks[0].
    if bookmarks:
        first_id = bookmarks[0][0]
        outlines["/First"] = f"{first_id} 0 R"
        last_id = bookmarks[-1][0]
        outlines["/Last"] = f"{last_id} 0 R"
    return j
def get_catalog(j):
    """Return the value dictionary of the document's ``/Catalog`` object.

    Scans the object table at ``j["qpdf"][1]`` for the first entry whose key
    starts with ``obj:`` and whose ``value`` is a dictionary with
    ``/Type == "/Catalog"``.

    Raises:
        Exception: If no such object exists.
    """
    for key, obj in j["qpdf"][1].items():
        if not key.startswith("obj:"):
            continue
        if not isinstance(obj, dict):
            continue
        value = obj.get("value")
        # An object's value may be a number, string or list rather than a
        # dictionary; a bare ``"/Type" in value`` raises TypeError for numbers
        # and gives misleading answers for strings and lists.
        if not isinstance(value, dict):
            continue
        if value.get("/Type") == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Insert one outline item, and recursively its children, into ``j``.

    Args:
        j: The JSON document being updated; new objects are added to it.
        pages: Page object references, indexed by zero-based page number.
        item: Outline entry with ``title``, ``dest`` (page index) and an
            optional ``children`` list of further entries.
        parent_id: Object id of the parent outline node.
        offset: Value added to ``dest`` before looking up the page.
        ids: Allocator supplying ids for the new objects.

    Returns:
        A ``(object_id, bookmark_dict)`` tuple for the new item.

    Raises:
        IndexError: If ``dest + offset`` is not a valid page index.
    """
    item_id = ids.next_id()
    title = item["title"]
    page_index = item["dest"] + offset
    # Check the range explicitly: a negative index would otherwise silently
    # select a page counted from the end of the document.
    if not 0 <= page_index < len(pages):
        raise IndexError(
            f"outline item {title!r} targets page {page_index}, but the "
            f"document has {len(pages)} pages")
    page_ref = pages[page_index]
    bookmark = insert_new_object(j, item_id)
    bookmark["/Dest"] = [page_ref, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    bookmark["/Title"] = f"u:{title}"
    children = [
        add_outline_item(j, pages, child, item_id, offset, ids)
        for child in item.get("children") or ()
    ]
    # Chain sibling children together as a doubly linked list.
    for (child_id, bm), (next_child_id,
                         next_bm) in zip(children, children[1:]):
        bm["/Next"] = f"{next_child_id} 0 R"
        next_bm["/Prev"] = f"{child_id} 0 R"
    # An empty "children" list means there is no first/last child to link.
    if children:
        first_id = children[0][0]
        bookmark["/First"] = f"{first_id} 0 R"
        last_id = children[-1][0]
        bookmark["/Last"] = f"{last_id} 0 R"
    return (item_id, bookmark)
def insert_new_object(j, id):
    """Register an empty object under ``obj:<id> 0 R`` and return its value.

    The returned dictionary is the same object that is stored under the
    entry's ``"value"`` key, so callers fill it in place.
    """
    value = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": value}
    return value
class ObjectIdAllocator:
    """Hands out consecutive integer ids, starting from the one given."""

    def __init__(self, next_id: int):
        # The id that the next call to next_id() will return.
        self._next_id = next_id

    def next_id(self):
        """Return the next unused id and advance the counter by one."""
        allocated, self._next_id = self._next_id, self._next_id + 1
        return allocated
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
# Clean up PDF for ingestion
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON
vim outline.json
# Or, alternatively, create the outline as an indented text file and convert it to JSON.
# text_to_json.py prints one JSON object per entry; `jq --slurp '.'` collects them into
# the single array that rewrite-pdf-outline.py expects.
vim outline.txt && ./text_to_json.py --increasing-page-numbers <outline.txt | jq --slurp '.' >outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
#!/usr/bin/env python3 | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """Convert the text outline on stdin to JSON on stdout.

    With ``--increasing-page-numbers``, the reader is asked to reject input
    whose page numbers decrease.
    """
    cli = argparse.ArgumentParser(
        description="Convert a text outline to JSON")
    cli.add_argument("--increasing-page-numbers", action="store_true")
    options = cli.parse_args()
    entries = read_entries(options.increasing_page_numbers)
    render_entries(entries)
def read_entries(validate_increasing_page_numbers):
    """Yield ``(depth, title, page)`` for each outline line read from stdin.

    ``depth`` is the number of indentation levels open at that line, as
    tracked by ``update_indent_stack``. Blank lines are skipped.

    Args:
        validate_increasing_page_numbers: When true, raise if a page number
            is lower than the one on the previous entry.

    Raises:
        Exception: On a decreasing page number (when validation is enabled)
            or on inconsistent indentation.
    """
    prev_page = -1
    indent_stack = initialize_indent_stack()
    # Count lines from 1 so reported line numbers match what an editor shows.
    for (line_number, line) in enumerate(sys.stdin, start=1):
        if not line.strip():
            # A blank line (e.g. at the end of the file) carries no entry and
            # cannot be split into a title and a page number.
            continue
        space, title, page = parse_line(line)
        if validate_increasing_page_numbers and page < prev_page:
            raise Exception(f"decreasing page number at line {line_number}")
        update_indent_stack(space, indent_stack, line_number)
        depth = len(indent_stack)
        yield (depth, title, page)
        prev_page = page
# Matches a run of whitespace; used with match() to grab leading indentation.
WHITESPACE = re.compile(r"\s+")


def parse_line(line):
    """Split one outline line into ``(space, title, page)``.

    The last whitespace-separated token is the page number; everything before
    it is the title, whose leading whitespace is returned separately as
    ``space`` (the empty string when the line is not indented).

    Raises:
        ValueError: If the line does not contain both a title and a page
            number, or if the page number is not an integer.
    """
    line = line.rstrip()
    fields = line.rsplit(maxsplit=1)
    if len(fields) != 2:
        raise ValueError(
            f"expected a title followed by a page number: {line!r}")
    title, page = fields
    m = WHITESPACE.match(title)
    space = m.group(0) if m else ""
    # str is indexed by code point, so len(space) is exactly the number of
    # indentation characters to drop from the title.
    title = title[len(space):]
    try:
        page = int(page)
    except ValueError:
        raise ValueError(
            f"page number is not an integer: {line!r}") from None
    return (space, title, page)
def initialize_indent_stack():
    """Return a fresh, empty indentation stack (a new list on every call)."""
    return list()
def update_indent_stack(space, indent_stack, line_number):
    """Update ``indent_stack`` in place for a line indented by ``space``.

    The stack holds the leading-whitespace string of every indentation level
    currently open, shallowest first, so its length is the line's depth.

    Args:
        space: Leading whitespace of the current line (may be empty).
        indent_stack: List of open indentation strings; modified in place.
        line_number: Line number used in error messages.

    Raises:
        Exception: If ``space`` neither extends the current indentation nor
            exactly matches one of the open levels. The stack is left
            unchanged in that case.
    """
    # We don't care _which_ characters are used to indent as long as they are
    # consistent at each level.
    if not space:
        # Zero out any existing indentation.
        indent_stack.clear()
        return
    if not indent_stack:
        # We have non-empty leading space but an empty stack, so this is the
        # first level of indentation.
        indent_stack.append(space)
        return
    # We have a non-empty indentation stack _and_ non-empty leading space.
    # We need to confirm that there's some level of shared prefix with
    # existing indentation.
    last_indent = indent_stack[-1]
    if len(space) > len(last_indent):
        # Deeper level of indentation than before. This is only valid if it
        # starts with the previous indentation characters.
        if not space.startswith(last_indent):
            raise Exception(f"invalid indentation at line {line_number}")
        indent_stack.append(space)
        return
    # Same or shallower level: the whitespace must be a prefix of the current
    # indentation and must _exactly_ match one of the open levels.
    if not last_indent.startswith(space):
        raise Exception(f"invalid de-indentation at line {line_number}")
    # Every open level is a prefix of the deepest one, so a level of the same
    # length as ``space`` is equal to it. Find how many levels to keep.
    keep = len(indent_stack)
    while keep and len(indent_stack[keep - 1]) != len(space):
        keep -= 1
    if not keep:
        # The width falls between two known levels. Report it like the other
        # indentation errors instead of popping an empty stack.
        raise Exception(f"invalid de-indentation at line {line_number}")
    del indent_stack[keep:]
def render_entries(entries):
    """Build the nested outline from flat ``entries`` and render each tree.

    Each entry is first paired with the depth of the entry after it; the
    top-level call uses a break depth of -1.
    """
    paired = pair_with_next_depth(entries)
    for tree in build_entries(paired, -1):
        render_entry(tree)
def pair_with_next_depth(entries):
    """Yield ``(entry, next_depth)`` for every entry in ``entries``.

    ``next_depth`` is the first element of the entry that follows, or
    ``None`` for the last entry. One entry of look-ahead is consumed before
    each pair is produced.
    """
    stream = iter(entries)
    try:
        current = next(stream)
    except StopIteration:
        # No entries at all: nothing to pair.
        return
    for upcoming in stream:
        yield (current, upcoming[0])
        current = upcoming
    yield (current, None)
def sliding_window(iterator, n):
    """Yield successive overlapping ``n``-tuples taken from ``iterator``.

    Nothing is yielded when the input has fewer than ``n`` items.

    Args:
        iterator: Any iterable; it is consumed exactly once.
        n: Window size.
    """
    # Take a single iterator up front so the pre-fill and the loop below
    # share one position, even when a list or another re-iterable is passed.
    iterator = iter(iterator)
    window = collections.deque(itertools.islice(iterator, n - 1), maxlen=n)
    for x in iterator:
        # maxlen=n makes the deque drop its oldest item automatically.
        window.append(x)
        yield tuple(window)
def _build_entry(item, entries):
    """Build one entry plus its descendants; return ``(entry, following_depth)``.

    ``following_depth`` is the depth of the first entry after this entry's
    whole subtree, or ``None`` if the input ends there.
    """
    (depth, title, page), next_depth = item
    entry = {"title": title, "dest": page}
    if next_depth is not None and next_depth > depth:
        children = []
        for child_item in entries:
            # Each child reports the depth that follows its own subtree, so
            # a drop of several levels is seen by every enclosing level.
            child, next_depth = _build_entry(child_item, entries)
            children.append(child)
            if next_depth is None or next_depth <= depth:
                break
        entry["children"] = children
    return entry, next_depth


def build_entries(entries, break_depth):
    """Yield nested outline entries built from a flat, depth-tagged stream.

    Args:
        entries: Iterable of ``((depth, title, page), next_depth)`` pairs,
            where ``next_depth`` is the depth of the following entry or
            ``None`` for the last one.
        break_depth: Stop after an entry whose subtree is followed by an
            entry at this depth or shallower. Use -1 to consume everything.

    Yields:
        Dictionaries with ``title`` and ``dest`` keys, plus a ``children``
        list for entries that are followed by deeper ones.
    """
    # All recursion levels must advance the same iterator.
    entries = iter(entries)
    for item in entries:
        entry, following_depth = _build_entry(item, entries)
        yield entry
        # Compare against the depth after the entire subtree, not the depth
        # of the entry's first child.
        if following_depth is not None and following_depth <= break_depth:
            return
def render_entry(entry):
    """Write ``entry`` to stdout as JSON, with no trailing newline."""
    sys.stdout.write(json.dumps(entry))
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Do you have an example
outline.json
file you could share?
This works for me:
[
{
"title": "First chapter",
"dest": 0,
"children": [
{
"title": "Subsection one point one",
"dest": 1
},
{
"title": "Subsection one point two",
"dest": 2
}
]
},
{
"title": "Second chapter",
"dest": 3
}
]
Yes, the above outline should work. I've also just added a script to make it a bit easier to write outlines as plain text files with minimal structure. This should make it easy to directly type up outlines from TOCs, etc.
For example:
Contents 0
1. Chapter 1 1
1.1. Subchapter 1.1. 2
1.2 Subchapter 1.2 3
2. Chapter 2 4
2.1 Subchapter 2.1 5
Index 6
The last token per line is interpreted as the destination page (zero-indexed, as in the standard JSON format).
Since it renders to JSON, you can also do various JSON transformations (e.g., handling conversion of front-matter pagination and main body pagination to different offsets).
There seems to be a problem, though, when the indent depth drops from 2 to 0.
For Example:
Contents 0
1. Chapter 1 1
1.1. Subchapter 1.1. 2
1.2 Subchapter 1.2 3
2. Chapter 2 4
2.1 Subchapter 2.1 5
2.1.1 Subsubchapter 6
Index 7
Then the depth in the JSON drops only one level. I couldn't figure out where the error is yet.
Do you have an example
outline.json
file you could share?