Last active
May 14, 2025 10:26
-
-
Save gretel/e8c0227164133981c460931986088eba to your computer and use it in GitHub Desktop.
Convert PDF documents to structured JSON (using PyMuPDF)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
pdf2json - Convert PDF documents to structured JSON | |
Usage: pdf2json input.pdf [output.json] | |
""" | |
import sys | |
import os | |
import json | |
import argparse | |
import fitz # PyMuPDF | |
# Bit value tested against a span's "flags" field to detect bold text.
_BOLD_FLAG = 16

# Metadata keys copied from the document, in the order they appear in the output.
_METADATA_KEYS = ("title", "author", "creator", "producer", "subject", "keywords")


def _classify_text(font_size, is_bold):
    """Return "heading", "footnote" or "paragraph" for a block's font attributes."""
    if font_size > 14 or (font_size > 11 and is_bold):
        return "heading"
    if font_size < 9:
        return "footnote"
    return "paragraph"


def _section_from_block(block):
    """Build the section dict for one text block, or None if it has no text."""
    spans = [span for line in block["lines"] for span in line["spans"]]

    # Spans are separated by a single space; surrounding whitespace is dropped.
    content = " ".join(span["text"] for span in spans).strip()
    if not content:
        return None

    is_bold = any(span.get("flags", 0) & _BOLD_FLAG for span in spans)
    # The largest span size in the block decides the classification.
    font_size = max([0, *(span.get("size", 0) for span in spans)])

    return {
        "type": _classify_text(font_size, is_bold),
        "content": content,
        "bbox": block["bbox"],  # [x0, y0, x1, y1]
    }


def _image_metadata(doc, image_list):
    """Return format/width/height records for the images that can be extracted."""
    records = []
    for index, image_info in enumerate(image_list):
        base_image = doc.extract_image(image_info[0])  # first item is the xref
        if base_image:
            # Only metadata is kept; the binary image data is discarded.
            records.append({
                "index": index,
                "format": base_image["ext"],
                "width": base_image["width"],
                "height": base_image["height"],
            })
    return records


def extract_pdf_to_json(pdf_path, include_images=False):
    """Extract PDF content into a JSON-serialisable dictionary.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        include_images: When true, pages that contain images also get an
            "images" list holding each image's index, format, width and height.

    Returns:
        A dict with a "metadata" entry (title, author, creator, producer,
        subject, keywords, pageCount) and a "pages" list. Each page holds its
        1-based "page_number" and a "sections" list of text blocks, each
        classified as "heading", "paragraph" or "footnote" from its font size
        and bold flag.

    Raises:
        Whatever PyMuPDF raises while opening or reading the document; the
        document is closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # Metadata (or individual values) may be None; always emit strings.
        raw_metadata = doc.metadata or {}
        metadata = {key: raw_metadata.get(key) or "" for key in _METADATA_KEYS}
        metadata["pageCount"] = doc.page_count

        result = {"metadata": metadata, "pages": []}

        for page_num, page in enumerate(doc):
            page_data = {
                "page_number": page_num + 1,
                "sections": [],
            }

            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # only type 0 (text) blocks are processed
                    continue
                section = _section_from_block(block)
                if section is not None:
                    page_data["sections"].append(section)

            if include_images:
                image_list = page.get_images(full=True)
                # The "images" key is only present on pages that list images.
                if image_list:
                    page_data["images"] = _image_metadata(doc, image_list)

            result["pages"].append(page_data)

        return result
    finally:
        # Release the document even when extraction fails part-way through.
        doc.close()
def main():
    """Command-line entry point: convert one PDF file to a JSON file.

    Parses the arguments, checks that the input file exists, derives the
    output path from the input name when none is given, then writes the
    extracted data as JSON. Exits with status 1 if the input file is missing
    or if the conversion raises.
    """
    cli = argparse.ArgumentParser(description="Convert PDF to structured JSON")
    cli.add_argument("input_pdf", help="Input PDF file path")
    cli.add_argument(
        "output_json",
        nargs="?",
        help="Output JSON file path (defaults to input filename with .json extension)",
    )
    cli.add_argument(
        "-p", "--pretty", action="store_true", help="Pretty print JSON output"
    )
    cli.add_argument(
        "-i", "--images", action="store_true", help="Include image metadata"
    )
    options = cli.parse_args()

    source = options.input_pdf
    if not os.path.isfile(source):
        print(f"Error: Input file '{source}' does not exist", file=sys.stderr)
        sys.exit(1)

    # Without an explicit output path, swap the input's extension for ".json".
    destination = options.output_json or os.path.splitext(source)[0] + ".json"

    # indent=None is json.dump's default, i.e. the compact form.
    indent = 2 if options.pretty else None

    try:
        payload = extract_pdf_to_json(source, include_images=options.images)
        with open(destination, "w", encoding="utf-8") as out_file:
            json.dump(payload, out_file, indent=indent, ensure_ascii=False)
        print(f"Converted '{source}' to '{destination}'")
    except Exception as e:
        print(f"Error converting PDF: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment