Skip to content

Instantly share code, notes, and snippets.

@gretel
Last active May 14, 2025 10:26
Show Gist options
  • Save gretel/e8c0227164133981c460931986088eba to your computer and use it in GitHub Desktop.
Save gretel/e8c0227164133981c460931986088eba to your computer and use it in GitHub Desktop.
Convert PDF documents to structured JSON (using PyMuPDF)
#!/usr/bin/env python3
"""
pdf2json - Convert PDF documents to structured JSON
Usage: pdf2json [-p] [-i] input.pdf [output.json]
"""
import sys
import os
import json
import argparse
import fitz # PyMuPDF
def _section_from_text_block(block):
    """Turn one text block from ``page.get_text("dict")`` into a section dict.

    Returns ``None`` when the block contains no visible text, so callers can
    skip it.
    """
    spans = [span for line in block["lines"] for span in line["spans"]]

    # Spans are joined with a single space, then surrounding whitespace is
    # trimmed (same result as appending "text + ' '" per span and stripping).
    content = " ".join(span["text"] for span in spans).strip()
    if not content:
        return None

    # Bit 16 of a span's flags marks bold text; one bold span marks the block.
    is_bold = any(span.get("flags", 0) & 16 for span in spans)
    # The largest font size in the block drives the classification.
    font_size = max((span.get("size", 0) for span in spans), default=0)

    # Heuristic classification based on font attributes only.
    if font_size > 14 or (font_size > 11 and is_bold):
        text_type = "heading"
    elif font_size < 9:
        text_type = "footnote"
    else:
        text_type = "paragraph"

    return {
        "type": text_type,
        "content": content,
        "bbox": block["bbox"],  # [x0, y0, x1, y1]
    }


def _image_metadata(doc, image_list):
    """Collect format and dimensions (not binary data) for a page's images."""
    images = []
    for img_index, img_info in enumerate(image_list):
        xref = img_info[0]
        base_image = doc.extract_image(xref)
        if base_image:
            images.append({
                "index": img_index,
                "format": base_image["ext"],
                "width": base_image["width"],
                "height": base_image["height"],
            })
    return images


def extract_pdf_to_json(pdf_path, include_images=False):
    """Extract PDF content to a structured, JSON-serializable dict.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        include_images: When true, every page that has images gets an
            ``"images"`` list holding index, format, width and height for
            each image (metadata only, no pixel data).

    Returns:
        A dict with two keys:

        * ``"metadata"`` - title, author, creator, producer, subject,
          keywords (missing or ``None`` values become ``""``) and
          ``pageCount``.
        * ``"pages"`` - one entry per page with a 1-based ``"page_number"``
          and a ``"sections"`` list. Each section has a ``"type"``
          (``"heading"``, ``"paragraph"`` or ``"footnote"``), its text
          ``"content"`` and its ``"bbox"``.
    """
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        # Guard against a missing metadata dict and against None values so
        # the output always carries strings for these fields.
        metadata = doc.metadata or {}
        result = {
            "metadata": {
                "title": metadata.get("title") or "",
                "author": metadata.get("author") or "",
                "creator": metadata.get("creator") or "",
                "producer": metadata.get("producer") or "",
                "subject": metadata.get("subject") or "",
                "keywords": metadata.get("keywords") or "",
                "pageCount": doc.page_count,
            },
            "pages": [],
        }

        for page_num, page in enumerate(doc, start=1):
            page_data = {
                "page_number": page_num,
                "sections": [],
            }

            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # only type 0 (text) blocks are used
                    continue
                section = _section_from_text_block(block)
                if section is not None:
                    page_data["sections"].append(section)

            # Handle images if requested; the key is only added for pages
            # that actually reference images.
            if include_images:
                image_list = page.get_images(full=True)
                if image_list:
                    page_data["images"] = _image_metadata(doc, image_list)

            result["pages"].append(page_data)

    return result
def main():
    """Command-line entry point: parse arguments, convert the PDF, write JSON.

    Exits with status 1 when the input file does not exist or when the
    conversion or the write of the output file raises an exception.
    """
    cli = argparse.ArgumentParser(description="Convert PDF to structured JSON")
    cli.add_argument("input_pdf", help="Input PDF file path")
    cli.add_argument(
        "output_json",
        nargs="?",
        help="Output JSON file path (defaults to input filename with .json extension)",
    )
    cli.add_argument("-p", "--pretty", action="store_true", help="Pretty print JSON output")
    cli.add_argument("-i", "--images", action="store_true", help="Include image metadata")
    args = cli.parse_args()

    # Refuse to continue when the input path is not an existing file.
    source = args.input_pdf
    if not os.path.isfile(source):
        print(f"Error: Input file '{source}' does not exist", file=sys.stderr)
        sys.exit(1)

    # Without an explicit output path, swap the input's extension for .json.
    destination = args.output_json or f"{os.path.splitext(source)[0]}.json"
    # indent=None is json.dump's compact default; 2 enables pretty printing.
    indent = 2 if args.pretty else None

    try:
        document = extract_pdf_to_json(source, include_images=args.images)
        with open(destination, "w", encoding="utf-8") as out:
            json.dump(document, out, indent=indent, ensure_ascii=False)
        print(f"Converted '{source}' to '{destination}'")
    except Exception as err:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error converting PDF: {err}", file=sys.stderr)
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment