Skip to content

Instantly share code, notes, and snippets.

@gretel
Created May 14, 2025 10:27
Show Gist options
  • Save gretel/3ede39f57e3850e584ed90a5e8089df2 to your computer and use it in GitHub Desktop.
Save gretel/3ede39f57e3850e584ed90a5e8089df2 to your computer and use it in GitHub Desktop.
pdf2json_kreuzberg.py
#!/usr/bin/env python3
"""
pdf2json - Convert PDF documents to structured JSON using Kreuzberg
Usage: pdf2json input.pdf [output.json]
"""
import sys
import os
import json
import asyncio
import argparse
from typing import Dict, Any, Optional
# Import Kreuzberg for document extraction
from kreuzberg import extract_file
async def extract_pdf_to_json(pdf_path: str) -> Dict[str, Any]:
    """Extract a document with Kreuzberg and shape the result as a JSON-ready dict.

    Args:
        pdf_path: Path of the file handed to Kreuzberg's ``extract_file``.

    Returns:
        A dict holding ``metadata``, ``content`` and ``mime_type`` copied from
        the extraction result, a ``tables`` list when the result carries a
        non-empty ``tables`` attribute, and a ``pages`` list (empty when the
        result exposes no pages).

    Exceptions raised by ``extract_file`` are not caught here and propagate
    to the caller.
    """
    result = await extract_file(pdf_path)

    output: Dict[str, Any] = {
        "metadata": result.metadata,
        "content": result.content,
        "mime_type": result.mime_type,
    }

    # Optional attributes are read with getattr so results lacking them are
    # handled the same way as results where they are empty.
    # NOTE(review): assumes each table exposes `.data` (rows of cells) and
    # `.page` -- confirm against the installed Kreuzberg version.
    tables = getattr(result, "tables", None)
    if tables:
        output["tables"] = [
            {"rows": [list(row) for row in table.data], "page": table.page}
            for table in tables
        ]

    # "pages" is always emitted (possibly empty); page numbers are 1-based and
    # follow the order in which the result lists its pages.
    pages = getattr(result, "pages", None) or []
    output["pages"] = [
        {"page_number": number, "content": page.content}
        for number, page in enumerate(pages, start=1)
    ]
    return output
async def process_file(input_path: str, output_path: Optional[str], pretty: bool) -> None:
    """Convert one PDF to JSON and write the result to disk.

    Args:
        input_path: Path of the PDF to convert.
        output_path: Destination file. When empty or None, the input path
            with its extension replaced by ``.json`` is used.
        pretty: Indent the JSON by two spaces when true; otherwise write it
            without indentation.

    Any exception raised during extraction, serialization or writing is
    reported on stderr and the process exits with status 1.
    """
    try:
        json_data = await extract_pdf_to_json(input_path)

        # Set default output filename if not provided
        if not output_path:
            output_path = f"{os.path.splitext(input_path)[0]}.json"

        # Serialize before opening the target: opening with "w" truncates the
        # file, so a serialization error after that point would leave a
        # partial or empty output file behind.
        text = json.dumps(json_data, indent=2 if pretty else None, ensure_ascii=False)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"Converted '{input_path}' to '{output_path}'")
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        print(f"Error converting PDF: {e}", file=sys.stderr)
        sys.exit(1)
def main(argv: Optional[list] = None) -> None:
    """Parse the command line, validate the input path and run the conversion.

    Args:
        argv: Argument strings to parse. None (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour of a plain
            ``main()`` call.

    Exits with status 1 when the input path is not an existing file; argparse
    itself exits with status 2 on malformed arguments.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to structured JSON using Kreuzberg")
    parser.add_argument("input_pdf", help="Input PDF file path")
    parser.add_argument(
        "output_json",
        nargs="?",
        help="Output JSON file path (defaults to input filename with .json extension)",
    )
    parser.add_argument("-p", "--pretty", action="store_true", help="Pretty print JSON output")
    args = parser.parse_args(argv)

    # Validate input file before starting the event loop
    if not os.path.isfile(args.input_pdf):
        print(f"Error: Input file '{args.input_pdf}' does not exist", file=sys.stderr)
        sys.exit(1)

    # Run async extraction process
    asyncio.run(process_file(args.input_pdf, args.output_json, args.pretty))
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment