Created
May 14, 2025 10:27
-
-
Save gretel/3ede39f57e3850e584ed90a5e8089df2 to your computer and use it in GitHub Desktop.
pdf2json_kreuzberg.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
pdf2json - Convert PDF documents to structured JSON using Kreuzberg.
Usage: pdf2json [-p] input.pdf [output.json]
""" | |
import sys | |
import os | |
import json | |
import asyncio | |
import argparse | |
from typing import Dict, Any, Optional | |
# Import Kreuzberg for document extraction | |
from kreuzberg import extract_file | |
async def extract_pdf_to_json(pdf_path: str) -> Dict[str, Any]:
    """Extract a document with Kreuzberg and return a JSON-ready dictionary.

    Args:
        pdf_path: Path of the file passed to Kreuzberg's ``extract_file``.

    Returns:
        A dictionary holding the ``"metadata"``, ``"content"`` and
        ``"mime_type"`` values of the extraction result. A ``"tables"`` list
        is added only when the result exposes a non-empty ``tables``
        attribute. ``"pages"`` is a list with one entry per page, numbered
        from 1, and is empty when the result exposes no pages.
    """
    result = await extract_file(pdf_path)

    output: Dict[str, Any] = {
        "metadata": result.metadata,
        "content": result.content,
        "mime_type": result.mime_type,
    }

    # The result object may not carry tables at all, so probe for the
    # attribute instead of assuming it exists.
    tables = getattr(result, "tables", None)
    if tables:
        # NOTE(review): assumes each table exposes `.data` (an iterable of
        # rows) and `.page` -- confirm against the installed Kreuzberg version.
        output["tables"] = [
            {
                "rows": [list(row) for row in table.data],
                "page": table.page,
            }
            for table in tables
        ]

    # NOTE(review): the "pages" key is always emitted (as an empty list when
    # the result has no pages) -- confirm this matches the intended schema.
    pages = getattr(result, "pages", None) or []
    output["pages"] = [
        {"page_number": number, "content": page.content}
        for number, page in enumerate(pages, start=1)
    ]

    return output
async def process_file(input_path: str, output_path: Optional[str], pretty: bool) -> None:
    """Convert one PDF to JSON and write the result to disk.

    Args:
        input_path: Path of the PDF to convert.
        output_path: Destination JSON path. When empty or ``None``, the input
            path with its extension replaced by ``.json`` is used.
        pretty: Indent the JSON with two spaces when true; otherwise write it
            without indentation.

    Any exception raised while extracting, serializing or writing is reported
    on stderr and the process exits with status 1.
    """
    try:
        json_data = await extract_pdf_to_json(input_path)

        # Fall back to "<input without extension>.json" next to the input.
        if not output_path:
            output_path = f"{os.path.splitext(input_path)[0]}.json"

        # Serialize before opening the file: if the data turns out not to be
        # JSON-serializable the failure happens here, so no truncated output
        # file is left behind and an existing file is not clobbered.
        serialized = json.dumps(
            json_data,
            indent=2 if pretty else None,
            ensure_ascii=False,
        )

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(serialized)

        print(f"Converted '{input_path}' to '{output_path}'")
    except Exception as e:
        # Top-level CLI boundary: report the problem and signal failure.
        print(f"Error converting PDF: {e}", file=sys.stderr)
        sys.exit(1)
def main():
    """Parse the command line and run the PDF-to-JSON conversion.

    Prints an error to stderr and exits with status 1 when the input path is
    not an existing file; otherwise runs ``process_file`` to completion.
    """
    cli = argparse.ArgumentParser(
        description="Convert PDF to structured JSON using Kreuzberg"
    )
    cli.add_argument("input_pdf", help="Input PDF file path")
    cli.add_argument(
        "output_json",
        nargs="?",
        help="Output JSON file path (defaults to input filename with .json extension)",
    )
    cli.add_argument(
        "-p",
        "--pretty",
        action="store_true",
        help="Pretty print JSON output",
    )
    options = cli.parse_args()
    source = options.input_pdf

    # Reject a missing input up front so the user gets a clear message.
    if not os.path.isfile(source):
        print(f"Error: Input file '{source}' does not exist", file=sys.stderr)
        sys.exit(1)

    # Drive the async conversion from this synchronous entry point.
    conversion = process_file(source, options.output_json, options.pretty)
    asyncio.run(conversion)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment