Last active
July 18, 2025 05:03
-
-
Save Filimoa/6bb345c5e7ec08a7ff46cefe648d04d1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate ground-truth JSON files by parsing sample PDFs with open-parse.
#
# Every "*.pdf" directly inside PDF_DIR is parsed and the parsed document is
# serialized to a same-named ".json" file in GT_DIR. Each parsed node is then
# printed and displayed with its bounding box for manual review.

from pathlib import Path

import openparse

# directory for sample PDFs (relative to the current working directory)
PDF_DIR = Path("../data") / "pdfs"

# output dir
GT_DIR = Path("../data") / "ground_truth"

# Create the output directory up front so the first write does not fail with
# FileNotFoundError on a fresh checkout.
GT_DIR.mkdir(parents=True, exist_ok=True)

parser = openparse.DocumentParser()

for pdf_path in PDF_DIR.glob("*.pdf"):
    parsed_doc = parser.parse(pdf_path)

    # with_suffix() swaps only the final extension; a plain str.replace would
    # also rewrite any ".pdf" occurring earlier in the file name and would
    # miss differently-cased extensions such as ".PDF".
    gt_path = GT_DIR / pdf_path.with_suffix(".json").name

    # Explicit UTF-8 keeps the JSON output independent of the platform's
    # default locale encoding.
    gt_path.write_text(parsed_doc.model_dump_json(), encoding="utf-8")

    pdf = openparse.Pdf(pdf_path)

    ### === OPTIONAL: display nodes === ###
    # useful for filtering out "bad" nodes since open-parse is quite simple in how it parses
    # NOTE(review): nesting reconstructed from data flow (pdf and parsed_doc
    # are per-file values) — confirm this loop is meant to run for every PDF.
    for node in parsed_doc.nodes:
        print(node.text)
        pdf.display_with_bboxes([node], page_nums=[node.start_page])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment