Last active
July 8, 2025 18:55
-
-
Save stapelberg/45bd82dec9d299fc1cf481c07f0c051d to your computer and use it in GitHub Desktop.
Nix-runnable version of https://layout-parser.readthedocs.io/en/latest/example/deep_layout_parsing/index.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nodes": { | |
"nixpkgs": { | |
"locked": { | |
"lastModified": 1751477932, | |
"narHash": "sha256-mRbVq/Ht0Sr0kzt9wQ9IyXFIpeFz6DhZ+6if/HOcMFw=", | |
"owner": "NixOS", | |
"repo": "nixpkgs", | |
"rev": "51e8f407be7d7ba35321d9b43acf31fc25fe7cf1", | |
"type": "github" | |
}, | |
"original": { | |
"owner": "NixOS", | |
"repo": "nixpkgs", | |
"type": "github" | |
} | |
}, | |
"root": { | |
"inputs": { | |
"nixpkgs": "nixpkgs" | |
} | |
} | |
}, | |
"root": "root", | |
"version": 7 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Flake exposing img2txt.py as a dev shell, a package and a runnable app.
{
  description = "nix-runnable version of layoutparser image-to-text";

  inputs.nixpkgs.url = "github:NixOS/nixpkgs";

  outputs =
    { self, nixpkgs }:
    let
      # The only platform these outputs are defined for.
      system = "x86_64-linux";
      pkgs = nixpkgs.legacyPackages.${system};

      # Python 3.12 interpreter bundled with the packages made available
      # to img2txt.py.
      pythonWithDeps = pkgs.python312.withPackages (ps: [
        ps.opencv4
        ps.layoutparser
        ps.detectron2
        ps.torchvision
        ps.pytesseract
      ]);

      # Shell wrapper that runs the script shipped in this flake's source
      # tree with the interpreter above, forwarding all arguments.
      img2txt = pkgs.writeShellApplication {
        name = "img2txt";
        runtimeInputs = [ pythonWithDeps ];
        text = ''
          exec ${pythonWithDeps.interpreter} ${self}/img2txt.py "$@"
        '';
      };
    in
    {
      # `nix develop`: shell with the Python environment on PATH.
      devShells.${system}.default = pkgs.mkShell {
        buildInputs = [ pythonWithDeps ];
      };

      # `nix build .#img2txt`
      packages.${system} = {
        inherit img2txt;
      };

      # `nix run .#img2txt -- <image>`
      apps.${system}.img2txt = {
        type = "app";
        program = "${img2txt}/bin/img2txt";
      };
    };
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import layoutparser as lp | |
import cv2 | |
def main():
    """Detect layout regions in an image and print the OCR text of each text block.

    Command-line arguments:
        input:   path to the image file to analyse.
        --debug: additionally save the image with the detected layout boxes
                 drawn on it to /tmp/debug.jpg.
        --lang:  Tesseract language code(s) used for OCR (default: deu).

    Each recognised text block is written to stdout followed by a ``---``
    separator line. Exits via argparse (status 2) when the input image
    cannot be read.
    """
    parser = argparse.ArgumentParser(description="Run layoutparser model on an image and extract text blocks.")
    parser.add_argument("input", help="Path to the input image file")
    parser.add_argument("--debug", action="store_true", help="Enable debug output and save intermediate image")
    parser.add_argument("--lang", default="deu", help="Tesseract language code(s) used for OCR (default: deu)")
    args = parser.parse_args()

    # This program is largely the example from the documentation:
    # https://layout-parser.readthedocs.io/en/latest/example/deep_layout_parsing/index.html
    image = cv2.imread(args.input)
    if image is None:
        # cv2.imread signals a missing or undecodable file by returning None
        # instead of raising; fail here with a clear message rather than
        # deep inside the model.
        parser.error(f"could not read image file: {args.input}")

    model = lp.Detectron2LayoutModel(
        'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    )
    layout = model.detect(image)

    if args.debug:
        # OpenCV loads images in BGR channel order; the upstream example
        # hands draw_box an RGB array, so flip the channels for the overlay
        # to keep the saved colours faithful.
        lp.draw_box(image[..., ::-1], layout, box_width=3).save("/tmp/debug.jpg")

    # Only regions classified as running text are passed on to OCR.
    text_blocks = lp.Layout([b for b in layout if b.type == 'Text'])
    ocr_agent = lp.TesseractAgent(languages=args.lang)
    for block in text_blocks:
        # Padding each segment by a few pixels can help improve OCR
        # robustness.
        segment_image = (block
                         .pad(left=5, right=5, top=5, bottom=5)
                         .crop_image(image))
        text = ocr_agent.detect(segment_image)
        block.set(text=text, inplace=True)

    for txt in text_blocks.get_texts():
        print(txt, end='\n---\n')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment