Skip to content

Instantly share code, notes, and snippets.

@stapelberg
Last active July 8, 2025 18:55
Show Gist options
  • Save stapelberg/45bd82dec9d299fc1cf481c07f0c051d to your computer and use it in GitHub Desktop.
Save stapelberg/45bd82dec9d299fc1cf481c07f0c051d to your computer and use it in GitHub Desktop.
{
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1751477932,
"narHash": "sha256-mRbVq/Ht0Sr0kzt9wQ9IyXFIpeFz6DhZ+6if/HOcMFw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "51e8f407be7d7ba35321d9b43acf31fc25fe7cf1",
"type": "github"
},
"original": {
"owner": "NixOS",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}
{
  description = "nix-runnable version of layoutparser image-to-text";

  inputs.nixpkgs.url = "github:NixOS/nixpkgs";

  outputs =
    { self, nixpkgs }:
    let
      # All outputs below are defined for this single system only.
      system = "x86_64-linux";
      pkgs = nixpkgs.legacyPackages.${system};

      # Python interpreter bundled with the libraries img2txt.py imports.
      pythonEnv = pkgs.python312.withPackages (
        ps: with ps; [
          opencv4
          layoutparser
          detectron2
          torchvision
          pytesseract
        ]
      );

      # Wrapper script that runs the img2txt.py shipped in this flake's source
      # tree with the interpreter from pythonEnv, forwarding all arguments.
      img2txt = pkgs.writeShellApplication {
        name = "img2txt";
        runtimeInputs = [ pythonEnv ];
        text = ''
          exec ${pythonEnv.interpreter} ${self}/img2txt.py "$@"
        '';
      };

      # App definition shared by the named and the default app outputs.
      img2txtApp = {
        type = "app";
        program = "${img2txt}/bin/img2txt";
      };
    in
    {
      devShells.${system}.default = pkgs.mkShell {
        packages = [ pythonEnv ];
      };

      # `default` aliases let `nix build` / `nix run` work without naming the
      # attribute; the explicit `img2txt` attribute is kept for existing callers.
      packages.${system} = {
        inherit img2txt;
        default = img2txt;
      };

      apps.${system} = {
        img2txt = img2txtApp;
        default = img2txtApp;
      };
    };
}
#!/usr/bin/env python
import argparse
import layoutparser as lp
import cv2
# Command-line interface: one positional image path plus an optional --debug switch.
parser = argparse.ArgumentParser(description="Run layoutparser model on an image and extract text blocks.")
parser.add_argument("input", help="Path to the input image file")
parser.add_argument("--debug", action="store_true", help="Enable debug output and save intermediate image")
args = parser.parse_args()

# This program is largely the example from the documentation:
# https://layout-parser.readthedocs.io/en/latest/example/deep_layout_parsing/index.html
image = cv2.imread(args.input)
if image is None:
    # cv2.imread reports a missing or undecodable file by returning None
    # instead of raising, so fail here with a clear message rather than
    # letting the model crash on a None input further down.
    raise SystemExit(f"img2txt: could not read image: {args.input}")

# Layout detection model; only detections scoring at least 0.8 are kept.
model = lp.Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)
layout = model.detect(image)

if args.debug:
    # Save the input image with the detected layout boxes drawn on top.
    lp.draw_box(image, layout, box_width=3).save("/tmp/debug.jpg")

# Only blocks labelled "Text" are passed on to OCR.
text_blocks = lp.Layout([b for b in layout if b.type == 'Text'])

ocr_agent = lp.TesseractAgent(languages='deu')
for block in text_blocks:
    # Adding a little padding around each segment can help improve
    # OCR robustness.
    segment_image = (
        block
        .pad(left=5, right=5, top=5, bottom=5)
        .crop_image(image)
    )
    text = ocr_agent.detect(segment_image)
    # Store the recognised text on the block itself.
    block.set(text=text, inplace=True)

# Print every recognised block followed by a '---' separator line.
for txt in text_blocks.get_texts():
    print(txt, end='\n---\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment