stapelberg · July 8, 2025 18:55
diff --git a/flake.lock b/flake.lock
 {
  "nodes": {
    "nixpkgs": {
      "locked": {
        "lastModified": 1751477932,
        "narHash": "sha256-mRbVq/Ht0Sr0kzt9wQ9IyXFIpeFz6DhZ+6if/HOcMFw=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "51e8f407be7d7ba35321d9b43acf31fc25fe7cf1",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
 }
diff --git a/flake.nix b/flake.nix
 {
   description = "nix-runnable version of layoutparser image-to-text";

   inputs.nixpkgs.url = "github:NixOS/nixpkgs";

   outputs =
     { self, nixpkgs }:
     let
       # Only this one platform is wired up; every output below is keyed by it.
       system = "x86_64-linux";
       pkgs = nixpkgs.legacyPackages.${system};

       # Python 3.12 interpreter bundled with the libraries the script needs
       # (img2txt.py imports cv2 and layoutparser directly).
       pythonEnv = pkgs.python312.withPackages (
         ps: with ps; [
           opencv4
           layoutparser
           detectron2
           torchvision
           pytesseract
         ]
       );

       # Wrapper script: runs img2txt.py from this flake's source tree with the
       # interpreter above, forwarding all command-line arguments unchanged.
       img2txt = pkgs.writeShellApplication {
         name = "img2txt";
         runtimeInputs = [ pythonEnv ];
         text = ''
           exec ${pythonEnv.interpreter} ${self}/img2txt.py "$@"
         '';
       };

       # App definition shared by the named and the default app output.
       img2txtApp = {
         type = "app";
         program = "${img2txt}/bin/img2txt";
       };
     in
     {
       devShells.${system}.default = pkgs.mkShell {
         # `packages` is the mkShell attribute intended for tools that should
         # be available inside the development shell.
         packages = [ pythonEnv ];
       };

       packages.${system} = {
         inherit img2txt;
         # Lets `nix build` work without naming an attribute.
         default = img2txt;
       };

       apps.${system} = {
         img2txt = img2txtApp;
         # Lets `nix run` work without naming an attribute.
         default = img2txtApp;
       };
     };
 }
diff --git a/img2txt.py b/img2txt.py
 #!/usr/bin/env python

 import argparse
 import layoutparser as lp
 import cv2

 # Command-line interface: one positional image path plus an optional --debug
 # switch that additionally writes an annotated copy of the image.
 parser = argparse.ArgumentParser(description="Run layoutparser model on an image and extract text blocks.")
 parser.add_argument("input", help="Path to the input image file")
 parser.add_argument("--debug", action="store_true", help="Enable debug output and save intermediate image")
 args = parser.parse_args()

 # This program is largely the example from the documentation:
 # https://layout-parser.readthedocs.io/en/latest/example/deep_layout_parsing/index.html

 image = cv2.imread(args.input)
 if image is None:
     # cv2.imread does not raise on a missing or undecodable file; it returns
     # None. Stop here with a clear message instead of failing later inside
     # the model. parser.error() prints the usage text and exits with status 2.
     parser.error(f"could not read image file: {args.input}")
 # NOTE(review): the upstream example converts the image from BGR to RGB
 # (image[..., ::-1]) right after imread; this script does not -- confirm
 # whether that omission is intentional.

 # Layout detection model; detections scoring below 0.8 are discarded.
 model = lp.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
                                  extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
                                  label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
 layout = model.detect(image)

 if args.debug:
     # Save the input image with every detected box drawn on top.
     lp.draw_box(image, layout, box_width=3).save("/tmp/debug.jpg")

 # Only blocks labelled "Text" are passed on to OCR.
 text_blocks = lp.Layout([b for b in layout if b.type=='Text'])

 # OCR is configured for German ('deu').
 ocr_agent = lp.TesseractAgent(languages='deu')

 for block in text_blocks:
     # Adding padding around each image segment can help improve robustness.
     segment_image = (block
                        .pad(left=5, right=5, top=5, bottom=5)
                        .crop_image(image))

     text = ocr_agent.detect(segment_image)
     block.set(text=text, inplace=True)

 # Print each recognized block followed by a '---' separator line.
 for txt in text_blocks.get_texts():
     print(txt, end='\n---\n')
	{
	"nodes": {
	"nixpkgs": {
	"locked": {
	"lastModified": 1751477932,
	"narHash": "sha256-mRbVq/Ht0Sr0kzt9wQ9IyXFIpeFz6DhZ+6if/HOcMFw=",
	"owner": "NixOS",
	"repo": "nixpkgs",
	"rev": "51e8f407be7d7ba35321d9b43acf31fc25fe7cf1",
	"type": "github"
	},
	"original": {
	"owner": "NixOS",
	"repo": "nixpkgs",
	"type": "github"
	}
	},
	"root": {
	"inputs": {
	"nixpkgs": "nixpkgs"
	}
	}
	},
	"root": "root",
	"version": 7
	}
	{
	  description = "nix-runnable version of layoutparser image-to-text";

	  inputs.nixpkgs.url = "github:NixOS/nixpkgs";

	  outputs =
	    { self, nixpkgs }:
	    let
	      # Only this one platform is wired up; every output below is keyed by it.
	      system = "x86_64-linux";
	      pkgs = nixpkgs.legacyPackages.${system};

	      # Python 3.12 interpreter bundled with the libraries the script needs
	      # (img2txt.py imports cv2 and layoutparser directly).
	      pythonEnv = pkgs.python312.withPackages (
	        ps: with ps; [
	          opencv4
	          layoutparser
	          detectron2
	          torchvision
	          pytesseract
	        ]
	      );

	      # Wrapper script: runs img2txt.py from this flake's source tree with the
	      # interpreter above, forwarding all command-line arguments unchanged.
	      img2txt = pkgs.writeShellApplication {
	        name = "img2txt";
	        runtimeInputs = [ pythonEnv ];
	        text = ''
	          exec ${pythonEnv.interpreter} ${self}/img2txt.py "$@"
	        '';
	      };

	      # App definition shared by the named and the default app output.
	      img2txtApp = {
	        type = "app";
	        program = "${img2txt}/bin/img2txt";
	      };
	    in
	    {
	      devShells.${system}.default = pkgs.mkShell {
	        # `packages` is the mkShell attribute intended for tools that should
	        # be available inside the development shell.
	        packages = [ pythonEnv ];
	      };

	      packages.${system} = {
	        inherit img2txt;
	        # Lets `nix build` work without naming an attribute.
	        default = img2txt;
	      };

	      apps.${system} = {
	        img2txt = img2txtApp;
	        # Lets `nix run` work without naming an attribute.
	        default = img2txtApp;
	      };
	    };
	}
	#!/usr/bin/env python

	import argparse
	import layoutparser as lp
	import cv2

	# Command-line interface: one positional image path plus an optional --debug
	# switch that additionally writes an annotated copy of the image.
	parser = argparse.ArgumentParser(description="Run layoutparser model on an image and extract text blocks.")
	parser.add_argument("input", help="Path to the input image file")
	parser.add_argument("--debug", action="store_true", help="Enable debug output and save intermediate image")
	args = parser.parse_args()

	# This program is largely the example from the documentation:
	# https://layout-parser.readthedocs.io/en/latest/example/deep_layout_parsing/index.html

	image = cv2.imread(args.input)
	if image is None:
	    # cv2.imread does not raise on a missing or undecodable file; it returns
	    # None. Stop here with a clear message instead of failing later inside
	    # the model. parser.error() prints the usage text and exits with status 2.
	    parser.error(f"could not read image file: {args.input}")
	# NOTE(review): the upstream example converts the image from BGR to RGB
	# (image[..., ::-1]) right after imread; this script does not -- confirm
	# whether that omission is intentional.

	# Layout detection model; detections scoring below 0.8 are discarded.
	model = lp.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
	                                 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
	                                 label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
	layout = model.detect(image)

	if args.debug:
	    # Save the input image with every detected box drawn on top.
	    lp.draw_box(image, layout, box_width=3).save("/tmp/debug.jpg")

	# Only blocks labelled "Text" are passed on to OCR.
	text_blocks = lp.Layout([b for b in layout if b.type=='Text'])

	# OCR is configured for German ('deu').
	ocr_agent = lp.TesseractAgent(languages='deu')

	for block in text_blocks:
	    # Adding padding around each image segment can help improve robustness.
	    segment_image = (block
	                       .pad(left=5, right=5, top=5, bottom=5)
	                       .crop_image(image))

	    text = ocr_agent.detect(segment_image)
	    block.set(text=text, inplace=True)

	# Print each recognized block followed by a '---' separator line.
	for txt in text_blocks.get_texts():
	    print(txt, end='\n---\n')