Skip to content

Instantly share code, notes, and snippets.

@gwpl
Last active March 7, 2025 17:06
Show Gist options
  • Save gwpl/aa26593b575c93c76178a1bcc2afa9eb to your computer and use it in GitHub Desktop.
Save gwpl/aa26593b575c93c76178a1bcc2afa9eb to your computer and use it in GitHub Desktop.
OCR with Tesseract including enhancing options leverages adaptive and simple thresholding to boost text extraction accuracy; employs preprocessing for grayscale conversion and bounding box details as differentiators; core features include user-configurable options for grayscale, thresholding, OCR engine, page segmentation, and language support; …
#!/usr/bin/env python3
# OCR with Tesseract, with optional image preprocessing to improve text
# extraction accuracy. Preprocessing can convert the image to grayscale and
# apply either simple (fixed-value) or adaptive thresholding. Command-line
# options control grayscale conversion, thresholding, the OCR engine mode,
# the page segmentation mode, and the recognition language. With
# --bounding_boxes the tool also prints the bounding box and confidence of
# each recognized piece of text; --auto_preprocess selects adaptive
# thresholding automatically.
#
# published:
# * https://github.com/CLIAI/handy_scripts/blob/main/ocr_tesseract.py
# * https://gist.github.com/gwpl/aa26593b575c93c76178a1bcc2afa9eb
#
# Keep sources:
# * https://x.com/i/grok/share/mbO9qX38FWwWq1CvRMvG4rpeJ
# * https://chatgpt.com/share/67c9e100-84d4-8007-b9ef-c44c419e7e13
# * https://www.phind.com/search/cm7xn42px00002v6s3gh7xg8e
import sys
import argparse
from PIL import Image
# Try to import required libraries with helpful error messages if they fail.
def _exit_missing_dependency(module_name, pacman_packages, pip_package):
    """Report a missing third-party module with install hints, then exit.

    The hints are written to stderr so they never mix with OCR output on
    stdout. The process exits with status 1.

    Args:
        module_name: Name used in the failing ``import`` statement.
        pacman_packages: Space-separated ArchLinux package names to suggest.
        pip_package: Name of the package to suggest installing with pip.
    """
    hints = (
        f"ERROR: import {module_name} failed",
        "For ArchLinux users, you may want to install:",
        f"# sudo pacman -S {pacman_packages}",
        "For other distributions, use your package manager or pip:",
        f"# pip install {pip_package}",
    )
    print("\n".join(hints), file=sys.stderr)
    sys.exit(1)


try:
    import cv2
except ImportError:
    _exit_missing_dependency(
        "cv2", "python-opencv opencv-samples opencv", "opencv-python"
    )

try:
    import pytesseract
except ImportError:
    _exit_missing_dependency(
        "pytesseract", "python-pytesseract python-pyocr", "pytesseract"
    )
def _to_grayscale(image):
    """Return a single-channel version of *image*.

    2-D arrays are returned unchanged. 3-D arrays are reduced to one channel;
    the channel order is assumed to be BGR/BGRA (as the conversion codes used
    here expect).
    """
    if image.ndim == 2:
        # Already single-channel; cvtColor(BGR2GRAY) would reject this input.
        return image
    channels = image.shape[2]
    if channels == 1:
        return image[:, :, 0].copy()
    if channels == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


def preprocess_image(image, grayscale=True, threshold=False, threshold_value=150, adaptive=False):
    """Preprocess the image with specified options.

    Args:
        image: Input image array; the caller's array is never modified
            (a copy is taken first).
        grayscale: If true, convert the image to a single channel. Images
            that are already single-channel are passed through.
        threshold: If true, binarize the image (values become 0 or 255).
        threshold_value: Cut-off used by simple thresholding; ignored when
            ``adaptive`` is true.
        adaptive: If true (and ``threshold`` is true), use Gaussian adaptive
            thresholding (block size 11, constant 2) instead of a fixed
            cut-off.

    Returns:
        The processed image as a new array.
    """
    processed = image.copy()
    if grayscale:
        processed = _to_grayscale(processed)
    if threshold:
        if adaptive:
            # adaptiveThreshold only accepts single-channel input, so force
            # grayscale here even if the caller asked to keep colour.
            processed = cv2.adaptiveThreshold(
                _to_grayscale(processed),
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2,
            )
        else:
            _, processed = cv2.threshold(processed, threshold_value, 255, cv2.THRESH_BINARY)
    return processed
def perform_ocr(image_path, preprocess_options, tesseract_config, return_bounding_boxes=False):
    """Run OCR on the image with the given settings.

    Args:
        image_path: Path of the image file to read with OpenCV.
        preprocess_options: Keyword arguments forwarded to
            ``preprocess_image`` (grayscale, threshold, threshold_value,
            adaptive).
        tesseract_config: Configuration string passed to pytesseract as
            ``config``.
        return_bounding_boxes: If true, also run ``image_to_data`` and
            include its dictionary output in the result.

    Returns:
        A dict with key ``"text"`` (the recognized text) and, when
        ``return_bounding_boxes`` is true, key ``"data"`` (the
        ``image_to_data`` output as a dict).

    Raises:
        ValueError: If the image cannot be loaded from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Unable to load image at {image_path}")

    processed_image = preprocess_image(image, **preprocess_options)

    # OpenCV stores colour images as BGR while PIL interprets a 3-channel
    # array as RGB; swap the channels so colours are not inverted when the
    # image was left in colour.
    if processed_image.ndim == 3 and processed_image.shape[2] == 3:
        processed_image = cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(processed_image)

    output = {
        "text": pytesseract.image_to_string(pil_image, config=tesseract_config)
    }
    if return_bounding_boxes:
        output["data"] = pytesseract.image_to_data(
            pil_image,
            config=tesseract_config,
            output_type=pytesseract.Output.DICT,
        )
    return output
def str_to_bool(value):
    """Parse a command-line boolean such as ``True``/``false``/``1``/``no``.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``"False"``) as true; this converter interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: If *value* is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"true", "t", "yes", "y", "1", "on"}:
        return True
    # The empty string stays false, as it was under ``bool("")``.
    if normalized in {"false", "f", "no", "n", "0", "off", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main(argv=None):
    """Parse command-line options, run OCR and print the results.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 if the image could not be loaded.
    """
    parser = argparse.ArgumentParser(description="Tesseract OCR with Preprocessing Options")
    parser.add_argument("image_path", type=str, help="Path to the image file")
    parser.add_argument("--grayscale", type=str_to_bool, default=True, help="Convert image to grayscale")
    parser.add_argument("--threshold", type=str_to_bool, default=True, help="Apply simple thresholding")
    parser.add_argument("--threshold_value", type=int, default=150, help="Threshold value for simple thresholding")
    parser.add_argument("--adaptive", type=str_to_bool, default=False, help="Use adaptive thresholding")
    parser.add_argument("--psm", type=int, default=3, help="Page segmentation mode for Tesseract")
    parser.add_argument("--oem", type=int, default=3, help="OCR Engine mode for Tesseract")
    parser.add_argument("--language", type=str, default="eng", help="Language for OCR")
    parser.add_argument("--bounding_boxes", action="store_true", help="Return bounding box information")
    parser.add_argument("--auto_preprocess", action="store_true", help="Automatically use adaptive thresholding")
    args = parser.parse_args(argv)

    tesseract_config = f"--psm {args.psm} --oem {args.oem} -l {args.language}"
    preprocess_options = {
        "grayscale": args.grayscale,
        "threshold": args.threshold,
        "threshold_value": args.threshold_value,
        # --auto_preprocess is a shortcut for selecting adaptive thresholding.
        "adaptive": args.adaptive or args.auto_preprocess,
    }

    try:
        result = perform_ocr(args.image_path, preprocess_options, tesseract_config, args.bounding_boxes)
    except ValueError as err:
        # Raised by perform_ocr when the image cannot be loaded.
        print(f"ERROR: {err}", file=sys.stderr)
        return 1

    print("Extracted Text:", result["text"])
    if args.bounding_boxes:
        print("\nBounding Box Data:")
        data = result["data"]
        rows = zip(
            data["text"], data["left"], data["top"],
            data["width"], data["height"], data["conf"],
        )
        for text, left, top, width, height, conf in rows:
            # Skip entries that carry no recognized text.
            if text.strip():
                print(f"Text: '{text}' | Box: [{left}, {top}, {width}, {height}] | Confidence: {conf}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment