Last active
September 21, 2023 04:50
-
-
Save farithadnan/7266e9c75bae1516defa5d170239fa22 to your computer and use it in GitHub Desktop.
How to extract text from multiple images using Tesseract OCR in C#?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Runs Tesseract OCR (English language data) over each image and splits the
/// recognized text of every image into individual words.
/// </summary>
/// <param name="imagePaths">Paths of the image files to process.</param>
/// <returns>
/// One inner list per entry of <paramref name="imagePaths"/>, in the same order.
/// Each inner list holds the tokens obtained by splitting that image's recognized
/// text on spaces, tabs, carriage returns and line feeds, with empty entries removed.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="imagePaths"/> is <c>null</c>.
/// </exception>
private static List<List<string>> ExtractTextFromImage(List<string> imagePaths)
{
    // Fail fast with a descriptive exception before the OCR engine is created.
    if (imagePaths is null)
    {
        throw new System.ArgumentNullException(nameof(imagePaths));
    }

    // Exactly one word list is produced per input path, so the final size is known.
    List<List<string>> allTexts = new(imagePaths.Count);

    // Placeholder: replace with the folder that contains the Tesseract language data.
    string tessDataPath = @"C:..\LANGUAGE_DATA_FOLDER_LOCATION";

    // A single engine instance is created once and reused for every image.
    using var engine = new Tesseract.TesseractEngine(tessDataPath, "eng", Tesseract.EngineMode.Default);

    foreach (var imagePath in imagePaths)
    {
        // Load and process the image; both objects are disposed at the end of
        // each iteration because the using declarations are scoped to the loop body.
        using var img = Tesseract.Pix.LoadFromFile(imagePath);
        using var page = engine.Process(img);

        // Fetch the recognized text and break it into whitespace-separated words.
        string extractedText = page.GetText();
        string[] words = extractedText.Split(new[] { ' ', '\t', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
        allTexts.Add(new List<string>(words));
    }

    return allTexts;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment