Skip to content

Instantly share code, notes, and snippets.

@incubated-geek-cc
Created February 13, 2024 16:04
Show Gist options
  • Save incubated-geek-cc/287dc16cd13767e2db340aa5c7258b5e to your computer and use it in GitHub Desktop.
Save incubated-geek-cc/287dc16cd13767e2db340aa5c7258b5e to your computer and use it in GitHub Desktop.
Code snippet demo for step 2.
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
/**
 * Demo: run Tesseract OCR (via Tess4J) over every page of {@code sample.pdf}
 * and print the recognised text page by page to standard output.
 *
 * <p>The program expects a {@code tessdata} directory in the current working
 * directory and a {@code sample.pdf} file resolvable from the working directory.
 */
public class Test {

    /** Resolution used to rasterise PDF pages; also passed to Tesseract as {@code user_defined_dpi}. */
    private static final int IMAGE_DPI = 300;

    /**
     * Entry point: configures Tesseract, then OCRs {@code sample.pdf}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Path currentRelativePath = Paths.get("");
        String workingDir = currentRelativePath.toAbsolutePath().toString();
        File dataDir = new File(workingDir, "tessdata");
        if (!dataDir.exists()) {
            // Previously the program exited silently here, which made a missing
            // tessdata folder hard to diagnose.
            System.err.println("tessdata directory not found: " + dataDir.getAbsolutePath());
            return;
        }

        Tesseract instance = new Tesseract();
        instance.setDatapath(dataDir.getAbsolutePath());
        instance.setLanguage("eng+osd+equ");
        /*
        OCR engine modes (oem)
        0. Legacy engine only
        1. Neural nets LSTM engine only
        2. Legacy + LSTM engines
        3. Default, based on what is available
        */
        instance.setOcrEngineMode(1);
        instance.setTessVariable("user_defined_dpi", String.valueOf(IMAGE_DPI));

        /* [START] Extract Text from PDF */
        try {
            printPdfText(instance, new File("sample.pdf"));
        } catch (TesseractException | IOException ex) {
            ex.printStackTrace();
        }
        /* [END] Extract Text from PDF */
    }

    /**
     * Renders each page of the given PDF to an RGB image and prints its OCR text.
     *
     * @param ocr     configured Tesseract instance
     * @param pdfFile PDF file to read
     * @throws IOException        if the PDF cannot be loaded or a page cannot be rendered
     * @throws TesseractException if OCR fails on a page
     */
    private static void printPdfText(Tesseract ocr, File pdfFile)
            throws IOException, TesseractException {
        // try-with-resources guarantees the document is closed even when
        // rendering or OCR throws part-way through.
        try (PDDocument document = Loader.loadPDF(pdfFile)) {
            int totalNoOfPages = document.getNumberOfPages();
            // One renderer for the whole document; pages are addressed by index,
            // so no per-page PDDocument copies are created (and none are leaked).
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            for (int p = 0; p < totalNoOfPages; p++) { // for each page
                BufferedImage pageImg = pdfRenderer.renderImageWithDPI(p, IMAGE_DPI, ImageType.RGB);
                String text = ocr.doOCR(pageImg);
                System.out.println("[Start] Page "+(p+1)+" ----------------------------------------------");
                System.out.println(text);
                System.out.println("[End] Page "+(p+1)+" ------------------------------------------------");
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment