Created
February 13, 2024 16:04
-
-
Save incubated-geek-cc/287dc16cd13767e2db340aa5c7258b5e to your computer and use it in GitHub Desktop.
Code snippet demo for step 2.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.awt.image.BufferedImage; | |
import java.io.File; | |
import java.io.IOException; | |
import java.nio.file.Path; | |
import java.nio.file.Paths; | |
import java.util.List; | |
import net.sourceforge.tess4j.Tesseract; | |
import net.sourceforge.tess4j.TesseractException; | |
import org.apache.pdfbox.Loader; | |
import org.apache.pdfbox.multipdf.Splitter; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.rendering.ImageType; | |
import org.apache.pdfbox.rendering.PDFRenderer; | |
public class Test { | |
public static void main(String[] args) { | |
Tesseract instance = new Tesseract(); | |
int imageDPI = 300; | |
Path currentRelativePath = Paths.get(""); | |
String s = currentRelativePath.toAbsolutePath().toString(); | |
File dataDir = new File(s, "tessdata"); | |
if(dataDir.exists()) { | |
instance.setDatapath(dataDir.getAbsolutePath()); | |
/* START TO DO LOGIC HERE*/ | |
instance.setLanguage("eng+osd+equ"); | |
/* | |
OCR engine modes (oem) | |
0. Legacy engine only | |
1. Neural nets LSTM engine only | |
2. Legacy + LSTM engines | |
3. Default, based on what is available | |
*/ | |
instance.setOcrEngineMode(1); | |
instance.setTessVariable("user_defined_dpi", imageDPI+""); | |
/* [START] Extract Text from PDF */ | |
try { | |
File selectedFile = new File("sample.pdf"); | |
PDDocument document = Loader.loadPDF(selectedFile); | |
int totalNoOfPages=document.getNumberOfPages(); | |
Splitter splitter = new Splitter(); | |
List<PDDocument> pages = splitter.split(document); | |
PDDocument page = null; | |
for (int p = 0; p < totalNoOfPages; p++) { // FOR-EACH FILE | |
page = pages.get(p); | |
PDFRenderer pdfRenderer = new PDFRenderer(page); | |
BufferedImage tempPageImg = pdfRenderer.renderImageWithDPI(0, imageDPI, ImageType.RGB); | |
String text = instance.doOCR(tempPageImg); | |
System.out.println("[Start] Page "+(p+1)+" ----------------------------------------------"); | |
System.out.println(text); | |
System.out.println("[End] Page "+(p+1)+" ------------------------------------------------"); | |
} | |
/* //END TO DO LOGIC HERE*/ | |
// String text = instance.doOCR(new File("bday_card.jpg")); | |
// System.out.print(text); | |
/* [END] Extract Text from PDF */ | |
} catch (TesseractException | IOException ex) { | |
ex.printStackTrace(); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment