Created
June 17, 2021 21:22
-
-
Save KevinDanikowski/25cdcdda2ef4750bcf443f2027cc375a to your computer and use it in GitHub Desktop.
useTesseract Hook
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { useEffect, useRef, useState } from 'react'
import { createWorker } from 'tesseract.js'
/**
 * React hook that owns one tesseract.js worker and exposes OCR for images.
 *
 * The worker is created on mount, re-initialized whenever `tesseractLanguage`
 * changes, and terminated on unmount.
 *
 * @param {object} [options]
 * @param {string} [options.tesseractLanguage='eng'] Language handed to the
 *   worker's `loadLanguage` and `initialize` calls.
 * @param {boolean} [options.log=false] When true, every message the worker
 *   sends to the logger is echoed with `console.info`.
 * @returns {object} Hook state and actions:
 *   - `imgResults`: `{}` initially, `{ html, text }` after a successful
 *     recognition, `{ error: true }` after a failed one.
 *   - `loadingModel`: true while the worker/language is being (re)initialized.
 *   - `processing`: true while a recognition request is in flight.
 *   - `modelError`: true when the latest (re)initialization failed.
 *   - `progress`: last `progress` value reported by the worker's logger.
 *   - `extractTextFromImage(imageUrl)`: runs OCR on the image; returns a
 *     promise that settles (never rejects) once `imgResults` has been updated.
 */
export default function useTesseract({ tesseractLanguage = 'eng', log = false } = {}) {
  const [loadingModel, setLoadingModel] = useState(true)
  const [modelError, setModelError] = useState(false)
  const [imgResults, setImgResults] = useState({})
  const [processing, setProcessing] = useState(false)
  const [progress, setProgress] = useState(0)

  // Refs instead of state: callbacks created during an older render (and the
  // unmount cleanup) must always see the *current* worker, not a stale closure.
  const workerRef = useRef(null)
  // Promise that resolves to the worker once the current language is initialized.
  const readyRef = useRef(null)
  const mountedRef = useRef(false)
  // The logger is handed to the worker once, so it reads `log` through a ref
  // to honor later changes of the option.
  const logRef = useRef(log)
  logRef.current = log

  // Best-effort shutdown: by the time this runs nobody can act on a failure,
  // so it is reported rather than thrown.
  const terminateWorker = async (worker) => {
    try {
      await worker.terminate()
    } catch (e) {
      console.error('ERROR: Failed to terminate tesseract worker', e.message)
    }
  }

  const logger = (m) => {
    // Messages may still arrive after unmount; ignore them.
    if (!mountedRef.current) {
      return
    }
    setProgress(m.progress)
    if (logRef.current) {
      console.info(m)
    }
  }

  const extractTextFromImage = async (imageUrl) => {
    const ready = readyRef.current
    if (!ready) {
      // Called before mount or after unmount: there is no worker to use.
      console.error('Tesseract Error:', 'worker is not available')
      return
    }
    setProcessing(true)
    try {
      // Waits for the model instead of guessing with a timer.
      const worker = await ready
      const {
        data: {
          hocr: htmlOutput,
          text,
          // tsv, box, unlv
        },
      } = await worker.recognize(imageUrl)
      if (mountedRef.current) {
        setImgResults({ html: htmlOutput, text })
      }
    } catch (e) {
      console.error('Tesseract Error:', e.message)
      if (mountedRef.current) {
        setImgResults({ error: true })
      }
    } finally {
      if (mountedRef.current) {
        setProcessing(false)
      }
    }
  }

  // Lifecycle: track mounted state and release the worker on unmount.
  useEffect(() => {
    mountedRef.current = true
    return () => {
      mountedRef.current = false
      const worker = workerRef.current
      workerRef.current = null
      readyRef.current = null
      if (worker) {
        terminateWorker(worker)
      }
    }
  }, [])

  // (Re)initialize the worker for the requested language.
  useEffect(() => {
    // Set by the cleanup so a superseded run does not overwrite newer state.
    let superseded = false
    const previous = readyRef.current

    setLoadingModel(true)
    setModelError(false)

    const loadTesseract = async () => {
      if (previous) {
        // Let the prior initialization settle so calls do not interleave.
        // Its failure was already reported by that run's own handler.
        await previous.catch(() => {})
        if (!mountedRef.current) {
          throw new Error('unmounted before the tesseract model finished loading')
        }
      }
      let worker = workerRef.current
      if (!worker) {
        worker = createWorker({
          logger,
          // specify paths because sometimes the free CDN goes down
          // corePath: '/static/tesseract-core.wasm.2.2.0.js',
          // workerPath: '/static/tesseract-worker.v2.1.4.min.js',
        })
        workerRef.current = worker
        try {
          await worker.load()
        } catch (e) {
          // A worker whose core failed to load is unusable; drop it so the
          // next attempt starts from a fresh one.
          if (workerRef.current === worker) {
            workerRef.current = null
          }
          terminateWorker(worker)
          throw e
        }
      }
      await worker.loadLanguage(tesseractLanguage)
      await worker.initialize(tesseractLanguage)
      console.info(`INFO: loaded ${tesseractLanguage} tesseract model`)
      return worker
    }

    const ready = loadTesseract()
    readyRef.current = ready
    ready.then(
      () => {
        if (superseded) {
          return
        }
        setModelError(false)
        setLoadingModel(false)
      },
      (e) => {
        if (superseded) {
          return
        }
        console.error(`ERROR: Failed to load tesseract model`, e.message)
        setModelError(true)
        setLoadingModel(false)
      }
    )

    return () => {
      superseded = true
    }
  }, [tesseractLanguage])

  return {
    imgResults,
    loadingModel,
    processing,
    modelError,
    progress,
    extractTextFromImage,
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment