Last active
May 1, 2025 14:23
-
-
Save m3tti/d607fe5d28055b452b0966436ba28cbd to your computer and use it in GitHub Desktop.
Convert PDF to text with pdf.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns pdf-to-txt | |
(:require | |
["pdfjs-dist" :as pdfjs])) | |
;; Point pdf.js at its web worker script on jsDelivr. pdf.js refuses to run
;; when the worker build differs from the API build, so the version segment
;; of the URL is taken from `pdfjs.version` instead of being hard-coded.
(assoc!
 pdfjs.GlobalWorkerOptions
 :workerSrc (str "https://cdn.jsdelivr.net/npm/pdfjs-dist@"
                 pdfjs.version
                 "/build/pdf.worker.mjs"))
;; Returns `txt` with every double-quote character replaced by a single
;; quote.
(defn cleanup-text [txt]
  (-> txt
      (.split "\"")
      (.join "'")))
;; Resolves to a JS array holding one string per text item on page
;; `page-num` of `pdf`: the item's `str` followed by a single space.
(defn ^:async page->txt [pdf page-num]
  (let [page (js-await (.getPage pdf page-num))
        content (js-await (.getTextContent page))]
    (.map (.-items content)
          (fn [item] (str (.-str item) " ")))))
;; Extracts the text of every page of a loaded pdf.js document.
;;
;; `pdf` is the document object that the pdf.js loading task resolves to.
;; Resolves to a single string: the text items of each page are concatenated
;; (they already carry the trailing space added by `page->txt`), pages are
;; separated by "\n", and the result is passed through `cleanup-text`.
(defn ^:async pdf->txt [pdf]
  (console.log "pdf loaded")
  (let [;; Page numbers are 1-based and `range` excludes its end, so go one
        ;; past `numPages` to include the last page.
        page-nums (range 1 (inc (.-numPages pdf)))
        ;; Start every page extraction before awaiting any of them.
        pending (mapv #(page->txt pdf %) page-nums)
        pages (js-await (Promise.all pending))]
    (-> pages
        ;; Collapse each page's item array explicitly; joining the nested
        ;; arrays directly would stringify them with commas between items.
        (.map (fn [items] (.join items "")))
        (.join "\n")
        cleanup-text)))
;; Parses the PDF data held in `reader.result` and passes the extracted text
;; to `finish`.
;;
;; Takes a map with:
;;   :reader - FileReader whose `result` is handed to pdf.js as the
;;             document data
;;   :finish - function called with the extracted text
;; Any other keys in the map are ignored.
;; Returns the promise of the whole chain. A failure while loading the
;; document, converting it, or running `finish` is logged with console.error
;; instead of surfacing as an unhandled rejection.
(defn handle [{:keys [reader finish]}]
  (let [task (pdfjs.getDocument {:data (.-result reader)})]
    (-> (.-promise task)
        (.then pdf->txt)
        (.then finish)
        (.catch (fn [err]
                  (console.error "pdf to text conversion failed" err))))))
;; Builds the FileReader callback for one selected file.
;;
;; finish - optional function that receives the extracted text; when it is
;;          falsey a no-op is used instead
;; reader - the FileReader performing the read
;; file   - the File being read
;; Returns a one-argument function that forwards its argument as :buffer,
;; together with the reader, the file and the callback, to `handle`.
(defn handle-onload [finish reader file]
  (let [on-done (or finish (fn [_] nil))]
    ;; `handle` is defined in this namespace and no `pdf` alias is required
    ;; here, so it is called unqualified.
    (fn [ev]
      (handle {:reader reader :file file :buffer ev :finish on-done}))))
;; Returns a "change" event listener for a file <input>.
;;
;; When the event carries a selected file, the listener reads it with a new
;; FileReader and, once the read has ended, runs the callback built by
;; `handle-onload` from `finish`, the reader and the file.
(defn file-changed-listener [finish]
  (fn [ev]
    ;; The file list can be empty (e.g. the selection was cleared); there is
    ;; nothing to read in that case.
    (when-let [file (get ev.target.files 0)]
      (let [reader (new js/FileReader)]
        (assoc! reader :onloadend (handle-onload finish reader file))
        ;; readAsBinaryString is deprecated; pdf.js accepts an ArrayBuffer
        ;; as document data as well.
        (.readAsArrayBuffer reader file)))))
;; Wires PDF-to-text conversion to the element with DOM id `id`: logs the
;; id, then registers on that element a "change" listener built from
;; `finish`.
(defn attach-pdf-file-handler [{:keys [id finish]}]
  (console.log "attached at element" id)
  (let [input (js/document.getElementById id)
        listener (file-changed-listener finish)]
    (.addEventListener input "change" listener)))
;; Make attach-pdf-file-handler globally available as window.PdfHandler.
(set! js/window.PdfHandler attach-pdf-file-handler)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment