Skip to content

Instantly share code, notes, and snippets.

@m3tti
Last active May 1, 2025 14:23
Show Gist options
  • Save m3tti/d607fe5d28055b452b0966436ba28cbd to your computer and use it in GitHub Desktop.
Save m3tti/d607fe5d28055b452b0966436ba28cbd to your computer and use it in GitHub Desktop.
Convert PDF to text with pdf.js
(ns pdf-to-txt
(:require
["pdfjs-dist" :as pdfjs]))
;; Tell pdf.js where to load its worker script from.
;; The URL is derived from the version of the imported pdfjs-dist package so
;; that the worker always matches the API version in use (pdf.js refuses to
;; run when the two differ).
(assoc!
 pdfjs.GlobalWorkerOptions
 :workerSrc (str "https://cdn.jsdelivr.net/npm/pdfjs-dist@"
                 pdfjs.version
                 "/build/pdf.worker.mjs"))
;; Replace every double quote in `txt` with a single quote and return the
;; resulting string.
(defn cleanup-text [txt]
  (let [double-quote "\""
        single-quote "'"]
    (.replaceAll txt double-quote single-quote)))
;; Fetch page `page-num` from `pdf`, read its text content and return
;; (asynchronously) an array with one string per text item, each followed by
;; a single space.
(defn ^:async page->txt [pdf page-num]
  (let [pdf-page (js-await (pdf.getPage page-num))
        content (js-await (.getTextContent pdf-page))
        text-items (.-items content)]
    (.map text-items
          (fn [item]
            (str (.-str item) " ")))))
;; Extract the text of every page of a loaded pdf.js document.
;;
;; `pdf` is the document object resolved by the pdf.js loading task.
;; Returns (asynchronously) one string: the text of each page, pages separated
;; by "\n", with double quotes replaced via `cleanup-text`.
(defn ^:async pdf->txt [pdf]
  (console.log "pdf loaded")
  (let [;; pdf.js pages are 1-based and `range` excludes its end, so the upper
        ;; bound must be numPages + 1 or the last page is silently dropped.
        page-nums (range 1 (inc pdf.numPages))
        ;; Start all page extractions eagerly so they run concurrently.
        promises (mapv #(page->txt pdf %) page-nums)
        pages (js-await (Promise.all promises))]
    ;; Each page arrives as an array of fragments; join those explicitly,
    ;; otherwise the outer join stringifies them with commas in between.
    (-> (.map pages (fn [fragments] (.join fragments "")))
        (.join "\n")
        cleanup-text)))
;; Load the PDF held in `reader`'s result with pdf.js, convert it to text with
;; `pdf->txt` and pass that text to `finish`. Returns the resulting promise.
;; `file` and `buffer` are accepted in the options map but not used here.
(defn handle [{:keys [reader file buffer finish]}]
  (let [pdf-data (.-result reader)
        loading-task (pdfjs.getDocument {:data pdf-data})
        text-promise (.then (.-promise loading-task) pdf->txt)]
    (.then text-promise finish)))
;; Build the callback installed on the FileReader.
;;
;; `finish` - function called with the extracted text; when falsy a no-op is
;;            substituted.
;; `reader` - the FileReader whose result holds the file contents.
;; `file`   - the file being read.
;; Returns a one-argument function (it receives the reader's event) that
;; starts the PDF-to-text conversion.
(defn handle-onload [finish reader file]
  (let [on-finish (or finish (fn [_] nil))]
    (fn [ev]
      ;; Call the sibling `handle` directly: this namespace defines no `pdf`
      ;; alias, so the previous `pdf/handle` reference could not resolve.
      (handle {:reader reader :file file :buffer ev :finish on-finish}))))
;; Create a "change" event listener for a file input.
;;
;; `finish` - function that receives the extracted text (may be nil).
;; The returned listener reads the first selected file with a FileReader and
;; hands it to the PDF conversion once reading ends.
(defn file-changed-listener [finish]
  (fn [ev]
    (let [file (get ev.target.files 0)]
      ;; The selection can be empty (e.g. the file dialog was cancelled);
      ;; reading a missing file would throw, so do nothing in that case.
      (when file
        (let [reader (new js/FileReader)]
          (assoc! reader :onloadend
                  (handle-onload finish reader file))
          ;; readAsBinaryString is deprecated; an ArrayBuffer is accepted by
          ;; pdf.js as document data just the same.
          (.readAsArrayBuffer reader file))))))
;; Register a "change" listener on the element with the given `id`; the
;; listener is built by `file-changed-listener` from `finish`.
;; Takes a single options map with the keys `id` and `finish`.
(defn attach-pdf-file-handler [{:keys [id finish]}]
  (console.log "attached at element" id)
  (let [target-el (js/document.getElementById id)
        on-change (file-changed-listener finish)]
    (.addEventListener target-el "change" on-change)))
;; Make attach-pdf-file-handler globally available as window.PdfHandler.
(set! (.-PdfHandler js/window) attach-pdf-file-handler)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment