Last active
April 16, 2019 13:46
-
-
Save firthh/00be4bf01268377bc306f2ddba09965f to your computer and use it in GitHub Desktop.
Lazily read a gzipped tar archive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(require [clojure.data.xml :as xml]) | |
(import [java.io FileInputStream] | |
[java.util.zip GZIPInputStream] | |
[org.apache.commons.compress.compressors.gzip GzipCompressorInputStream] | |
[org.apache.commons.compress.archivers.tar TarArchiveInputStream TarArchiveEntry]) | |
(defn get-s3-file-stream
  "Fetches the object stored under `key` in `bucket` and returns the value
  found at :input-stream in the response of `s3/get-object`.

  The request is always issued with the endpoint \"eu-west-1\".
  NOTE(review): `s3` is presumably an alias for an S3 client namespace
  (e.g. amazonica.aws.s3) required elsewhere -- confirm."
  [bucket key]
  (let [response (s3/get-object {:endpoint "eu-west-1"}
                                :bucket-name bucket
                                :key key)]
    (:input-stream response)))
(defn extract-files-from-gz-tar
  "Lazily yields the contents of every non-directory entry in a gzipped
  tar archive.

  `in` is a java.io.InputStream positioned at the start of the gzip data.
  Returns a lazy seq of Strings, one per non-directory entry, in archive
  order. Directory entries are skipped.

  The seq pulls from `in` on demand, so it must be realized (e.g. with
  `doall`) before `in` is closed.

  Throws java.io.EOFException if the stream ends before an entry's
  declared size has been read."
  [in]
  (let [^GzipCompressorInputStream gzip-in (GzipCompressorInputStream. in)
        ^TarArchiveInputStream tar-in (TarArchiveInputStream. gzip-in)
        read-entry (fn [^TarArchiveEntry entry]
                     (let [size (.getSize entry)
                           ^bytes buf (byte-array size)]
                       ;; A single .read call may return fewer bytes than
                       ;; requested, so keep reading until the buffer is full.
                       (loop [offset 0]
                         (when (< offset size)
                           (let [n (.read tar-in buf offset (- size offset))]
                             (when (neg? n)
                               (throw (java.io.EOFException.
                                       (str "Unexpected end of archive after "
                                            offset " of " size " bytes"))))
                             (recur (+ offset n)))))
                       ;; Decoded with the platform default charset, as before.
                       (String. buf)))
        entries (fn entries []
                  (lazy-seq
                   ;; getNextEntry returns nil once the archive is exhausted,
                   ;; which ends the sequence.
                   (when-let [^TarArchiveEntry entry (.getNextEntry tar-in)]
                     (if (.isDirectory entry)
                       (entries)
                       (cons (read-entry entry) (entries))))))]
    (entries)))
;; Example usage: parse the first 10 files of the archive as XML.
;; The pipeline is lazy and reads from `stream` on demand, so it is forced
;; with `doall` before `with-open` closes the stream; otherwise the caller
;; would be consuming from an already-closed stream.
(with-open [stream (get-s3-file-stream "bucket" "file.gz.tar")]
  (->> (extract-files-from-gz-tar stream)
       (map xml/parse-str)
       (take 10)
       doall))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment