Last active
August 13, 2020 21:59
-
-
Save zachcp/57e871dcb0869937b86c83a710cc496a to your computer and use it in GitHub Desktop.
NPAtlas DataLevin Test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
wget https://www.npatlas.org/custom/versions/np_atlas_2020_06/NPAtlas_download.json
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(require '[datalevin.core :as d]) | |
(require '[clojure.data.json :as json]) | |
(require '[medley.core :as medley]) | |
(def np-schema
  "Datalevin schema for the NPAtlas entry attributes that are transacted.

  Every attribute only declares a :db/valueType; :npaid additionally
  carries a :db.unique/value constraint. :mibig-id and :gnps-id hold the
  flattened form of an entry's :external_ids."
  (let [string-attr {:db/valueType :db.type/string}
        double-attr {:db/valueType :db.type/double}
        long-attr   {:db/valueType :db.type/long}]
    {:smiles      string-attr
     :exact_mass  double-attr
     :npaid       (assoc long-attr :db/unique :db.unique/value)
     :m_plus_na   double-attr
     :m_plus_h    double-attr
     :mol_formula string-attr
     :name        string-attr
     :id          long-attr
     :species     string-attr
     :genus       string-attr
     :inchi       string-attr
     :inchikey    string-attr
     ;:synthesis {:db/valueType :db.type/string
     ;            :db/cardinality :db.cardinality/many}
     ; :external_ids flattened to the following
     :mibig-id    string-attr
     :gnps-id     string-attr}))
(defn read-np-atlas
  "Slurp `NPAtlas_download.json` (path relative to the working directory)
  and parse it with clojure.data.json, converting object keys to keywords."
  []
  (-> "NPAtlas_download.json"
      slurp
      (json/read-str :key-fn keyword)))
(defn remove-if-empty
  "Return `coll` without the entry for `ky` when that entry's value is empty.

  `{ky []}` becomes `{}`; every other entry is left untouched, and a map
  whose `ky` value is non-empty (or that lacks `ky`) is returned as is.
  A nil value counts as empty because `(empty? nil)` is true."
  [coll ky]
  ;; Only the single key `ky` can ever be removed, so a targeted dissoc is
  ;; enough; there is no need to rebuild the whole map.
  (if (and (contains? coll ky)
           (empty? (get coll ky)))
    (dissoc coll ky)
    coll))
(defn- convert-external-id-map
  "Turn one external-id record into a single-entry map.

  `m` must contain exactly the keys :external_db_name and
  :external_db_code. The result maps the keyword `<db-name>-id` to the
  code, e.g.
    {:external_db_name \"mibig\" :external_db_code \"BGC1\"}
    => {:mibig-id \"BGC1\"}

  Throws AssertionError (when *assert* is enabled) if `m` does not have
  that shape or if the resulting value is not a string."
  [m]
  ;; Conditions must be plain expressions. Wrapping them in #(...) yields
  ;; function objects, which are always truthy, so nothing would be checked.
  {:pre  [(contains? m :external_db_name)
          (contains? m :external_db_code)
          (= 2 (count m))]
   ;; % is the returned map, so inspect its values rather than calling
   ;; `val` on the map itself.
   :post [(every? string? (vals %))]}
  (let [db-name (get m :external_db_name)
        db-code (get m :external_db_code)
        new-id  (keyword (str db-name "-id"))]
    {new-id db-code}))
(defn external-ids-to-map
  "Collapse a collection of external-id records into one map.

  Each element of `coll` is passed through `convert-external-id-map` and
  the resulting single-entry maps are combined left to right, so a later
  record wins when two records produce the same key:
    [{:external_db_name \"mibig\" :external_db_code \"BGC1\"}
     {:external_db_name \"gnps\"  :external_db_code \"CCMS1\"}]
    => {:mibig-id \"BGC1\" :gnps-id \"CCMS1\"}

  An empty or nil `coll` yields {}."
  [coll]
  (into {} (map convert-external-id-map) coll))
(defn flatten-external-ids
  "Replace the :external_ids entry of `m` with the flat key/value pairs
  produced by `external-ids-to-map`.

  When `m` has no truthy :external_ids value it is returned unchanged."
  [m]
  (let [ex-ids (get m :external_ids)]
    (if ex-ids
      (-> m
          (dissoc :external_ids)
          (merge (external-ids-to-map ex-ids)))
      m)))
;; ---- The comment blocks below load the data and then run queries.
(comment
  ;; Experiment 1: check how datalevin performs when the DB is created
  ;; with np-schema.

  ;; Load the data (fetch the JSON with the download script first).
  (def full-np-atlas (read-np-atlas))

  ;; Create the datalevin connection, passing the schema.
  (def conn (d/create-conn np-schema "/tmp/datalevin-test"))

  ;; One transaction per entry. Takes a while. Resulting DB size ~110MB.
  (doseq [entry full-np-atlas
          :let [tx-entity (-> entry
                              ;; handle external_ids
                              (remove-if-empty :external_ids)
                              flatten-external-ids
                              ;(dissoc :origin_organism :origin_reference :reassignments :external_ids :syntheses)
                              (dissoc :origin_organism :origin_reference :reassignments :syntheses :node_id :cluster_id)
                              (assoc :db/id -1))]]
    (d/transact! conn [tx-entity]))

  ;; works fine
  (d/q '[:find ?name ?exact_mass ?e
         :where
         [?e :name ?name]
         [?e :exact_mass ?exact_mass]
         [(>= ?exact_mass 200)]]
       @conn)

  ;; works fine
  (d/q '[:find ?name ?e
         :where
         [?e :name ?name]
         [(= ?name "Rapamycin")]]
       @conn)

  ;; broken
  ;; Execution error (ExceptionInfo) at datalevin.lmdb.LMDB/get_value (lmdb.clj:706).
  ;; Fail to get-value: "Thaw failed against type-id: 78"
  (d/q '[:find ?name ?bcg ?smiles ?e
         :where
         [?e :name ?name]
         [?e :mibig-id ?bcg]
         [?e :smiles ?smiles]]
       @conn)

  ;; broken
  ;; Execution error (ExceptionInfo) at datalevin.lmdb.LMDB/get_value (lmdb.clj:706).
  ;; Fail to get-value: "Thaw failed against type-id: 78"
  (d/q '[:find ?name ?smiles ?e
         :where
         [?e :name ?name]
         [?e :smiles ?smiles]]
       @conn)
  )
(comment
  ;; Experiment 2: check how datalevin performs WITHOUT a schema
  ;; (the connection is created with an empty schema map).
  ;; Requires `full-np-atlas` to be defined already, e.g. via
  ;; (def full-np-atlas (read-np-atlas)).

  (def conn2 (d/create-conn {} "/tmp/datalevin-test-noschema"))

  ;; One transaction per entry, same entry clean-up as the schema run.
  (doseq [entry full-np-atlas]
    (let [mod (-> entry
                  ;; handle external_ids
                  (remove-if-empty :external_ids)
                  (flatten-external-ids)
                  ;(dissoc :origin_organism :origin_reference :reassignments :external_ids :syntheses)
                  (dissoc :origin_organism :origin_reference :reassignments :syntheses :node_id :cluster_id)
                  (assoc :db/id -1))]
      (d/transact! conn2 [mod])))

  ;; works fine
  (d/q '[:find ?name ?exact_mass ?e
         :where
         [?e :name ?name]
         [?e :exact_mass ?exact_mass]
         [(>= ?exact_mass 200)]]
       @conn2)

  ;; works fine
  (d/q '[:find ?name ?e
         :where
         [?e :name ?name]
         [(= ?name "Rapamycin")]]
       @conn2)

  ;; broken
  ;; Execution error (ExceptionInfo) at datalevin.lmdb.LMDB/get_value (lmdb.clj:706).
  ;; Fail to get-value: "Thaw failed against type-id: 16"
  (d/q '[:find ?name ?bcg ?smiles ?e
         :where
         [?e :name ?name]
         [?e :mibig-id ?bcg]
         [?e :smiles ?smiles]]
       @conn2)

  ;; empty set?
  ;; NOTE(review): this query previously dereferenced `conn` (the schema
  ;; DB) although the block is about `conn2`; it now targets conn2. The
  ;; "empty set?" observation was recorded before this change, so
  ;; re-verify it against conn2.
  (d/q '[:find ?name ?smiles ?e
         :where
         [?e :name ?name]
         [?e :smiles ?smiles]]
       @conn2)
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment