Created
November 19, 2023 17:34
-
-
Save ekzhang/f521f3de121a52c7cb13b6561d94c147 to your computer and use it in GitHub Desktop.
Fast API for plasmid data from PLSDB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import subprocess | |
import numpy as np | |
import pandas as pd | |
from fastapi import HTTPException | |
from modal import Image, Stub, web_endpoint | |
# Modal application object; the image and the endpoint class are attached to it.
stub = Stub("plasmid-data")

# PLSDB download locations: the metadata archive (tar + bzip2) and the
# bzip2-compressed FASTA file holding every plasmid sequence.
META_LINK = (
    "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plasmids_meta.tar.bz2"
)
FASTA_LINK = (
    "https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/download/plsdb.fna.bz2"
)
def compute_offsets():
    """Compute byte-offsets of each plasmid in the FASTA file for quick access.

    Runs ripgrep over ``/data/plsdb.fna`` to list every header line (a line
    starting with ``>``) together with its byte offset, then writes a JSON
    object mapping each record ID (the header text up to the first space) to
    that offset into ``/data/offsets.json``.

    Raises:
        subprocess.CalledProcessError: If ripgrep exits with a non-zero
            status (an error, or no header line matched). Failing here is
            preferable to silently writing an empty index.
    """
    # `-b` prefixes every match with its byte offset and `-N` suppresses line
    # numbers, so each output line looks like "<offset>:><header text>".
    # An argv list avoids going through a shell; `check=True` surfaces failures.
    result = subprocess.run(
        ["rg", "-b", "^>", "-N", "/data/plsdb.fna"],
        capture_output=True,
        text=True,
        check=True,
    )

    offsets: dict[str, int] = {}
    for line in result.stdout.splitlines():
        if not line:
            continue
        num, desc = line.split(":>", 1)
        # The record ID is the first space-delimited token of the header.
        plasmid_id = desc.split(" ", 1)[0]
        offsets[plasmid_id] = int(num)

    with open("/data/offsets.json", "w") as f:
        json.dump(offsets, f)
# Shell steps that download and unpack the PLSDB archives into /data while
# the image is being built.
_fetch_commands = [
    "mkdir -p /data",
    f"curl -L {META_LINK} | tar xvjf - -C /data",
    f"curl -L {FASTA_LINK} | bunzip2 > /data/plsdb.fna",
]

# Build the image one stage at a time, in the same order as a single chained
# expression would: system tools, data download, Python packages, and finally
# the offset index computed by compute_offsets.
_image = Image.debian_slim()
_image = _image.apt_install("ripgrep", "bzip2", "curl")
_image = _image.run_commands(*_fetch_commands)
_image = _image.pip_install("pandas", "requests", "numpy")
stub.image = _image.run_function(compute_offsets)
@stub.cls(keep_warm=True, allow_concurrent_inputs=64)
class PlasmidData:
    """Serve plasmid metadata, annotations, and sequences from files in /data.

    Attributes set by ``__enter__``:
        offsets: mapping of plasmid ID to the byte offset of its header line
            in ``/data/plsdb.fna`` (loaded from ``/data/offsets.json``).
        metadata: ``/data/plsdb.tsv`` as a DataFrame indexed by the file's
            second column, which ``get`` looks rows up by plasmid ID.
        annotations: ``/data/plsdb.abr`` as a DataFrame; rows are matched to a
            plasmid through the ``qseqid`` column.
    """

    def __enter__(self):
        """Load the offset index and both PLSDB tables into memory."""
        with open("/data/offsets.json") as f:
            self.offsets = json.load(f)
        self.metadata = pd.read_csv(
            "/data/plsdb.tsv", sep="\t", low_memory=False, index_col=1
        )
        self.annotations = pd.read_csv("/data/plsdb.abr", sep="\t")

    @web_endpoint(method="GET", custom_domains=["plasmid-data.modal.ekzhang.com"])
    def get(self, id: str | None = None):
        """Get the metadata for a plasmid.

        Args:
            id: Plasmid ID to look up. When omitted, the listing of all known
                IDs is returned instead of a single record.

        Returns:
            Without ``id``: a dict with ``ids`` (every key of the offset
            index) and ``descriptions`` (the ``NUCCORE_Description`` value for
            each of those IDs). With ``id``: a dict with ``id``, ``metadata``
            (``None`` when the ID has no metadata row), ``annotations``,
            ``header`` (the FASTA header line), and ``sequence`` (the record's
            sequence lines joined together).

        Raises:
            HTTPException: 404 when ``id`` is not present in the offset index.
        """
        if id is None:
            ids = list(self.offsets.keys())
            descriptions = self.metadata["NUCCORE_Description"][ids]
            return {"ids": ids, "descriptions": descriptions.to_list()}

        offset = self.offsets.get(id)
        if offset is None:
            raise HTTPException(404, "Plasmid not found")

        # NaN is swapped for None so missing values come out as null.
        try:
            metadata = self.metadata.loc[id].replace({np.nan: None}).to_dict()
        except KeyError:
            metadata = None

        matching_rows = self.annotations[self.annotations.qseqid == id]
        annotations = [
            row.replace({np.nan: None}).to_dict()
            for _, row in matching_rows.iterrows()
        ]

        # The index stores *byte* offsets (from `rg -b`). Seeking a text-mode
        # file to an offset that did not come from tell() is undefined, so the
        # file is read in binary mode and decoded afterwards.
        with open("/data/plsdb.fna", "rb") as f:
            f.seek(offset)
            header = f.readline().decode().rstrip()
            chunks: list[bytes] = []
            for line in f:
                # The next header line marks the end of this record.
                if line.startswith(b">"):
                    break
                chunks.append(line.rstrip())
        sequence = b"".join(chunks).decode()

        return {
            "id": id,
            "metadata": metadata,
            "annotations": annotations,
            "header": header,
            "sequence": sequence,
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment