Last active
July 12, 2025 21:20
-
-
Save apcamargo/df154081cf7b56d09a16feddfbb69ef3 to your computer and use it in GitHub Desktop.
Downloads all the assembled metagenomes available in MG-RAST
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import json | |
import re | |
import sys | |
from typing import Generator, Dict, Any, Optional | |
import requests | |
from tqdm import tqdm | |
def fetch_mg_rast_data(verbose: bool = False) -> Generator[Dict[str, Any], None, None]:
    """
    Retrieves metadata of assembled metagenomes using the MG-RAST search API.

    Starting from the initial search URL, each page's ``data`` list is yielded
    entry by entry and the page's ``next`` value is used as the following URL.
    Iteration ends when a page has no ``next`` value or when an error occurs.
    Request, HTTP-status and JSON errors are reported on stderr and end the
    iteration; they are not raised to the caller.

    Args:
        verbose (bool): If True, print detailed progress messages.

    Yields:
        dict: Dictionaries containing the metadata of MG-RAST metagenomes.
    """
    initial_url: str = (
        "https://api.mg-rast.org/search?limit=1000&assembled=true&sequence_type=WGS"
    )
    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # indefinitely on a stalled server.
    request_timeout = (30, 300)
    current_url: Optional[str] = initial_url
    page_count: int = 0
    n_results: int = 0
    if verbose:
        print(f"Starting data fetch from: {initial_url}", file=sys.stderr)
    while current_url:
        page_count += 1
        if verbose:
            print(f"\nFetching page {page_count} from: {current_url}", file=sys.stderr)

        try:
            response = requests.get(current_url, timeout=request_timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}", file=sys.stderr)
            break

        # Decoding is handled in its own try block: recent versions of
        # requests raise a JSON error that is also a RequestException, so a
        # shared try with RequestException listed first would never reach the
        # JSON-specific handler. ValueError is the common base of every JSON
        # decode error that response.json() can raise.
        try:
            data = response.json()
        except ValueError as e:
            print(f"Error decoding JSON response: {e}", file=sys.stderr)
            print(
                f"Problematic response content: {response.text[:500]}...",
                file=sys.stderr,
            )
            break

        if not isinstance(data, dict):
            # Neither 'data' nor 'next' can be read from a non-object payload.
            print(
                "An unexpected error occurred: response payload is "
                f"{type(data).__name__}, expected a JSON object",
                file=sys.stderr,
            )
            break

        entries = data.get("data")
        if isinstance(entries, list):
            # The yield sits outside any try/except so that exceptions thrown
            # into the generator by the consumer are not swallowed here.
            for result in entries:
                n_results += 1
                yield result
            if verbose:
                print(
                    f"Collected {len(entries)} entries from this page. Total collected: {n_results}",
                    file=sys.stderr,
                )
        elif verbose:
            print(
                "Warning: 'data' key not found or is not a list in the response.",
                file=sys.stderr,
            )

        current_url = data.get("next")
        if verbose:
            if current_url:
                print(f"Next page URL found: {current_url}", file=sys.stderr)
            else:
                print("No 'next' URL found. All pages fetched.", file=sys.stderr)
def stream_metagenome_data(
    metagenome_id: str,
    prepend_id: bool = False,
    remove_cov: bool = False,
    verbose: bool = False,
) -> None:
    """
    Stream data from MG-RAST download API for a specific metagenome.

    Requests file ``299.1`` of the given metagenome and prints every non-empty
    line of the response to stdout. Lines starting with ``>`` are treated as
    headers and may be rewritten according to the flags below. Errors are
    reported on stderr and are not raised to the caller.

    Args:
        metagenome_id (str): The MG-RAST metagenome ID to download data for.
        prepend_id (bool): If True, prepend the metagenome ID to the headers.
        remove_cov (bool): If True, remove coverage information from the headers.
        verbose (bool): If True, print progress messages to stderr.
    """
    download_url: str = f"https://api.mg-rast.org/download/{metagenome_id}?file=299.1"
    # Compiled once, outside the per-line loop.
    coverage_suffix = re.compile(r"_\[cov=\d+\]$")
    # Built by plain concatenation rather than as an re.sub replacement so
    # that backslashes or group references in the ID are never interpreted.
    header_prefix = f">{metagenome_id}|"
    if verbose:
        print(f"Starting stream for metagenome: {metagenome_id}", file=sys.stderr)
        print(f"URL: {download_url}", file=sys.stderr)
    try:
        line_count: int = 0
        # The context manager releases the connection even when an error
        # occurs mid-stream; the (connect, read) timeout prevents a stalled
        # server from hanging the download indefinitely.
        with requests.get(download_url, stream=True, timeout=(30, 300)) as response:
            response.raise_for_status()
            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode("utf-8")
                if line.startswith(">"):
                    if remove_cov:
                        line = coverage_suffix.sub("", line)
                    if prepend_id:
                        line = header_prefix + line[1:]
                print(line)
                # Count each emitted line so the verbose summary is accurate.
                line_count += 1
        if verbose:
            print(
                f"Finished streaming {metagenome_id}. Total lines: {line_count}",
                file=sys.stderr,
            )
    except requests.exceptions.RequestException as e:
        print(f"Error streaming data for {metagenome_id}: {e}", file=sys.stderr)
    except Exception as e:
        # Deliberate best-effort boundary: one failed download (e.g. a decode
        # error) is reported without stopping the caller's loop.
        print(f"Unexpected error for {metagenome_id}: {e}", file=sys.stderr)
# Gather the ID of every search result whose "metagenome_id" is a string,
# then order the IDs so the downloads run in sorted order.
metagenomes: list[str] = [
    entry["metagenome_id"]
    for entry in fetch_mg_rast_data()
    if isinstance(entry.get("metagenome_id"), str)
]
metagenomes.sort()

# Stream each metagenome to stdout with the ID prepended to its headers and
# the coverage suffix removed; tqdm shows overall progress.
for m in tqdm(metagenomes, smoothing=0, desc="Downloading metagenomes"):
    stream_metagenome_data(m, remove_cov=True, prepend_id=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment