Skip to content

Instantly share code, notes, and snippets.

@apcamargo
Last active July 12, 2025 21:20
Show Gist options
  • Save apcamargo/df154081cf7b56d09a16feddfbb69ef3 to your computer and use it in GitHub Desktop.
Save apcamargo/df154081cf7b56d09a16feddfbb69ef3 to your computer and use it in GitHub Desktop.
Downloads all the assembled metagenomes available in MG-RAST
#!/usr/bin/env python
import json
import re
import sys
from typing import Generator, Dict, Any, Optional
import requests
from tqdm import tqdm
def fetch_mg_rast_data(verbose: bool = False) -> Generator[Dict[str, Any], None, None]:
    """
    Retrieves metadata of assembled metagenomes using the MG-RAST search API.

    Pages through the search endpoint by following the "next" URL found in
    each JSON response until no further URL is returned. A request failure,
    an undecodable response, or a response that is not a JSON object is
    reported on stderr and ends the iteration early, so the caller may
    receive only a partial result set.

    Args:
        verbose (bool): If True, print detailed progress messages.

    Yields:
        dict: Dictionaries containing the metadata of MG-RAST metagenomes.
    """
    initial_url: str = (
        "https://api.mg-rast.org/search?limit=1000&assembled=true&sequence_type=WGS"
    )
    # (connect, read) timeouts in seconds. Without a timeout a stalled
    # server would block this generator forever.
    timeout = (30, 300)
    current_url: Optional[str] = initial_url
    page_count: int = 0
    n_results: int = 0
    if verbose:
        print(f"Starting data fetch from: {initial_url}", file=sys.stderr)
    while current_url:
        page_count += 1
        if verbose:
            print(f"\nFetching page {page_count} from: {current_url}", file=sys.stderr)

        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}", file=sys.stderr)
            break

        # Decoding is handled separately from the request: recent versions of
        # requests raise a JSON error that is also a RequestException, so a
        # shared try block would report it as a fetch error and never show
        # the offending payload. ValueError covers every JSON decode error
        # variant that response.json() can raise.
        try:
            data = response.json()
        except ValueError as e:
            print(f"Error decoding JSON response: {e}", file=sys.stderr)
            print(
                f"Problematic response content: {response.text[:500]}...",
                file=sys.stderr,
            )
            break

        if not isinstance(data, dict):
            # Pagination relies on the "next" key of a JSON object; anything
            # else cannot be followed, so stop with a clear message.
            print(
                "An unexpected error occurred: expected a JSON object in the "
                f"response, got {type(data).__name__}",
                file=sys.stderr,
            )
            break

        results = data.get("data")
        if isinstance(results, list):
            # Yielding happens outside any try block so that exceptions
            # raised by (or thrown in from) the consumer are not swallowed.
            for result in results:
                n_results += 1
                yield result
            if verbose:
                print(
                    f"Collected {len(results)} entries from this page. Total collected: {n_results}",
                    file=sys.stderr,
                )
        elif verbose:
            print(
                "Warning: 'data' key not found or is not a list in the response.",
                file=sys.stderr,
            )

        # Only a string can be requested as the next page; treat any other
        # value the same as a missing "next" key.
        next_url = data.get("next")
        current_url = next_url if isinstance(next_url, str) else None
        if verbose:
            if current_url:
                print(f"Next page URL found: {current_url}", file=sys.stderr)
            else:
                print("No 'next' URL found. All pages fetched.", file=sys.stderr)
def stream_metagenome_data(
    metagenome_id: str,
    prepend_id: bool = False,
    remove_cov: bool = False,
    verbose: bool = False,
) -> None:
    """
    Stream data from MG-RAST download API for a specific metagenome.

    Non-empty lines of the response are printed to stdout as they arrive.
    Header lines (those starting with ">") can be rewritten on the fly; all
    other lines are passed through unchanged. Errors are reported on stderr
    and are not raised, so one failed download does not abort the caller.

    Args:
        metagenome_id (str): The MG-RAST metagenome ID to download data for.
        prepend_id (bool): If True, prepend the metagenome ID to the headers.
        remove_cov (bool): If True, remove coverage information from the headers.
        verbose (bool): If True, print progress messages to stderr.
    """
    download_url: str = f"https://api.mg-rast.org/download/{metagenome_id}?file=299.1"
    # Compiled once instead of on every header line.
    cov_suffix = re.compile(r"_\[cov=\d+\]$")
    # (connect, read) timeouts in seconds. Without a timeout a stalled
    # server would block the download forever.
    timeout = (30, 300)
    if verbose:
        print(f"Starting stream for metagenome: {metagenome_id}", file=sys.stderr)
        print(f"URL: {download_url}", file=sys.stderr)
    line_count: int = 0
    try:
        # The context manager releases the streamed connection even when
        # iteration stops early because of an error.
        with requests.get(download_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode("utf-8")
                if line.startswith(">"):
                    if remove_cov:
                        line = cov_suffix.sub("", line)
                    if prepend_id:
                        # Plain string formatting rather than re.sub: the ID
                        # must never be interpreted as a regex replacement
                        # template (backslashes, group references).
                        line = f">{metagenome_id}|{line[1:]}"
                print(line)
                line_count += 1
        if verbose:
            print(
                f"Finished streaming {metagenome_id}. Total lines: {line_count}",
                file=sys.stderr,
            )
    except requests.exceptions.RequestException as e:
        print(f"Error streaming data for {metagenome_id}: {e}", file=sys.stderr)
    except Exception as e:
        # Deliberate best-effort boundary: any other failure (e.g. a decoding
        # error) is reported and the function returns normally.
        print(f"Unexpected error for {metagenome_id}: {e}", file=sys.stderr)
# Gather the ID of every metagenome returned by the search, skipping records
# whose "metagenome_id" is missing or not a string, then order them.
metagenomes: list[str] = [
    record["metagenome_id"]
    for record in fetch_mg_rast_data()
    if isinstance(record.get("metagenome_id"), str)
]
metagenomes.sort()

# Download the metagenomes one by one, printing each to stdout with the
# metagenome ID prepended to headers and coverage tags removed.
for metagenome_id in tqdm(metagenomes, smoothing=0, desc="Downloading metagenomes"):
    stream_metagenome_data(metagenome_id, prepend_id=True, remove_cov=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment