apcamargo · July 12, 2025 21:20
diff --git a/download_mg_rast.py b/download_mg_rast.py
 #!/usr/bin/env python

 import json
 import re
 import sys
 from typing import Generator, Dict, Any, Optional

 import requests
 from tqdm import tqdm


 def fetch_mg_rast_data(
    verbose: bool = False,
    timeout: Optional[float] = 300.0,
 ) -> Generator[Dict[str, Any], None, None]:
    """
    Retrieves metadata of assembled metagenomes using the MG-RAST search API.

    Pages are requested one at a time, starting from the search URL and then
    following the "next" URL of each response until none is returned. Any
    request, decoding, or payload error is reported on stderr and ends the
    iteration early; no exception is propagated to the caller.

    Args:
        verbose (bool): If True, print detailed progress messages.
        timeout (float | None): Value passed as the ``timeout`` argument of
            ``requests.get`` for every page request. Use None to wait
            indefinitely.

    Yields:
        dict: Dictionaries containing the metadata of MG-RAST metagenomes.
    """
    initial_url: str = (
        "https://api.mg-rast.org/search?limit=1000&assembled=true&sequence_type=WGS"
    )
    current_url: Optional[str] = initial_url
    page_count: int = 0
    n_results: int = 0

    if verbose:
        print(f"Starting data fetch from: {initial_url}", file=sys.stderr)

    while current_url:
        page_count += 1
        if verbose:
            print(f"\nFetching page {page_count} from: {current_url}", file=sys.stderr)

        # Only the network round-trip and the JSON decoding are guarded, so
        # that the handlers below never intercept anything raised while the
        # generator is suspended at the yield.
        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except json.JSONDecodeError as e:
            # This handler must come before RequestException: recent versions
            # of Requests raise a JSON error that is also a RequestException,
            # which would otherwise hide the response excerpt printed here.
            print(f"Error decoding JSON response: {e}", file=sys.stderr)
            print(
                f"Problematic response content: {response.text[:500]}...",
                file=sys.stderr,
            )
            break
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}", file=sys.stderr)
            break
        except Exception as e:
            print(f"An unexpected error occurred: {e}", file=sys.stderr)
            break

        # Without a JSON object there is neither a result list nor a "next"
        # link to read, so the crawl cannot continue.
        if not isinstance(data, dict):
            print(
                f"An unexpected error occurred: expected a JSON object, got {type(data).__name__}",
                file=sys.stderr,
            )
            break

        results = data.get("data")
        if isinstance(results, list):
            for result in results:
                n_results += 1
                yield result
            if verbose:
                print(
                    f"Collected {len(results)} entries from this page. Total collected: {n_results}",
                    file=sys.stderr,
                )
        elif verbose:
            print(
                "Warning: 'data' key not found or is not a list in the response.",
                file=sys.stderr,
            )

        # A missing or empty "next" value ends the while loop.
        current_url = data.get("next")
        if verbose:
            if current_url:
                print(f"Next page URL found: {current_url}", file=sys.stderr)
            else:
                print("No 'next' URL found. All pages fetched.", file=sys.stderr)


 def stream_metagenome_data(
    metagenome_id: str,
    prepend_id: bool = False,
    remove_cov: bool = False,
    verbose: bool = False,
    timeout: Optional[float] = 300.0,
 ) -> None:
    """
    Stream data from MG-RAST download API for a specific metagenome.

    The response is read line by line and every non-empty line is printed to
    stdout. Lines starting with ">" are treated as headers and may be
    rewritten according to the flags below. Errors are reported on stderr and
    are not propagated to the caller.

    Args:
        metagenome_id (str): The MG-RAST metagenome ID to download data for.
        prepend_id (bool): If True, prepend the metagenome ID to the headers.
        remove_cov (bool): If True, remove coverage information from the headers.
        verbose (bool): If True, print progress messages to stderr.
        timeout (float | None): Value passed as the ``timeout`` argument of
            ``requests.get``. Use None to wait indefinitely.
    """
    download_url: str = f"https://api.mg-rast.org/download/{metagenome_id}?file=299.1"

    if verbose:
        print(f"Starting stream for metagenome: {metagenome_id}", file=sys.stderr)
        print(f"URL: {download_url}", file=sys.stderr)

    try:
        line_count: int = 0
        # The context manager closes the streamed response even when the loop
        # is interrupted by an error, so the connection is not left open.
        with requests.get(download_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode("utf-8")
                if line.startswith(">"):
                    if remove_cov:
                        # Drop a trailing "_[cov=<digits>]" suffix.
                        line = re.sub(r"_\[cov=\d+\]$", "", line)
                    if prepend_id:
                        # Plain string building: the ID is inserted verbatim
                        # and never interpreted as a regex replacement template.
                        line = f">{metagenome_id}|{line[1:]}"
                print(line)
                # Count every line written so the summary below is accurate.
                line_count += 1

        if verbose:
            print(
                f"Finished streaming {metagenome_id}. Total lines: {line_count}",
                file=sys.stderr,
            )

    except requests.exceptions.RequestException as e:
        print(f"Error streaming data for {metagenome_id}: {e}", file=sys.stderr)
    except Exception as e:
        print(f"Unexpected error for {metagenome_id}: {e}", file=sys.stderr)


 # Gather the ID of every search result whose "metagenome_id" is a string,
 # then order the IDs before downloading.
 metagenomes: list[str] = [
    record["metagenome_id"]
    for record in fetch_mg_rast_data()
    if isinstance(record.get("metagenome_id"), str)
 ]
 metagenomes.sort()

 # Stream each metagenome to stdout with the ID prepended to the headers and
 # the coverage suffix removed, showing a progress bar while doing so.
 for m in tqdm(metagenomes, smoothing=0, desc="Downloading metagenomes"):
    stream_metagenome_data(m, prepend_id=True, remove_cov=True)
	#!/usr/bin/env python

	import json
	import re
	import sys
	from typing import Generator, Dict, Any, Optional

	import requests
	from tqdm import tqdm


	def fetch_mg_rast_data(
	    verbose: bool = False,
	    timeout: Optional[float] = 300.0,
	) -> Generator[Dict[str, Any], None, None]:
	    """
	    Retrieves metadata of assembled metagenomes using the MG-RAST search API.

	    Pages are requested one at a time, starting from the search URL and then
	    following the "next" URL of each response until none is returned. Any
	    request, decoding, or payload error is reported on stderr and ends the
	    iteration early; no exception is propagated to the caller.

	    Args:
	        verbose (bool): If True, print detailed progress messages.
	        timeout (float | None): Value passed as the ``timeout`` argument of
	            ``requests.get`` for every page request. Use None to wait
	            indefinitely.

	    Yields:
	        dict: Dictionaries containing the metadata of MG-RAST metagenomes.
	    """
	    initial_url: str = (
	        "https://api.mg-rast.org/search?limit=1000&assembled=true&sequence_type=WGS"
	    )
	    current_url: Optional[str] = initial_url
	    page_count: int = 0
	    n_results: int = 0

	    if verbose:
	        print(f"Starting data fetch from: {initial_url}", file=sys.stderr)

	    while current_url:
	        page_count += 1
	        if verbose:
	            print(f"\nFetching page {page_count} from: {current_url}", file=sys.stderr)

	        # Only the network round-trip and the JSON decoding are guarded, so
	        # that the handlers below never intercept anything raised while the
	        # generator is suspended at the yield.
	        try:
	            response = requests.get(current_url, timeout=timeout)
	            response.raise_for_status()
	            data = response.json()
	        except json.JSONDecodeError as e:
	            # This handler must come before RequestException: recent versions
	            # of Requests raise a JSON error that is also a RequestException,
	            # which would otherwise hide the response excerpt printed here.
	            print(f"Error decoding JSON response: {e}", file=sys.stderr)
	            print(
	                f"Problematic response content: {response.text[:500]}...",
	                file=sys.stderr,
	            )
	            break
	        except requests.exceptions.RequestException as e:
	            print(f"Error fetching data: {e}", file=sys.stderr)
	            break
	        except Exception as e:
	            print(f"An unexpected error occurred: {e}", file=sys.stderr)
	            break

	        # Without a JSON object there is neither a result list nor a "next"
	        # link to read, so the crawl cannot continue.
	        if not isinstance(data, dict):
	            print(
	                f"An unexpected error occurred: expected a JSON object, got {type(data).__name__}",
	                file=sys.stderr,
	            )
	            break

	        results = data.get("data")
	        if isinstance(results, list):
	            for result in results:
	                n_results += 1
	                yield result
	            if verbose:
	                print(
	                    f"Collected {len(results)} entries from this page. Total collected: {n_results}",
	                    file=sys.stderr,
	                )
	        elif verbose:
	            print(
	                "Warning: 'data' key not found or is not a list in the response.",
	                file=sys.stderr,
	            )

	        # A missing or empty "next" value ends the while loop.
	        current_url = data.get("next")
	        if verbose:
	            if current_url:
	                print(f"Next page URL found: {current_url}", file=sys.stderr)
	            else:
	                print("No 'next' URL found. All pages fetched.", file=sys.stderr)


	def stream_metagenome_data(
	    metagenome_id: str,
	    prepend_id: bool = False,
	    remove_cov: bool = False,
	    verbose: bool = False,
	    timeout: Optional[float] = 300.0,
	) -> None:
	    """
	    Stream data from MG-RAST download API for a specific metagenome.

	    The response is read line by line and every non-empty line is printed to
	    stdout. Lines starting with ">" are treated as headers and may be
	    rewritten according to the flags below. Errors are reported on stderr and
	    are not propagated to the caller.

	    Args:
	        metagenome_id (str): The MG-RAST metagenome ID to download data for.
	        prepend_id (bool): If True, prepend the metagenome ID to the headers.
	        remove_cov (bool): If True, remove coverage information from the headers.
	        verbose (bool): If True, print progress messages to stderr.
	        timeout (float | None): Value passed as the ``timeout`` argument of
	            ``requests.get``. Use None to wait indefinitely.
	    """
	    download_url: str = f"https://api.mg-rast.org/download/{metagenome_id}?file=299.1"

	    if verbose:
	        print(f"Starting stream for metagenome: {metagenome_id}", file=sys.stderr)
	        print(f"URL: {download_url}", file=sys.stderr)

	    try:
	        line_count: int = 0
	        # The context manager closes the streamed response even when the loop
	        # is interrupted by an error, so the connection is not left open.
	        with requests.get(download_url, stream=True, timeout=timeout) as response:
	            response.raise_for_status()

	            for raw_line in response.iter_lines():
	                if not raw_line:
	                    continue
	                line = raw_line.decode("utf-8")
	                if line.startswith(">"):
	                    if remove_cov:
	                        # Drop a trailing "_[cov=<digits>]" suffix.
	                        line = re.sub(r"_\[cov=\d+\]$", "", line)
	                    if prepend_id:
	                        # The separator is a bare "|"; the ID is inserted
	                        # verbatim rather than through a regex replacement
	                        # template, so no backslash reaches the output.
	                        line = f">{metagenome_id}|{line[1:]}"
	                print(line)
	                # Count every line written so the summary below is accurate.
	                line_count += 1

	        if verbose:
	            print(
	                f"Finished streaming {metagenome_id}. Total lines: {line_count}",
	                file=sys.stderr,
	            )

	    except requests.exceptions.RequestException as e:
	        print(f"Error streaming data for {metagenome_id}: {e}", file=sys.stderr)
	    except Exception as e:
	        print(f"Unexpected error for {metagenome_id}: {e}", file=sys.stderr)


	# Gather the ID of every search result whose "metagenome_id" is a string,
	# in sorted order.
	metagenomes: list[str] = sorted(
	    i["metagenome_id"] for i in fetch_mg_rast_data()
	    if isinstance(i.get("metagenome_id"), str)
	)

	# Stream each metagenome to stdout with the ID prepended to the headers and
	# the coverage suffix removed, showing a progress bar while doing so.
	for m in tqdm(metagenomes, smoothing=0, desc="Downloading metagenomes"):
	    stream_metagenome_data(m, prepend_id=True, remove_cov=True)