@dmachi · Created March 20, 2025 14:35
Bulk Downloader for public net.science/sciduct files
import os
import re
import concurrent.futures
from typing import Optional

import requests
import typer
from tqdm import tqdm

BASE_URL = "https://sciduct.bii.virginia.edu/fs/file/"


def fetch_metadata(path):
    """Fetch metadata of a file or folder from the API using its full path."""
    url = f"{BASE_URL}{path}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    print(f"Failed to fetch metadata for {path}: {response.status_code}")
    return None


def download_file(file_name, parent_path, local_dir, failed_downloads_file):
    """Download a file from the API and save it locally using its full path, skipping it if a complete copy exists."""
    file_path = f"{parent_path}/{file_name}" if parent_path else file_name
    url = f"{BASE_URL}{file_path}"
    headers = {"Accept": "application/octet-stream"}
    local_path = os.path.join(local_dir, file_name)

    # Skip the download if the local file already exists with the expected size.
    metadata = fetch_metadata(file_path)
    expected_size = metadata.get("size") if metadata else None
    if os.path.exists(local_path) and expected_size is not None:
        if os.path.getsize(local_path) == expected_size:
            print(f"Skipping already downloaded file: {local_path}")
            return

    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        total_size = int(response.headers.get("content-length", 0))
        with open(local_path, "wb") as file, tqdm(
            desc=f"Downloading {file_name}",
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
                progress_bar.update(len(chunk))
        print(f"Downloaded: {local_path}")
    else:
        print(f"Failed to download file {file_path}: {response.status_code}")
        with open(failed_downloads_file, "a") as f:
            f.write(f"{url}\n")


def process_folder(folder_name, parent_path, local_dir, failed_downloads_file, max_concurrency, file_regex):
    """Process a folder by recursively downloading its contents using its full path."""
    folder_path_api = f"{parent_path}/{folder_name}" if parent_path else folder_name
    metadata = fetch_metadata(folder_path_api)
    if not metadata or not metadata.get("isContainer", False):
        print(f"Skipping non-container: {folder_path_api}")
        return

    folder_path = os.path.join(local_dir, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    print(f"Created directory: {folder_path}")

    # Fetch contents (the trailing slash asks the API for a folder listing)
    url = f"{BASE_URL}{folder_path_api}/"
    response = requests.get(url)
    if response.status_code == 200:
        contents = response.json()
        files_to_download = [
            item for item in contents
            if not item.get("isContainer", False)
            and (not file_regex or re.match(file_regex, item["name"]))
        ]
        folders_to_process = [item for item in contents if item.get("isContainer", False)]
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrency) as executor:
            futures = [
                executor.submit(download_file, item["name"], folder_path_api, folder_path, failed_downloads_file)
                for item in files_to_download
            ]
            concurrent.futures.wait(futures)
        for folder in folders_to_process:
            process_folder(folder["name"], folder_path_api, folder_path, failed_downloads_file, max_concurrency, file_regex)
    else:
        print(f"Failed to fetch contents for folder {folder_path_api}: {response.status_code}")
        with open(failed_downloads_file, "a") as f:
            f.write(f"{url}\n")


def main(source_folder: str, local_base_dir: str = "downloaded_files", max_concurrency: int = 1, file_regex: Optional[str] = None):
    """Main function to start the recursive downloading process."""
    max_concurrency = min(max(max_concurrency, 1), 3)  # Clamp concurrency to between 1 and 3
    os.makedirs(local_base_dir, exist_ok=True)
    failed_downloads_file = os.path.join(local_base_dir, "failed_downloads.txt")
    process_folder(source_folder, "", local_base_dir, failed_downloads_file, max_concurrency, file_regex)


if __name__ == "__main__":
    typer.run(main)
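
The script depends on requests, typer, and tqdm, all available from PyPI. A minimal usage sketch follows, assuming the script is saved as sciduct_download.py and that some/public/folder is a readable public path (both hypothetical); typer exposes main's parameters as one positional argument plus --local-base-dir, --max-concurrency, and --file-regex options:

# Hypothetical command-line invocation:
#   python sciduct_download.py some/public/folder --max-concurrency 2 --file-regex '.*\.json$'

# Equivalent programmatic call, bypassing the CLI:
from sciduct_download import main

main("some/public/folder", local_base_dir="downloaded_files", max_concurrency=2, file_regex=r".*\.json$")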