Created November 21, 2025 01:08
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "hishel[httpx]",
#     "tqdm",
# ]
# ///
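
# Download every Discord CDN attachment referenced in a Discord data export:
# scan the export's JSON files for cdn.discordapp.com/attachments URLs, then
# fetch them in parallel with retries and an on-disk HTTP cache (hishel + SQLite).
# Edit the configuration constants below, then run this file directly; the
# shebang hands it to `uv run --script`, which installs the dependencies above.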
import asyncio
import re
from pathlib import Path

import httpx
from hishel import AsyncSqliteStorage
from hishel.httpx import AsyncCacheClient
from tqdm.asyncio import tqdm as async_tqdm
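
# --- Configuration: edit these before running ---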
DISCORD_DATA_DIR = '/path/to/your/unzipped/discord/data/package/dir/with/json/files'
OUTPUT_DIR = './discord_cached_images'
CACHE_DIR = './http_cache'
MAX_CONCURRENT_DOWNLOADS = 10
RETRY_ATTEMPTS = 3
RETRY_DELAY = 2.0  # seconds between retry attempts
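
# Attachment links in the export JSON all point at cdn.discordapp.com/attachments/;
# match up to the first quote or whitespace character so any query string is kept.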
DISCORD_CDN_REGEX = re.compile(r'https://cdn\.discordapp\.com/attachments/[^"\s]+')

def extract_urls_from_json_file(filepath: Path) -> set[str]:
    """Return all Discord CDN attachment URLs found in one JSON file."""
    urls: set[str] = set()
    try:
        # Discord data exports are UTF-8 encoded JSON.
        content = filepath.read_text(encoding='utf-8')
        urls.update(DISCORD_CDN_REGEX.findall(content))
    except Exception as e:
        print(f'Error processing file {filepath}: {e}')
    return urls
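
# Fetch one URL and save it under output_dir. Returns the saved path, or None
# when the download ultimately fails (including 404s, which are never retried).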
async def download_image_async(
    client: AsyncCacheClient,
    url: str,
    output_dir: Path,
    retries: int,
    delay: float,
    semaphore: asyncio.Semaphore,
) -> Path | None:
    filename = url.split('/')[-1].split('?')[0]
    # Truncate over-long names (preserving the extension) so the whole
    # filename stays within the common 255-character filesystem limit.
    if len(filename) > 255:
        name, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        name = name[: 255 - len(ext) - 1] if ext else name[:255]
        filename = f'{name}.{ext}' if ext else name
    filepath = output_dir / filename
    if filepath.exists():
        return filepath  # already downloaded on a previous run
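    # Limit concurrency across all downloads; retry transient failures with a
    # fixed delay between attempts.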
    async with semaphore:
        for attempt in range(retries):
            try:
                response = await client.get(url, timeout=30.0)
                response.raise_for_status()
                filepath.parent.mkdir(parents=True, exist_ok=True)
                filepath.write_bytes(response.content)
                return filepath  # noqa: TRY300
            except httpx.HTTPStatusError as e:
                if e.response.status_code == 404:
                    return None  # don't retry 404s
                print(f'\nHTTP error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            except httpx.RequestError as e:
                print(f'\nRequest error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            except Exception as e:
                print(f'\nUnexpected error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            if attempt < retries - 1:
                await asyncio.sleep(delay)
    return None
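
# Orchestrate the pipeline: collect the export's JSON files, extract the unique
# URLs, then download everything concurrently through the caching client.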
async def main() -> None:
    discord_data_path = Path(DISCORD_DATA_DIR)
    output_path = Path(OUTPUT_DIR)
    cache_path = Path(CACHE_DIR)
    output_path.mkdir(parents=True, exist_ok=True)
    cache_path.mkdir(parents=True, exist_ok=True)
    print(f'Output directory: {output_path.resolve()}')
    print(f'HTTP cache directory: {cache_path.resolve()}')

    all_json_files = list(discord_data_path.rglob('*.json'))
    if not all_json_files:
        print(f"No JSON files found in '{discord_data_path.resolve()}'. Please check the path.")
        return
    print(f'Found {len(all_json_files)} JSON files. Extracting URLs...')

    unique_urls: set[str] = set()
    for json_file in async_tqdm(all_json_files, desc='Extracting URLs'):
        unique_urls.update(extract_urls_from_json_file(json_file))
    if not unique_urls:
        print('No Discord CDN image URLs found in the provided JSON files.')
        return
    print(f'Found {len(unique_urls)} unique Discord CDN URLs. Starting parallel download with caching...')
    # print('URLs:')
    # for url in unique_urls:
    #     print(f'URL: {url}')

    downloaded_count = 0
    failed_count = 0
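    # hishel persists responses in SQLite, so interrupted or repeated runs can
    # be served from the local cache (where the CDN's cache headers allow it)
    # instead of re-hitting Discord.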
    storage = AsyncSqliteStorage(database_path=str(cache_path / 'cache.db'))
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
    async with AsyncCacheClient(storage=storage) as client:
        tasks = [
            download_image_async(client, url, output_path, RETRY_ATTEMPTS, RETRY_DELAY, semaphore)
            for url in unique_urls
        ]
        for future in async_tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Downloading images'):
            result = await future
            if result:
                downloaded_count += 1
            else:
                failed_count += 1

    print('\n--- Download Summary ---')
    print(f'Total URLs processed: {len(unique_urls)}')
    print(f'Successfully downloaded (or retrieved from cache): {downloaded_count}')
    print(f'Failed downloads (including 404s): {failed_count}')
    print(f'Images saved to: {output_path.resolve()}')
    print(f'HTTP cache stored in: {cache_path.resolve()}')


if __name__ == '__main__':
    asyncio.run(main())