@Alchemyst0x · Created November 21, 2025 01:08
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
#   "httpx",  # imported directly below, so declared explicitly
#   "hishel[httpx]",
#   "tqdm",
# ]
# ///
import asyncio
import re
from pathlib import Path

import httpx
from hishel import AsyncSqliteStorage
from hishel.httpx import AsyncCacheClient
from tqdm.asyncio import tqdm as async_tqdm

# --- Configuration ---
DISCORD_DATA_DIR = '/path/to/your/unzipped/discord/data/package/dir/with/json/files'
OUTPUT_DIR = './discord_cached_images'
CACHE_DIR = './http_cache'
MAX_CONCURRENT_DOWNLOADS = 10
RETRY_ATTEMPTS = 3
RETRY_DELAY = 2.0

# Matches attachment URLs on Discord's CDN, stopping at quotes or whitespace.
DISCORD_CDN_REGEX = re.compile(r'https://cdn\.discordapp\.com/attachments/[^"\s]+')

async def extract_urls_from_json_file(filepath: Path) -> set[str]:
    """Scan one exported JSON file for Discord CDN attachment URLs."""
    urls: set[str] = set()
    try:
        content = filepath.read_text(encoding='utf-8')
        urls.update(DISCORD_CDN_REGEX.findall(content))
    except Exception as e:
        print(f'Error processing file {filepath}: {e}')
    return urls

async def download_image_async(
    client: AsyncCacheClient,
    url: str,
    output_dir: Path,
    retries: int,
    delay: float,
    semaphore: asyncio.Semaphore,
) -> Path | None:
    """Download a single attachment, retrying transient failures."""
    # Name the file after the last URL path segment (query string stripped),
    # prefixed with the attachment's snowflake ID so common basenames such as
    # 'image.png' from different messages don't overwrite each other.
    path_parts = url.split('?', 1)[0].split('/')
    filename = path_parts[-1]
    if len(path_parts) >= 2 and path_parts[-2].isdigit():
        filename = f'{path_parts[-2]}_{filename}'
    # Truncate over-long names to avoid filesystem limits (commonly 255 bytes).
    if len(filename) > 255:
        name, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        if len(name) > 250:
            name = name[:250]
        filename = f'{name}.{ext}' if ext else name
    filepath = output_dir / filename

    # Skip anything already downloaded on a previous run.
    if filepath.exists():
        return filepath

    async with semaphore:
        for attempt in range(retries):
            try:
                response = await client.get(url, timeout=30.0)
                response.raise_for_status()
                filepath.parent.mkdir(parents=True, exist_ok=True)
                filepath.write_bytes(response.content)
                return filepath  # noqa: TRY300
            except httpx.HTTPStatusError as e:
                if e.response.status_code == 404:
                    return None  # Expired or deleted attachment; don't retry.
                print(f'\nHTTP error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            except httpx.RequestError as e:
                print(f'\nRequest error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            except Exception as e:
                print(f'\nUnexpected error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            if attempt < retries - 1:
                await asyncio.sleep(delay)
    return None

async def main() -> None:
    discord_data_path = Path(DISCORD_DATA_DIR)
    output_path = Path(OUTPUT_DIR)
    cache_path = Path(CACHE_DIR)
    output_path.mkdir(parents=True, exist_ok=True)
    cache_path.mkdir(parents=True, exist_ok=True)
    print(f'Output directory: {output_path.resolve()}')
    print(f'HTTP cache directory: {cache_path.resolve()}')

    all_json_files = list(discord_data_path.rglob('*.json'))
    if not all_json_files:
        print(f"No JSON files found in '{discord_data_path.resolve()}'. Please check the path.")
        return

    print(f'Found {len(all_json_files)} JSON files. Extracting URLs...')
    unique_urls: set[str] = set()
    for json_file in async_tqdm(all_json_files, desc='Extracting URLs'):
        unique_urls.update(await extract_urls_from_json_file(json_file))
    if not unique_urls:
        print('No Discord CDN image URLs found in the provided JSON files.')
        return

    print(f'Found {len(unique_urls)} unique Discord CDN URLs. Starting parallel download with caching...')
    downloaded_count = 0
    failed_count = 0
    # Responses are cached in SQLite, so interrupted runs can be restarted
    # without re-fetching everything from the CDN.
    storage = AsyncSqliteStorage(database_path=str(cache_path / 'cache.db'))
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
    async with AsyncCacheClient(storage=storage) as client:
        tasks = [
            download_image_async(client, url, output_path, RETRY_ATTEMPTS, RETRY_DELAY, semaphore)
            for url in unique_urls
        ]
        for future in async_tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Downloading images'):
            if await future:
                downloaded_count += 1
            else:
                failed_count += 1

    print('\n--- Download Summary ---')
    print(f'Total URLs processed: {len(unique_urls)}')
    print(f'Successfully downloaded (or retrieved from cache): {downloaded_count}')
    print(f'Failed downloads (including 404s): {failed_count}')
    print(f'Images saved to: {output_path.resolve()}')
    print(f'HTTP cache stored in: {cache_path.resolve()}')


if __name__ == '__main__':
    asyncio.run(main())