Created November 21, 2025 01:08
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "hishel[httpx]",
#     "tqdm",
# ]
# ///
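
# Download every Discord CDN attachment referenced in a Discord data export:
# scan the export's JSON files for cdn.discordapp.com/attachments URLs, then
# fetch them in parallel with retries and an on-disk HTTP cache (hishel + SQLite).
# Edit the configuration constants below, then run this file directly; the
# shebang hands it to `uv run --script`, which installs the dependencies above.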
import asyncio
import re
from pathlib import Path

import httpx
from hishel import AsyncSqliteStorage
from hishel.httpx import AsyncCacheClient
from tqdm.asyncio import tqdm as async_tqdm
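
# --- Configuration: edit these before running ---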
DISCORD_DATA_DIR = '/path/to/your/unzipped/discord/data/package/dir/with/json/files'
OUTPUT_DIR = './discord_cached_images'
CACHE_DIR = './http_cache'
MAX_CONCURRENT_DOWNLOADS = 10
RETRY_ATTEMPTS = 3
RETRY_DELAY = 2.0  # seconds between retry attempts
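
# Attachment links in the export JSON all point at cdn.discordapp.com/attachments/;
# match up to the first quote or whitespace character so any query string is kept.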
DISCORD_CDN_REGEX = re.compile(r'https://cdn\.discordapp\.com/attachments/[^"\s]+')

def extract_urls_from_json_file(filepath: Path) -> set[str]:
    """Return all Discord CDN attachment URLs found in one JSON file."""
    urls: set[str] = set()
    try:
        # Discord data exports are UTF-8 encoded JSON.
        content = filepath.read_text(encoding='utf-8')
        urls.update(DISCORD_CDN_REGEX.findall(content))
    except Exception as e:
        print(f'Error processing file {filepath}: {e}')
    return urls
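
# Fetch one URL and save it under output_dir. Returns the saved path, or None
# when the download ultimately fails (including 404s, which are never retried).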
async def download_image_async(
    client: AsyncCacheClient,
    url: str,
    output_dir: Path,
    retries: int,
    delay: float,
    semaphore: asyncio.Semaphore,
) -> Path | None:
    filename = url.split('/')[-1].split('?')[0]
    # Truncate over-long names (preserving the extension) so the whole
    # filename stays within the common 255-character filesystem limit.
    if len(filename) > 255:
        name, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        name = name[: 255 - len(ext) - 1] if ext else name[:255]
        filename = f'{name}.{ext}' if ext else name
    filepath = output_dir / filename
    if filepath.exists():
        return filepath  # already downloaded on a previous run
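    # Limit concurrency across all downloads; retry transient failures with a
    # fixed delay between attempts.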
    async with semaphore:
        for attempt in range(retries):
            try:
                response = await client.get(url, timeout=30.0)
                response.raise_for_status()
                filepath.parent.mkdir(parents=True, exist_ok=True)
                filepath.write_bytes(response.content)
                return filepath  # noqa: TRY300
            except httpx.HTTPStatusError as e:
                if e.response.status_code == 404:
                    return None  # don't retry 404s
                print(f'\nHTTP error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            except httpx.RequestError as e:
                print(f'\nRequest error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            except Exception as e:
                print(f'\nUnexpected error for {filename} (attempt {attempt + 1}/{retries}): {e}')
            if attempt < retries - 1:
                await asyncio.sleep(delay)
    return None
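
# Orchestrate the pipeline: collect the export's JSON files, extract the unique
# URLs, then download everything concurrently through the caching client.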
async def main() -> None:
    discord_data_path = Path(DISCORD_DATA_DIR)
    output_path = Path(OUTPUT_DIR)
    cache_path = Path(CACHE_DIR)
    output_path.mkdir(parents=True, exist_ok=True)
    cache_path.mkdir(parents=True, exist_ok=True)
    print(f'Output directory: {output_path.resolve()}')
    print(f'HTTP cache directory: {cache_path.resolve()}')

    all_json_files = list(discord_data_path.rglob('*.json'))
    if not all_json_files:
        print(f"No JSON files found in '{discord_data_path.resolve()}'. Please check the path.")
        return
    print(f'Found {len(all_json_files)} JSON files. Extracting URLs...')

    unique_urls: set[str] = set()
    for json_file in async_tqdm(all_json_files, desc='Extracting URLs'):
        unique_urls.update(extract_urls_from_json_file(json_file))
    if not unique_urls:
        print('No Discord CDN image URLs found in the provided JSON files.')
        return
    print(f'Found {len(unique_urls)} unique Discord CDN URLs. Starting parallel download with caching...')
    # print('URLs:')
    # for url in unique_urls:
    #     print(f'URL: {url}')

    downloaded_count = 0
    failed_count = 0
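    # hishel persists responses in SQLite, so interrupted or repeated runs can
    # be served from the local cache (where the CDN's cache headers allow it)
    # instead of re-hitting Discord.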
    storage = AsyncSqliteStorage(database_path=str(cache_path / 'cache.db'))
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
    async with AsyncCacheClient(storage=storage) as client:
        tasks = [
            download_image_async(client, url, output_path, RETRY_ATTEMPTS, RETRY_DELAY, semaphore)
            for url in unique_urls
        ]
        for future in async_tqdm(asyncio.as_completed(tasks), total=len(tasks), desc='Downloading images'):
            result = await future
            if result:
                downloaded_count += 1
            else:
                failed_count += 1

    print('\n--- Download Summary ---')
    print(f'Total URLs processed: {len(unique_urls)}')
    print(f'Successfully downloaded (or retrieved from cache): {downloaded_count}')
    print(f'Failed downloads (including 404s): {failed_count}')
    print(f'Images saved to: {output_path.resolve()}')
    print(f'HTTP cache stored in: {cache_path.resolve()}')


if __name__ == '__main__':
    asyncio.run(main())