Myrient Scraper, a simple web scraper to download games (ROMs) from Myrient (https://myrient.erista.me); tweak the variables at the top to change the region and game type.
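The script depends on requests, beautifulsoup4, and colorlog; install them with pip install requests beautifulsoup4 colorlog before running.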
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote
import logging
import colorlog
from concurrent.futures import ThreadPoolExecutor
from time import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Logging setup
handler = colorlog.StreamHandler()
formatter = colorlog.ColoredFormatter(
    "%(log_color)s%(asctime)s - %(levelname)s - %(message)s",
    log_colors={
        'DEBUG': 'white',
        'INFO': 'cyan',
        'WARNING': 'yellow',
        'ERROR': 'red',
        'CRITICAL': 'bold_red',
    }
)
handler.setFormatter(formatter)
logger = colorlog.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
# Session setup
session = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retries)
session.mount("https://", adapter)
session.mount("http://", adapter)
# Configuration
url = "https://myrient.erista.me/files/No-Intro/Nintendo%20-%20Super%20Nintendo%20Entertainment%20System/"
search_extension = ".zip"
region_check = True
region = "Europe"
world = True
chunk_size = 8192 * 4
download_dir = "out-snes/"
os.makedirs(download_dir, exist_ok=True)
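# Example tweak: to fetch USA-region Game Boy Advance ROMs instead, point
# `url` at the matching No-Intro directory and adjust the region filter.
# The directory name below is an assumption about Myrient's layout; check
# the site's No-Intro listing for the exact path.
#   url = "https://myrient.erista.me/files/No-Intro/Nintendo%20-%20Game%20Boy%20Advance/"
#   region = "USA"
#   download_dir = "out-gba/"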
# Pre-load completed files
completed_files = set(os.listdir(download_dir))
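# Note: any file already present in download_dir counts as completed, so a
# partially written archive from an interrupted run is skipped too; delete
# incomplete files before re-running.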
def fetch_links(target_url):
    """Fetch links from the given URL."""
    try:
        logger.info(f"🌐 Fetching page: {target_url}")
        response = session.get(target_url, timeout=30)  # don't hang forever on a stalled connection
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return [
            urljoin(target_url, unquote(a_tag["href"]))
            for a_tag in soup.find_all("a", href=True)
            if a_tag["href"].endswith(search_extension)
        ]
    except requests.exceptions.RequestException as e:
        logger.error(f"❌ Error fetching page: {e}")
        return []
def download_file(file_url):  # renamed from `url` to avoid shadowing the module-level `url`
    """Download a single file."""
    local_filename = os.path.join(download_dir, os.path.basename(file_url))
    if os.path.basename(local_filename) in completed_files:
        logger.info(f"✅ Already downloaded: {local_filename}")
        return
    if region_check and (region not in local_filename and not ("(World)" in local_filename and world)):
        logger.info(f"⛔ Skipping file: {local_filename}, not for region '{region}'")
        return
    try:
        logger.info(f"⬇️ Starting download: {file_url}")
        with session.get(file_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size):
                    if chunk:
                        f.write(chunk)
        logger.info(f"✅ Download completed: {local_filename}")
        completed_files.add(os.path.basename(local_filename))
    except requests.exceptions.RequestException as e:
        logger.error(f"❌ Error downloading file {file_url}: {e}")
def main():
    start_time = time()
    try:
        # Fetch all links
        links = fetch_links(url)
        if not links:
            logger.warning("⚠️ No links found!")
            return
        logger.info(f"🔗 Found {len(links)} links to process.")
        with ThreadPoolExecutor(max_workers=8) as executor:
            executor.map(download_file, links)
    except KeyboardInterrupt:
        logger.warning("❌ Download interrupted by user.")
    end_time = time()
    elapsed_time = end_time - start_time
    logger.info(f"⏱️ Finished in {elapsed_time:.2f} seconds.")
if __name__ == "__main__":
    main()
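Because fetch_links and download_file are plain module-level functions, the gist can also be driven from another script. A minimal dry-run sketch, assuming the gist is saved as myrient_scraper.py (a hypothetical filename), that lists matching files without downloading anything:

import os
import myrient_scraper  # hypothetical module name; save the gist under this filename

# fetch_links returns the full download URLs; print just the archive names
for link in myrient_scraper.fetch_links(myrient_scraper.url):
    print(os.path.basename(link))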