static website_downloader
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent import futures as cfutures
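# requests and BeautifulSoup are third-party packages:
#   pip install requests beautifulsoup4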
def download_file(url, base_folder):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise on HTTP errors (4xx/5xx)

        # Map the URL onto a local path: <base_folder>/<host>/<path>
        parsed_url = urlparse(url)
        local_path = parsed_url.path.lstrip("/")
        if not local_path or local_path.endswith("/"):
            # Directory-style URLs (e.g. "/" or "/docs/") can't be opened as
            # files, so store them as an index.html inside that directory
            local_path += "index.html"
        file_path = os.path.join(base_folder, parsed_url.netloc, local_path)

        # Create the directory if it doesn't exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        with open(file_path, "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {file_path}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
def download_static_files(base_url, base_folder="downloaded_site"):
    # Create a base folder to store downloaded files
    os.makedirs(base_folder, exist_ok=True)

    # Get the HTML content of the base URL
    try:
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect static files (images, CSS, JS) and same-domain HTML pages;
        # sets deduplicate assets referenced more than once
        static_files = set()
        html_pages = set()

        # Find images (skip <img> tags without a src attribute)
        for img in soup.find_all("img"):
            src = img.get("src")
            if src:
                static_files.add(urljoin(base_url, src))

        # Find CSS files
        for link in soup.find_all("link", rel="stylesheet"):
            href = link.get("href")
            if href:
                static_files.add(urljoin(base_url, href))

        # Find JavaScript files (inline <script> tags have no src)
        for script in soup.find_all("script"):
            src = script.get("src")
            if src:
                static_files.add(urljoin(base_url, src))

        # Find other HTML pages on the same domain
        for a in soup.find_all("a", href=True):
            page_url = urljoin(base_url, a["href"])
            if urlparse(page_url).netloc == urlparse(base_url).netloc:
                html_pages.add(page_url)

        # Download all static files and HTML pages concurrently
        with cfutures.ThreadPoolExecutor() as executor:
            result_futures = [
                executor.submit(download_file, url, base_folder)
                for url in static_files | html_pages
            ]
            for future in cfutures.as_completed(result_futures):
                try:
                    future.result()  # Re-raise any exception from the worker
                except Exception as e:
                    print(f"Error occurred: {e}")
    except Exception as e:
        print(f"Failed to retrieve {base_url}: {e}")
if __name__ == "__main__":
    base_url = input("Enter the base website URL: ")
    download_static_files(base_url)
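A minimal usage sketch besides the interactive prompt, assuming the gist is saved as website_downloader.py (the module name is an assumption, not part of the gist):

from website_downloader import download_static_files  # hypothetical module name

# Mirror one page's assets and same-domain links into a custom folder
download_static_files("https://example.com", base_folder="example_mirror")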