static website_downloader
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent import futures as cfutures
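# requests and BeautifulSoup are third-party packages:
#   pip install requests beautifulsoup4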
def download_file(url, base_folder):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise on HTTP errors (4xx/5xx)

        # Map the URL onto a local path: <base_folder>/<host>/<path>
        parsed_url = urlparse(url)
        local_path = parsed_url.path.lstrip("/")
        if not local_path or local_path.endswith("/"):
            # Directory-style URLs (e.g. "/" or "/docs/") can't be opened as
            # files, so store them as an index.html inside that directory
            local_path += "index.html"
        file_path = os.path.join(base_folder, parsed_url.netloc, local_path)

        # Create the directory if it doesn't exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        with open(file_path, "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {file_path}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
def download_static_files(base_url, base_folder="downloaded_site"):
    # Create a base folder to store downloaded files
    os.makedirs(base_folder, exist_ok=True)

    # Get the HTML content of the base URL
    try:
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect static files (images, CSS, JS) and same-domain HTML pages;
        # sets deduplicate assets referenced more than once
        static_files = set()
        html_pages = set()

        # Find images (skip <img> tags without a src attribute)
        for img in soup.find_all("img"):
            src = img.get("src")
            if src:
                static_files.add(urljoin(base_url, src))

        # Find CSS files
        for link in soup.find_all("link", rel="stylesheet"):
            href = link.get("href")
            if href:
                static_files.add(urljoin(base_url, href))

        # Find JavaScript files (inline <script> tags have no src)
        for script in soup.find_all("script"):
            src = script.get("src")
            if src:
                static_files.add(urljoin(base_url, src))

        # Find other HTML pages on the same domain
        for a in soup.find_all("a", href=True):
            page_url = urljoin(base_url, a["href"])
            if urlparse(page_url).netloc == urlparse(base_url).netloc:
                html_pages.add(page_url)

        # Download all static files and HTML pages concurrently
        with cfutures.ThreadPoolExecutor() as executor:
            result_futures = [
                executor.submit(download_file, url, base_folder)
                for url in static_files | html_pages
            ]
            for future in cfutures.as_completed(result_futures):
                try:
                    future.result()  # Re-raise any exception from the worker
                except Exception as e:
                    print(f"Error occurred: {e}")
    except Exception as e:
        print(f"Failed to retrieve {base_url}: {e}")
if __name__ == "__main__":
    base_url = input("Enter the base website URL: ")
    download_static_files(base_url)
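A minimal usage sketch besides the interactive prompt, assuming the gist is saved as website_downloader.py (the module name is an assumption, not part of the gist):

from website_downloader import download_static_files  # hypothetical module name

# Mirror one page's assets and same-domain links into a custom folder
download_static_files("https://example.com", base_folder="example_mirror")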