@mal1kc
Created January 18, 2025 15:43
static website_downloader: a small script that downloads a page's images, stylesheets, scripts, and same-domain HTML links into a local folder tree, fetching them in parallel.
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urldefrag, urljoin, urlparse
from concurrent import futures as cfutures


def download_file(url, base_folder):
    try:
        response = requests.get(url, timeout=30)  # Timeout so a stalled server can't hang a worker
        response.raise_for_status()  # Check for HTTP errors

        # Map the URL onto a local path; fall back to index.html for
        # directory-style URLs whose path is empty or ends with "/"
        parsed_url = urlparse(url)
        path = parsed_url.path.lstrip("/")
        if not path or path.endswith("/"):
            path += "index.html"
        file_path = os.path.join(base_folder, parsed_url.netloc, path)

        # Create the directory if it doesn't exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        with open(file_path, "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {file_path}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")


def download_static_files(base_url, base_folder="downloaded_site"):
    # Create a base folder to store downloaded files
    os.makedirs(base_folder, exist_ok=True)

    # Get the HTML content of the base URL
    try:
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect static files (images, CSS, JS) and HTML pages; sets
        # deduplicate URLs so two threads never write the same file at once
        static_files = set()
        html_pages = set()

        # Find images (skip <img> tags without a src attribute)
        for img in soup.find_all("img"):
            if img.get("src"):
                static_files.add(urljoin(base_url, img["src"]))

        # Find CSS files
        for link in soup.find_all("link", rel="stylesheet"):
            if link.get("href"):
                static_files.add(urljoin(base_url, link["href"]))

        # Find JavaScript files (skip inline <script> blocks with no src)
        for script in soup.find_all("script"):
            if script.get("src"):
                static_files.add(urljoin(base_url, script["src"]))

        # Find other HTML pages, dropping #fragments so in-page anchors
        # don't schedule duplicate downloads of the same page
        for a in soup.find_all("a", href=True):
            page_url, _ = urldefrag(urljoin(base_url, a["href"]))
            if urlparse(page_url).netloc == urlparse(base_url).netloc:  # Same domain
                html_pages.add(page_url)

        # Download all static files and HTML pages in parallel
        with cfutures.ThreadPoolExecutor() as executor:
            result_futures = [
                executor.submit(download_file, url, base_folder)
                for url in static_files | html_pages
            ]
            for future in cfutures.as_completed(result_futures):
                try:
                    future.result()  # Surfaces any exception a worker raised unexpectedly
                except Exception as e:
                    print(f"Error occurred: {e}")
    except Exception as e:
        print(f"Failed to retrieve {base_url}: {e}")


if __name__ == "__main__":
    base_url = input("Enter the base website URL: ")
    download_static_files(base_url)
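
To try it out, run the file and enter a URL at the prompt, or call the entry point from another module. A minimal usage sketch; the filename static_website_downloader.py and the example URL are placeholders, not part of the gist:

$ python static_website_downloader.py
Enter the base website URL: https://example.com

# Or programmatically (module name assumed, not given in the gist):
from static_website_downloader import download_static_files
download_static_files("https://example.com", base_folder="example_dump")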