Skip to content

Instantly share code, notes, and snippets.

@meetox80
Created July 12, 2024 02:09
Show Gist options
  • Save meetox80/81dc4449a9f5382b9861a912e8d8f357 to your computer and use it in GitHub Desktop.
Save meetox80/81dc4449a9f5382b9861a912e8d8f357 to your computer and use it in GitHub Desktop.
abbottabad-compound scraper (alternative to downloading the full zip archive)
import requests
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import concurrent.futures
# Root of the CIA abbottabad-compound archive; all file links resolve against it.
base_url = "https://www.cia.gov/library/abbottabad-compound/"

# Index pages to scrape, one per media category.
urls = [
    base_url + "index_images.html",
    base_url + "index_converted_documents.html",
    base_url + "index_audio.html",
    base_url + "index_video.html",
]
def download_files_from_url(url, save_dir):
    """Scrape an archive index page and download every file it links to.

    Parameters
    ----------
    url : str
        Index page whose table rows each hold a link to a downloadable file.
    save_dir : str
        Directory to save files into; created if it does not exist.

    Returns
    -------
    int
        Total bytes downloaded per the Content-Length headers, or 0 if the
        index page itself could not be processed.
    """
    try:
        # Timeout so a stalled server cannot hang the worker thread forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        table_rows = soup.find_all('tr')

        # Create the target directory once, not on every loop iteration.
        os.makedirs(save_dir, exist_ok=True)

        total_size = 0
        for row in tqdm(table_rows[1:], desc=f"Downloading files from {url}"):  # skip header row
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            link = cells[1].find('a')
            if link is None or not link.get('href'):
                # Malformed row with no download link; skip it instead of
                # letting a TypeError abort the whole page.
                continue
            file_link = link['href']
            # Links are site-relative ("./path/file"); drop the leading "."
            # and resolve against the archive root.
            file_url = base_url + file_link[1:]
            file_name = file_link.split('/')[-1]
            try:
                # Use the response as a context manager so the streamed
                # connection is always released, even on mid-body errors.
                with requests.get(file_url, stream=True, timeout=30) as file_response:
                    file_response.raise_for_status()
                    file_size = int(file_response.headers.get('content-length', 0))
                    total_size += file_size
                    file_path = os.path.join(save_dir, file_name)
                    with open(file_path, 'wb') as file, tqdm(
                        desc=file_name,
                        total=file_size,
                        unit='B',
                        unit_scale=True,
                        unit_divisor=1024,
                        leave=False
                    ) as bar:
                        for chunk in file_response.iter_content(chunk_size=8192):
                            file.write(chunk)
                            bar.update(len(chunk))
            except requests.exceptions.HTTPError as e:
                # Some archive entries are access-restricted; skip those and
                # keep going rather than abandoning the page.
                if e.response.status_code == 403:
                    print(f"Skipping {file_name} due to 403 Forbidden error.")
                    continue
                raise
        return total_size
    except Exception as e:
        # Best-effort: report the failure and move on so one bad index page
        # does not abort the other workers' pages.
        print(f"An error occurred while downloading files from {url}: {e}")
        return 0
def handle_download(url):
    """Download one index page's files into a per-page folder under ./bin."""
    page_name = os.path.basename(url)
    return download_files_from_url(url, "./bin/" + page_name)
# Fan out one worker thread per index page and sum the bytes each one fetched.
with concurrent.futures.ThreadPoolExecutor(max_workers=len(urls)) as executor:
    pending = [executor.submit(handle_download, page) for page in urls]
    total_downloaded_size = sum(
        done.result() for done in concurrent.futures.as_completed(pending)
    )

# Report the grand total in gigabytes.
total_downloaded_size_gb = total_downloaded_size / (1024 ** 3)
print(f"Total downloaded size: {total_downloaded_size_gb:.2f} GB")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment