Created
July 12, 2024 02:09
-
-
Save meetox80/81dc4449a9f5382b9861a912e8d8f357 to your computer and use it in GitHub Desktop.
abbottabad-compound scraper (alternative to downloading the whole archive as a zip file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Root of the CIA Abbottabad Compound document archive.
base_url = "https://www.cia.gov/library/abbottabad-compound/"

# Index pages for each media category hosted in the archive.
urls = [
    base_url + f"index_{category}.html"
    for category in ("images", "converted_documents", "audio", "video")
]
def download_files_from_url(url, save_dir):
    """Download every file linked from the HTML table on *url* into *save_dir*.

    Each table row is expected to contain at least two cells, the second
    holding an anchor whose href points at the file. Rows without a usable
    link are skipped. Files that the server refuses with 403 are skipped
    with a message; any other HTTP error aborts this page.

    Args:
        url: Index page listing the files in an HTML table.
        save_dir: Directory to write the files into (created if missing).

    Returns:
        Total number of bytes downloaded, or 0 if the page itself could
        not be fetched or parsed.
    """
    try:
        # Timeout so a stalled server cannot hang the worker thread forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        table_rows = soup.find_all('tr')
        total_size = 0
        # Create the target directory once, not per file.
        os.makedirs(save_dir, exist_ok=True)
        for row in tqdm(table_rows[1:], desc=f"Downloading files from {url}"):  # Skipping the header row
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            anchor = cells[1].find('a')
            # Guard against rows whose second cell has no link at all;
            # the original `cells[1].find('a')['href']` raised TypeError here.
            if anchor is None or not anchor.get('href'):
                continue
            file_link = anchor['href']
            # urljoin resolves "./x" and "x" hrefs correctly; the previous
            # `base_url + file_link[1:]` produced a double slash and broke
            # on hrefs that did not start with ".".
            file_url = urljoin(base_url, file_link)
            file_name = file_link.split('/')[-1]
            try:
                file_response = requests.get(file_url, stream=True, timeout=30)
                file_response.raise_for_status()
                # May be 0 if the server omits Content-Length; the bar then
                # simply has no known total.
                file_size = int(file_response.headers.get('content-length', 0))
                total_size += file_size
                file_path = os.path.join(save_dir, file_name)
                with open(file_path, 'wb') as file, tqdm(
                    desc=file_name,
                    total=file_size,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    leave=False
                ) as bar:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        file.write(chunk)
                        bar.update(len(chunk))
            except requests.exceptions.HTTPError as e:
                # Some archive entries are deliberately blocked; skip them
                # rather than losing the whole page.
                if e.response.status_code == 403:
                    print(f"Skipping {file_name} due to 403 Forbidden error.")
                    continue
                else:
                    raise e
        return total_size
    except Exception as e:
        # Best-effort per page: report and let the other pages continue.
        print(f"An error occurred while downloading files from {url}: {e}")
        return 0
def handle_download(url):
    """Download one index page's files into a per-page subdirectory of ./bin.

    The subdirectory is named after the index page's basename, so each
    media category lands in its own folder. Returns the total bytes
    downloaded for that page.
    """
    page_name = os.path.basename(url)
    save_dir = "./bin/" + page_name
    return download_files_from_url(url, save_dir)
# Entry point: fetch all four index pages concurrently and report the total.
# Guarded so importing this module does not immediately start downloading;
# running the file as a script behaves exactly as before.
if __name__ == "__main__":
    total_downloaded_size = 0
    # One thread per index page; the GIL is released during network I/O,
    # so the four pages download in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(urls)) as executor:
        futures = {executor.submit(handle_download, url): url for url in urls}
        for future in concurrent.futures.as_completed(futures):
            # .result() re-raises any exception from the worker thread.
            total_downloaded_size += future.result()
    total_downloaded_size_gb = total_downloaded_size / (1024 ** 3)
    print(f"Total downloaded size: {total_downloaded_size_gb:.2f} GB")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment