Created
July 12, 2024 02:09
-
-
Save meetox80/81dc4449a9f5382b9861a912e8d8f357 to your computer and use it in GitHub Desktop.
abbottabad-compound scraper (alternative to downloading the whole archive as a zip file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Root of the CIA Abbottabad Compound document archive.
base_url = "https://www.cia.gov/library/abbottabad-compound/"

# Index pages for each media category hosted in the archive.
urls = [
    base_url + f"index_{category}.html"
    for category in ("images", "converted_documents", "audio", "video")
]
def download_files_from_url(url, save_dir):
    """Download every file linked from the HTML table on *url* into *save_dir*.

    Each table row is expected to contain at least two cells, the second
    holding an anchor whose href points at the file. Rows without a usable
    link are skipped. Files that the server refuses with 403 are skipped
    with a message; any other HTTP error aborts this page.

    Args:
        url: Index page listing the files in an HTML table.
        save_dir: Directory to write the files into (created if missing).

    Returns:
        Total number of bytes downloaded, or 0 if the page itself could
        not be fetched or parsed.
    """
    try:
        # Timeout so a stalled server cannot hang the worker thread forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        table_rows = soup.find_all('tr')
        total_size = 0
        # Create the target directory once, not per file.
        os.makedirs(save_dir, exist_ok=True)
        for row in tqdm(table_rows[1:], desc=f"Downloading files from {url}"):  # Skipping the header row
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            anchor = cells[1].find('a')
            # Guard against rows whose second cell has no link at all;
            # the original `cells[1].find('a')['href']` raised TypeError here.
            if anchor is None or not anchor.get('href'):
                continue
            file_link = anchor['href']
            # urljoin resolves "./x" and "x" hrefs correctly; the previous
            # `base_url + file_link[1:]` produced a double slash and broke
            # on hrefs that did not start with ".".
            file_url = urljoin(base_url, file_link)
            file_name = file_link.split('/')[-1]
            try:
                file_response = requests.get(file_url, stream=True, timeout=30)
                file_response.raise_for_status()
                # May be 0 if the server omits Content-Length; the bar then
                # simply has no known total.
                file_size = int(file_response.headers.get('content-length', 0))
                total_size += file_size
                file_path = os.path.join(save_dir, file_name)
                with open(file_path, 'wb') as file, tqdm(
                    desc=file_name,
                    total=file_size,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    leave=False
                ) as bar:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        file.write(chunk)
                        bar.update(len(chunk))
            except requests.exceptions.HTTPError as e:
                # Some archive entries are deliberately blocked; skip them
                # rather than losing the whole page.
                if e.response.status_code == 403:
                    print(f"Skipping {file_name} due to 403 Forbidden error.")
                    continue
                else:
                    raise e
        return total_size
    except Exception as e:
        # Best-effort per page: report and let the other pages continue.
        print(f"An error occurred while downloading files from {url}: {e}")
        return 0
def handle_download(url):
    """Download one index page's files into a per-page subdirectory of ./bin.

    The subdirectory is named after the index page's basename, so each
    media category lands in its own folder. Returns the total bytes
    downloaded for that page.
    """
    page_name = os.path.basename(url)
    save_dir = "./bin/" + page_name
    return download_files_from_url(url, save_dir)
# Entry point: fetch all four index pages concurrently and report the total.
# Guarded so importing this module does not immediately start downloading;
# running the file as a script behaves exactly as before.
if __name__ == "__main__":
    total_downloaded_size = 0
    # One thread per index page; the GIL is released during network I/O,
    # so the four pages download in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(urls)) as executor:
        futures = {executor.submit(handle_download, url): url for url in urls}
        for future in concurrent.futures.as_completed(futures):
            # .result() re-raises any exception from the worker thread.
            total_downloaded_size += future.result()
    total_downloaded_size_gb = total_downloaded_size / (1024 ** 3)
    print(f"Total downloaded size: {total_downloaded_size_gb:.2f} GB")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment