Created
October 27, 2024 05:51
-
-
Save pingzh/10ee68e3011accc23453280dc4560872 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import asyncio | |
import time | |
import httpx | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
from concurrent.futures import ThreadPoolExecutor, Executor, ProcessPoolExecutor | |
def get_links():
    """Scrape the UN population list page and return absolute URLs for
    every country article linked from the table (one per flag icon)."""
    countries_list = (
        "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
    )
    response = httpx.get(countries_list)
    soup = BeautifulSoup(response.text, "lxml")
    # Anchor immediately following each flag icon cell is the country link;
    # urljoin resolves the relative hrefs against the list page URL.
    return [
        urljoin(countries_list, anchor.get("href"))
        for anchor in soup.select("td .flagicon+ a")
    ]
def fetch_sync(args):
    """Download one page and save it as <last-path-segment>.html in the cwd.

    Args:
        args: ``(link, http_client)`` tuple. ``http_client`` is a shared
            ``httpx.Client`` (ThreadPoolExecutor case) or ``None``
            (ProcessPoolExecutor case: a Client is not picklable, so each
            call falls back to a one-shot ``httpx.get``).

    Download/write errors are printed rather than raised so one bad link
    does not abort the whole executor map (best-effort semantics).
    """
    # Unpack OUTSIDE the try: a malformed args tuple is a caller bug and
    # should raise loudly, not be swallowed by the best-effort handler.
    link, http_client = args
    try:
        if http_client is not None:
            response = http_client.get(link)
        else:  # ProcessPoolExecutor: no shared client available
            response = httpx.get(link)
        with open(link.split("/")[-1] + ".html", "wb") as f:
            f.write(response.content)
    except Exception as e:  # best-effort boundary: report and continue
        print(e)
async def fetch_async(
    link: str,
    http_client: "httpx.AsyncClient",
    semaphore: asyncio.Semaphore,
):
    """Download one page and save it as <last-path-segment>.html in the cwd.

    Args:
        link: Absolute URL to fetch.
        http_client: Shared AsyncClient (connection pooling across tasks).
        semaphore: Caps concurrent requests so the asyncio run matches the
            executor benchmarks' ``max_workers=10``.

    Errors are printed rather than raised, mirroring ``fetch_sync``; without
    this, a single failed link would propagate out of ``asyncio.gather`` and
    cancel the entire async benchmark, skewing the comparison.
    """
    try:
        async with semaphore:
            response = await http_client.get(link)
            # NOTE(review): blocking file write inside a coroutine — fine for
            # a benchmark script, but it briefly stalls the event loop.
            with open(link.split("/")[-1] + ".html", "wb") as f:
                f.write(response.content)
    except Exception as e:  # best-effort boundary, same as fetch_sync
        print(e)
async def benchmark_async():
    """Time downloading every country page with asyncio + a shared AsyncClient."""
    t0 = time.time()
    links = get_links()
    t1 = time.time()
    print(f"Took {t1 - t0} seconds to get all links")
    # Cap in-flight requests at 10, matching the executors' max_workers.
    sem = asyncio.Semaphore(10)
    async with httpx.AsyncClient() as client:
        tasks = []
        for link in links:
            tasks.append(
                asyncio.create_task(
                    fetch_async(link, semaphore=sem, http_client=client)
                )
            )
        await asyncio.gather(*tasks)
    t2 = time.time()
    print(
        f"Finish downloading {len(links)} links using asyncio with {t2 - t1} seconds"
    )
def benchmark(cls_executor: Executor):
    """Time downloading every country page with the given executor class.

    Args:
        cls_executor: ``ThreadPoolExecutor`` or ``ProcessPoolExecutor``.
            Threads share one ``httpx.Client`` (connection pooling); a Client
            is not picklable, so the process pool passes ``None`` and each
            ``fetch_sync`` call opens its own connection.
    """
    started_at = time.time()
    all_links = get_links()
    s2 = time.time()
    print(f"Took {s2 - started_at} seconds to get all links")
    # `is`, not `==`: we are comparing the class object itself.
    if cls_executor is ThreadPoolExecutor:
        with httpx.Client() as client:
            with cls_executor(max_workers=10) as pool:
                args = [(link, client) for link in all_links]
                # list(...) drains the map iterator so worker-side errors
                # (e.g. pickling failures) surface instead of being dropped.
                list(pool.map(fetch_sync, args))
    else:
        with cls_executor(max_workers=10) as pool:
            args = [(link, None) for link in all_links]
            list(pool.map(fetch_sync, args))
    ended_at = time.time()
    print(
        f"Finish downloading {len(all_links)} links using {cls_executor} with {ended_at - s2} seconds"
    )
if __name__ == "__main__":
    # Observations:
    # For ThreadPool and asyncio the httpx client is shared, so they are much
    # faster than ProcessPool.
    # See https://www.python-httpx.org/advanced/clients/#why-use-a-client
    # Without a shared client, asyncio is way slower than ThreadPool, likely
    # due to the overhead of creating HTTP connections per request; with a
    # shared client their performance is very close (ThreadPool still faster).
    if len(sys.argv) <= 1:
        # No argument: run all three benchmarks back to back.
        benchmark(ThreadPoolExecutor)
        time.sleep(2)
        benchmark(ProcessPoolExecutor)
        time.sleep(2)
        asyncio.run(benchmark_async())
    else:
        # Single-letter selector: t=threads, p=processes, a=asyncio.
        # An unrecognized mode is a silent no-op, as before.
        runners = {
            "t": lambda: benchmark(ThreadPoolExecutor),
            "p": lambda: benchmark(ProcessPoolExecutor),
            "a": lambda: asyncio.run(benchmark_async()),
        }
        selected = runners.get(sys.argv[1])
        if selected is not None:
            selected()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment