import sys
import asyncio
import time
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, Executor, ProcessPoolExecutor


def get_links():
    # Scrape the UN population list page and collect a link to each country page
    countries_list = (
        "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
    )
    all_links = []
    response = httpx.get(countries_list)
    soup = BeautifulSoup(response.text, "lxml")
    countries_el = soup.select("td .flagicon+ a")
    for link_el in countries_el:
        link = link_el.get("href")
        link = urljoin(countries_list, link)  # resolve relative hrefs
        all_links.append(link)
    return all_links


def fetch_sync(args):
    try:
        link, http_client = args
        if http_client:
            # ThreadPoolExecutor: reuse the shared client's connection pool
            response = http_client.get(link)
        else:
            # ProcessPoolExecutor: the client is not shared across processes,
            # so fall back to a one-off request
            response = httpx.get(link)
        with open(link.split("/")[-1] + ".html", "wb") as f:
            f.write(response.content)
    except Exception as e:
        print(e)


async def fetch_async(
    link: str,
    http_client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
):
    async with semaphore:
        response = await http_client.get(link)
        with open(link.split("/")[-1] + ".html", "wb") as f:
            f.write(response.content)


async def benchmark_async():
    started_at = time.time()
    all_links = get_links()
    s2 = time.time()
    print(f"Took {s2 - started_at} seconds to get all links")
    # Cap in-flight requests at 10 to match the executors' max_workers
    semaphore = asyncio.Semaphore(10)
    async with httpx.AsyncClient() as client:
        tasks = [
            asyncio.create_task(
                fetch_async(link, semaphore=semaphore, http_client=client)
            )
            for link in all_links
        ]
        await asyncio.gather(*tasks)
    ended_at = time.time()
    print(
        f"Finished downloading {len(all_links)} links using asyncio in {ended_at - s2} seconds"
    )


def benchmark(cls_executor: type[Executor]):
    started_at = time.time()
    all_links = get_links()
    s2 = time.time()
    print(f"Took {s2 - started_at} seconds to get all links")
    if cls_executor is ThreadPoolExecutor:
        # Threads can share one client and reuse its connection pool
        with httpx.Client() as client:
            with cls_executor(max_workers=10) as pool:
                args = [(link, client) for link in all_links]
                pool.map(fetch_sync, args)
    else:
        # Processes cannot share the client, so each request opens its own connection
        with cls_executor(max_workers=10) as pool:
            args = [(link, None) for link in all_links]
            pool.map(fetch_sync, args)
    ended_at = time.time()
    print(
        f"Finished downloading {len(all_links)} links using {cls_executor} in {ended_at - s2} seconds"
    )
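

# The __main__ docstring below credits the speed gap to connection reuse. This is
# a minimal sketch of that shared-client idiom, not part of the benchmark itself;
# it is never called, and the example.com URLs are placeholders.
def _shared_client_demo():
    with httpx.Client() as client:
        client.get("https://example.com/a")  # opens a TCP/TLS connection
        client.get("https://example.com/b")  # reuses the pooled connection
    httpx.get("https://example.com/c")  # one-off request: a new connection every call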


if __name__ == "__main__":
    """
    For ThreadPool and asyncio, the shared httpx client makes them much faster
    than ProcessPool. See https://www.python-httpx.org/advanced/clients/#why-use-a-client

    Without a shared client: asyncio is far slower than ThreadPool, likely due
    to the overhead of creating an HTTP connection per request.
    With a shared client: ThreadPool and asyncio perform very closely, with
    ThreadPool still slightly faster.
    """
    if len(sys.argv) <= 1:
        # No argument: run all three benchmarks back to back
        benchmark(ThreadPoolExecutor)
        time.sleep(2)
        benchmark(ProcessPoolExecutor)
        time.sleep(2)
        asyncio.run(benchmark_async())
    else:
        mode = sys.argv[1]
        if mode == "t":
            benchmark(ThreadPoolExecutor)
        elif mode == "p":
            benchmark(ProcessPoolExecutor)
        elif mode == "a":
            asyncio.run(benchmark_async())
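
# Usage (assuming the gist is saved as benchmark.py; the filename is arbitrary):
#   python benchmark.py      # run all three benchmarks back to back
#   python benchmark.py t    # ThreadPoolExecutor only
#   python benchmark.py p    # ProcessPoolExecutor only
#   python benchmark.py a    # asyncio only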