import sys
import asyncio
import time
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, Executor, ProcessPoolExecutor


def get_links():
    # Scrape the UN population list page and collect a link to each country page
    countries_list = (
        "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
    )
    all_links = []
    response = httpx.get(countries_list)
    soup = BeautifulSoup(response.text, "lxml")
    countries_el = soup.select("td .flagicon+ a")
    for link_el in countries_el:
        link = link_el.get("href")
        link = urljoin(countries_list, link)  # resolve relative hrefs
        all_links.append(link)
    return all_links


def fetch_sync(args):
    try:
        link, http_client = args
        if http_client:
            # ThreadPoolExecutor: reuse the shared client's connection pool
            response = http_client.get(link)
        else:
            # ProcessPoolExecutor: the client is not shared across processes,
            # so fall back to a one-off request
            response = httpx.get(link)
        with open(link.split("/")[-1] + ".html", "wb") as f:
            f.write(response.content)
    except Exception as e:
        print(e)


async def fetch_async(
    link: str,
    http_client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
):
    async with semaphore:
        response = await http_client.get(link)
        with open(link.split("/")[-1] + ".html", "wb") as f:
            f.write(response.content)


async def benchmark_async():
    started_at = time.time()
    all_links = get_links()
    s2 = time.time()
    print(f"Took {s2 - started_at} seconds to get all links")
    # Cap in-flight requests at 10 to match the executors' max_workers
    semaphore = asyncio.Semaphore(10)
    async with httpx.AsyncClient() as client:
        tasks = [
            asyncio.create_task(
                fetch_async(link, semaphore=semaphore, http_client=client)
            )
            for link in all_links
        ]
        await asyncio.gather(*tasks)
    ended_at = time.time()
    print(
        f"Finished downloading {len(all_links)} links using asyncio in {ended_at - s2} seconds"
    )


def benchmark(cls_executor: type[Executor]):
    started_at = time.time()
    all_links = get_links()
    s2 = time.time()
    print(f"Took {s2 - started_at} seconds to get all links")
    if cls_executor is ThreadPoolExecutor:
        # Threads can share one client and reuse its connection pool
        with httpx.Client() as client:
            with cls_executor(max_workers=10) as pool:
                args = [(link, client) for link in all_links]
                pool.map(fetch_sync, args)
    else:
        # Processes cannot share the client, so each request opens its own connection
        with cls_executor(max_workers=10) as pool:
            args = [(link, None) for link in all_links]
            pool.map(fetch_sync, args)
    ended_at = time.time()
    print(
        f"Finished downloading {len(all_links)} links using {cls_executor} in {ended_at - s2} seconds"
    )
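

# The __main__ docstring below credits the speed gap to connection reuse. This is
# a minimal sketch of that shared-client idiom, not part of the benchmark itself;
# it is never called, and the example.com URLs are placeholders.
def _shared_client_demo():
    with httpx.Client() as client:
        client.get("https://example.com/a")  # opens a TCP/TLS connection
        client.get("https://example.com/b")  # reuses the pooled connection
    httpx.get("https://example.com/c")  # one-off request: a new connection every call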


if __name__ == "__main__":
    """
    For ThreadPool and asyncio, the shared httpx client makes them much faster
    than ProcessPool. See https://www.python-httpx.org/advanced/clients/#why-use-a-client

    Without a shared client: asyncio is far slower than ThreadPool, likely due
    to the overhead of creating an HTTP connection per request.
    With a shared client: ThreadPool and asyncio perform very closely, with
    ThreadPool still slightly faster.
    """
    if len(sys.argv) <= 1:
        # No argument: run all three benchmarks back to back
        benchmark(ThreadPoolExecutor)
        time.sleep(2)
        benchmark(ProcessPoolExecutor)
        time.sleep(2)
        asyncio.run(benchmark_async())
    else:
        mode = sys.argv[1]
        if mode == "t":
            benchmark(ThreadPoolExecutor)
        elif mode == "p":
            benchmark(ProcessPoolExecutor)
        elif mode == "a":
            asyncio.run(benchmark_async())
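
# Usage (assuming the gist is saved as benchmark.py; the filename is arbitrary):
#   python benchmark.py      # run all three benchmarks back to back
#   python benchmark.py t    # ThreadPoolExecutor only
#   python benchmark.py p    # ProcessPoolExecutor only
#   python benchmark.py a    # asyncio only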