Skip to content

Instantly share code, notes, and snippets.

@reservoirinvest
Created July 29, 2020 04:24
Show Gist options
  • Save reservoirinvest/8a9175194269aa0362adf329417f28dd to your computer and use it in GitHub Desktop.
Save reservoirinvest/8a9175194269aa0362adf329417f28dd to your computer and use it in GitHub Desktop.
asyncio
# Using tasks created with asyncio.create_task for better control over errors
# Ref: [EdgeDB](https://youtu.be/-CzqsgaXUM8?t=2279)
import asyncio
import time
from typing import Callable, Coroutine
import httpx
# Root URL of the crawl: it is the first page fetched, and crawl3 only
# follows lines of a fetched page that start with this address.
addr = "https://langa.pl/crawl"
# Let us start by making a progress-reporting async function.
async def progress(
    url: str,
    algo: Callable[..., Coroutine],
) -> None:
    """Run ``algo(url)`` as a task and report progress until no tasks remain.

    The initial task is added to the module-level ``todo`` set, which
    ``algo`` may extend with further tasks while it runs.  The loop waits on
    the current contents of ``todo`` for at most half a second at a time,
    removes the tasks that finished, reports any that failed, and prints the
    number of outstanding tasks followed by the last 75 characters of their
    sorted, space-joined names.  Once ``todo`` is empty the elapsed time in
    whole seconds is printed.

    Args:
        url: Argument passed to ``algo``; also used as the first task's name.
        algo: Callable returning a coroutine when called with ``url``.
    """
    # Wrap the coroutine in a Task (instead of tracking the bare URL) so it
    # can be waited on and inspected once it finishes.
    root_task = asyncio.create_task(
        algo(url),
        name=url,
    )
    todo.add(root_task)

    # A monotonic clock is unaffected by system clock adjustments, so the
    # measured duration cannot jump or go negative.
    started = time.monotonic()
    while todo:
        # Unlike wait_for, asyncio.wait does not raise on timeout; it returns
        # two sets: the tasks that are done and the ones still pending.
        done, _pending = await asyncio.wait(todo, timeout=0.5)
        # Drop finished tasks so the loop ends when nothing is outstanding.
        todo.difference_update(done)

        # Retrieve each finished task's exception so failures are reported
        # here instead of surfacing later as "Task exception was never
        # retrieved".  Cancelled tasks are skipped because calling
        # exception() on them would raise CancelledError.
        for finished in done:
            if finished.cancelled():
                continue
            error = finished.exception()
            if error is not None:
                print(f"{finished.get_name()} failed: {error!r}")

        # Current status: count of outstanding tasks and the tail of their
        # sorted names.
        names = sorted(t.get_name() for t in todo)
        print(f"{len(todo)}: " + " ".join(names)[-75:])

    elapsed = time.monotonic() - started
    print(f"Took {int(elapsed)} seconds")
async def crawl3(
    prefix: str, url: str = "",
) -> None:
    """Fetch one page and schedule a crawl task for each matching line.

    The page at ``url`` (or at ``prefix`` when ``url`` is empty) is fetched
    with a fresh ``httpx.AsyncClient``.  Every line of the response text that
    starts with ``prefix`` is treated as a URL to crawl: a new task running
    ``crawl3(prefix, line)`` is created, named after the line, and added to
    the module-level ``todo`` set.

    Args:
        prefix: String a line must start with to be followed; also the
            address fetched when ``url`` is empty.
        url: Address to fetch; defaults to ``prefix``.
    """
    url = url or prefix
    # The context manager closes the client even if the request raises,
    # replacing the manual try/finally + aclose() pair.
    async with httpx.AsyncClient() as client:
        res = await client.get(url)

    for line in res.text.splitlines():
        if line.startswith(prefix):
            # Schedule the child crawl without awaiting it; it is tracked
            # through the shared todo set instead.
            task = asyncio.create_task(
                crawl3(prefix, line),
                name=line,
            )
            todo.add(task)
# Shared set of crawl tasks that have been scheduled and not yet seen as
# finished; both progress() and crawl3() add tasks to it.
todo: set = set()
# Start an event loop and crawl from the root address, reporting progress.
asyncio.run(progress(url=addr, algo=crawl3))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment