Skip to content

Instantly share code, notes, and snippets.

@shortthirdman
Created March 25, 2025 11:42
Show Gist options
  • Save shortthirdman/abaef77e521d19186be2c82f15ebdfc9 to your computer and use it in GitHub Desktop.
Save shortthirdman/abaef77e521d19186be2c82f15ebdfc9 to your computer and use it in GitHub Desktop.
AsyncScraper
import asyncio
import json
import random

import aiohttp
from bs4 import BeautifulSoup
class AsyncScraper:
    """Concurrent page scraper: fetch each URL, parse it, append a JSON line.

    At most ``max_concurrency`` requests are in flight at once. Every request
    is preceded by a random pause and sent with a randomly chosen User-Agent.
    HTTP 429 responses are retried a bounded number of times after a fixed
    backoff; any other non-200 status or exception is printed and skipped.

    NOTE: on Python < 3.10 asyncio primitives bind to the event loop that is
    current when they are created, so construct the scraper inside the
    running loop there.
    """

    def __init__(
        self,
        max_concurrency=20,
        *,
        min_delay=0.5,
        max_delay=3.0,
        backoff=30.0,
        max_retries=5,
        output_path="results.json",
    ):
        """Configure the scraper.

        Args:
            max_concurrency: Maximum number of simultaneous requests.
            min_delay: Lower bound, in seconds, of the random pre-request pause.
            max_delay: Upper bound, in seconds, of the random pre-request pause.
            backoff: Seconds to wait after an HTTP 429 before retrying.
            max_retries: How many times a 429 response is retried before the
                URL is given up on.
            output_path: File that ``save`` appends JSON lines to.
        """
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.backoff = backoff
        self.max_retries = max_retries
        self.output_path = output_path
        # Serializes appends so concurrently saved records cannot interleave.
        self._write_lock = asyncio.Lock()
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
            # Add more user agents
        ]

    async def fetch(self, session, url):
        """Return the body of ``url`` as text, or ``None`` on failure.

        Args:
            session: Object whose ``get(url, headers=...)`` returns an async
                context manager yielding a response with ``status`` and an
                awaitable ``text()`` (an ``aiohttp.ClientSession`` in ``run``).
            url: Address to download.

        Returns:
            The response text for a 200 response. ``None`` when the status is
            anything else, when 429 retries are exhausted, or when the request
            raises.
        """
        status = None
        for attempt in range(self.max_retries + 1):
            async with self.semaphore:
                # Add random delay to mimic human behavior
                await asyncio.sleep(random.uniform(self.min_delay, self.max_delay))
                headers = {
                    "User-Agent": random.choice(self.user_agents),
                    "Accept": "text/html,application/xhtml+xml,application/xml",
                    "Accept-Language": "en-US,en;q=0.9"
                }
                try:
                    async with session.get(url, headers=headers) as response:
                        status = response.status
                        if status == 200:
                            return await response.text()
                except Exception as e:
                    # Deliberate best-effort: one bad URL must not stop the crawl.
                    print(f"Exception during fetch: {e}")
                    return None

            if status != 429:
                print(f"Error: {status} on {url}")
                return None

            # Too Many Requests. The semaphore slot has been released above, so
            # backing off neither blocks other URLs nor requires a second slot
            # for the retry (re-acquiring while holding one could deadlock).
            if attempt < self.max_retries:
                await asyncio.sleep(self.backoff)

        print(f"Error: {status} on {url}")
        return None

    async def parse(self, html):
        """Extract a JSON-serializable record from an HTML document.

        The default extraction is intentionally generic; override this method
        for site-specific parsing.

        Args:
            html: Markup to parse.

        Returns:
            A dict with ``"title"`` (text of the ``<title>`` tag, or ``None``
            when there is none) and ``"links"`` (the ``href`` of every ``<a>``
            tag that has one, in document order).
        """
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag is not None else None
        links = [anchor["href"] for anchor in soup.find_all("a", href=True)]
        return {"title": title, "links": links}

    async def process_url(self, session, url):
        """Fetch ``url`` and, if a non-empty body came back, parse and save it."""
        html = await self.fetch(session, url)
        if not html:
            return
        data = await self.parse(html)
        await self.save(data)

    @staticmethod
    def _append_line(path, line):
        """Append ``line`` to the file at ``path`` (blocking; runs in a thread)."""
        with open(path, "a", encoding="utf-8") as f:
            f.write(line)

    async def save(self, data):
        """Append ``data`` to ``self.output_path`` as one JSON line.

        The blocking file write runs in the default executor so the event
        loop is not stalled, and a lock keeps concurrent saves from mixing
        their output.

        Args:
            data: Any object ``json.dumps`` can serialize.
        """
        line = json.dumps(data) + '\n'
        loop = asyncio.get_running_loop()
        async with self._write_lock:
            await loop.run_in_executor(
                None, self._append_line, self.output_path, line
            )

    async def run(self, urls):
        """Process every URL in ``urls`` concurrently over one shared session.

        A failure while parsing or saving one URL is printed and does not
        abort the remaining URLs.

        Args:
            urls: Iterable of addresses to scrape.
        """
        urls = list(urls)
        async with aiohttp.ClientSession() as session:
            # return_exceptions=True lets every task finish before the session
            # closes, instead of the first failure tearing it down mid-flight.
            results = await asyncio.gather(
                *(self.process_url(session, url) for url in urls),
                return_exceptions=True,
            )
        for url, result in zip(urls, results):
            if isinstance(result, Exception):
                print(f"Exception while processing {url}: {result}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment