AsyncScraper
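An asynchronous web scraper built on asyncio, aiohttp, and BeautifulSoup: a semaphore caps the number of concurrent requests, each fetch rotates user agents and sleeps a random interval to mimic human browsing, 429 (Too Many Requests) responses trigger a backoff-and-retry, and parsed results are appended to results.json as line-delimited JSON.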
import asyncio
import json
import random

import aiofiles  # third-party: async file I/O
import aiohttp
from bs4 import BeautifulSoup


class AsyncScraper:
    def __init__(self, max_concurrency=20):
        # Cap the number of requests in flight at once.
        self.semaphore = asyncio.Semaphore(max_concurrency)
        # Rotate user agents so requests look less uniform.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
            # Add more user agents
        ]

    async def fetch(self, session, url, max_retries=3):
        async with self.semaphore:
            for attempt in range(max_retries + 1):
                # Random delay to mimic human behavior.
                await asyncio.sleep(random.uniform(0.5, 3))
                headers = {
                    "User-Agent": random.choice(self.user_agents),
                    "Accept": "text/html,application/xhtml+xml,application/xml",
                    "Accept-Language": "en-US,en;q=0.9",
                }
                try:
                    async with session.get(url, headers=headers) as response:
                        if response.status == 200:
                            return await response.text()
                        if response.status == 429:  # Too Many Requests
                            # Back off, then retry (bounded by max_retries).
                            await asyncio.sleep(30)
                            continue
                        print(f"Error: {response.status} on {url}")
                        return None
                except Exception as e:
                    print(f"Exception during fetch: {e}")
                    return None
            print(f"Giving up on {url} after {max_retries} retries")
            return None

    async def parse(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        # Placeholder extraction: grab the page title.
        # Replace with site-specific parsing logic.
        data = {"title": soup.title.string if soup.title else None}
        return data

    async def process_url(self, session, url):
        html = await self.fetch(session, url)
        if html:
            data = await self.parse(html)
            await self.save(data)

    async def save(self, data):
        # Append each record as a line of JSON.
        async with aiofiles.open('results.json', 'a') as f:
            await f.write(json.dumps(data) + '\n')

    async def run(self, urls):
        # One shared session for connection pooling across all URLs.
        async with aiohttp.ClientSession() as session:
            tasks = [self.process_url(session, url) for url in urls]
            await asyncio.gather(*tasks)
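
A minimal usage sketch, assuming hypothetical example.com placeholder URLs (the file itself ships with no entry point); aiofiles, aiohttp, and beautifulsoup4 are third-party packages that must be installed first:

if __name__ == "__main__":
    # Hypothetical placeholder URLs; swap in the pages you actually want to scrape.
    urls = [f"https://example.com/page/{i}" for i in range(1, 6)]
    scraper = AsyncScraper(max_concurrency=5)
    asyncio.run(scraper.run(urls))

asyncio.run creates the event loop, so AsyncScraper.run only needs to be called once with the full URL list; the semaphore inside fetch handles the actual throttling.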