import aiohttp
import asyncio
import random
from collections import defaultdict
import time
from typing import List, Dict
import logging
class ProxyRotator:
def __init__(self, proxy_list: List[str], requests_per_proxy: int = 100):
self.proxies = proxy_list
self.proxy_usage = defaultdict(int)
self.proxy_last_use = defaultdict(float)
self.requests_per_proxy = requests_per_proxy
    async def get_proxy(self) -> str:
        now = time.time()
        # Once a proxy has exhausted its request budget and then sat idle for
        # 24 hours, its counter resets so it can be used again.
        for p in self.proxies:
            if now - self.proxy_last_use[p] > 86400:  # 24 hour cooldown
                self.proxy_usage[p] = 0
        available_proxies = [
            p for p in self.proxies
            if self.proxy_usage[p] < self.requests_per_proxy
        ]
        if not available_proxies:
            raise RuntimeError("No available proxies")
        proxy = random.choice(available_proxies)
        self.proxy_usage[proxy] += 1
        self.proxy_last_use[proxy] = now
        return proxy
class ResilientScraper:
def __init__(self, proxy_rotator: ProxyRotator):
self.proxy_rotator = proxy_rotator
self.session = None
async def get_session(self):
if not self.session:
self.session = aiohttp.ClientSession()
return self.session
    async def fetch_url(self, url: str, retries: int = 3) -> Dict:
        session = await self.get_session()
        for attempt in range(retries):
            try:
                proxy = await self.proxy_rotator.get_proxy()
                # Note: aiohttp's proxy= argument only supports HTTP(S) proxies;
                # SOCKS proxies such as Tor need aiohttp-socks (see the sketch
                # after this script).
                async with session.get(
                    url,
                    proxy=proxy,
                    headers={
                        'User-Agent': self._get_random_ua(),
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Language': 'en-US,en;q=0.5',
                    },
                    timeout=aiohttp.ClientTimeout(total=30)
                ) as response:
                    if response.status == 200:
                        return {
                            'url': url,
                            'content': await response.text(),
                            'status': response.status
                        }
                    logging.warning(f"Got status {response.status} for {url}")
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed for {url}: {e}")
            await asyncio.sleep(2 ** attempt)  # Exponential backoff
        return {'url': url, 'content': None, 'status': 'failed'}
def _get_random_ua(self) -> str:
        # A small pool of common desktop user agents
uas = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
]
return random.choice(uas)
# Docker compose file for proxy containers
docker_compose = """
version: '3'
services:
proxy1:
image: tor-proxy
ports:
- "9050:9050"
proxy2:
image: tor-proxy
ports:
- "9051:9050"
proxy3:
image: tor-proxy
ports:
- "9052:9050"
"""
# Dockerfile for Tor proxy
dockerfile = """
FROM alpine:latest
RUN apk add --no-cache tor
COPY torrc /etc/tor/torrc
EXPOSE 9050
CMD ["tor", "-f", "/etc/tor/torrc"]
"""
# Example usage:
async def main():
    # Proxy addresses exposed by the Docker containers below. Plain aiohttp
    # cannot speak SOCKS, so either front each Tor container with an HTTP
    # proxy (e.g. Privoxy) or use aiohttp-socks as sketched after this script.
    proxies = [
        "socks5://localhost:9050",
        "socks5://localhost:9051",
        "socks5://localhost:9052",
    ]
    rotator = ProxyRotator(proxies)
    scraper = ResilientScraper(rotator)
    urls = [f"https://example.com/page/{i}" for i in range(100)]
    try:
        tasks = [scraper.fetch_url(url) for url in urls]
        results = await asyncio.gather(*tasks)
        return results
    finally:
        if scraper.session:
            await scraper.session.close()  # Avoid "Unclosed client session" warnings
if __name__ == "__main__":
    asyncio.run(main())
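Because aiohttp's built-in proxy= argument only handles HTTP(S) proxies, talking to the Tor containers over SOCKS needs a separate connector. Here is a minimal sketch using the aiohttp-socks package; the helper name is illustrative, and a ProxyConnector is bound to a session, so this version opens a short-lived session per request:
# Sketch: fetching through a SOCKS5 proxy with aiohttp-socks.
# pip install aiohttp-socks
import aiohttp
from aiohttp_socks import ProxyConnector

async def fetch_via_socks(url: str, proxy_url: str) -> str:
    # A fresh session is created per proxy here; pooling one session per proxy
    # would avoid the repeated connection setup cost.
    connector = ProxyConnector.from_url(proxy_url)  # e.g. "socks5://localhost:9050"
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
            return await resp.text()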
To use this effectively:
- Create a network of proxy containers:
# Create a custom Tor proxy network
docker network create proxy-network
# Run multiple Tor proxy containers
for i in {1..5}; do
docker run -d --name tor-proxy-$i --network proxy-network \
-p 905$i:9050 dperson/torproxy
done
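Once the containers are up, it is worth checking that traffic really exits through Tor before pointing the scraper at them; Tor's check endpoint reports whether the requesting IP is a Tor exit (IsTor should be true). For example, against the first container:
# Verify a proxy routes through Tor
curl --socks5-hostname localhost:9051 https://check.torproject.org/api/ip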
- Additional strategies you could implement:
- Use residential proxy services (like Bright Data, formerly Luminati, or Oxylabs)
- Implement request queueing with Redis (a minimal sketch follows this list)
- Add IP rotation scheduling
- Use Selenium with proxy browser profiles
- Implement cookie/session management
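As a rough illustration of the Redis queueing idea: producers push URLs onto a list and any number of scraper workers block-pop from it. The queue name, Redis URL, and helper names below are assumptions, and redis-py 4.2+ ships the asyncio client used here:
# Sketch: Redis-backed URL queue for scraper workers.
import redis.asyncio as redis

QUEUE = "scrape:urls"  # illustrative queue name

async def enqueue(urls):
    r = redis.from_url("redis://localhost:6379")
    await r.rpush(QUEUE, *urls)
    await r.close()

async def worker(scraper):
    r = redis.from_url("redis://localhost:6379")
    while True:
        item = await r.brpop(QUEUE, timeout=5)  # returns (key, value) or None
        if item is None:
            break  # queue drained
        url = item[1].decode()
        await scraper.fetch_url(url)
    await r.close()
# Usage: asyncio.run(enqueue(urls)) from a producer, then run one or more workers.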
Here's a simple Docker Compose setup for multiple proxy containers:
version: '3'
services:
scraper:
build: .
depends_on:
- proxy1
- proxy2
- proxy3
environment:
- PROXY_ADDRESSES=proxy1:9050,proxy2:9050,proxy3:9050
proxy1:
image: dperson/torproxy
restart: always
proxy2:
image: dperson/torproxy
restart: always
proxy3:
image: dperson/torproxy
restart: always
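The compose file hands the proxy endpoints to the scraper through the PROXY_ADDRESSES variable, while the example main() above hardcodes them. A small sketch of wiring the two together; the helper name and the socks5:// prefix are assumptions about how the addresses should be consumed:
# Sketch: build the proxy list from the PROXY_ADDRESSES environment variable
# set in the compose file above.
import os
from typing import List

def proxies_from_env() -> List[str]:
    raw = os.environ.get("PROXY_ADDRESSES", "")
    return [f"socks5://{addr.strip()}" for addr in raw.split(",") if addr.strip()]

# rotator = ProxyRotator(proxies_from_env())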