@varaprasadh
Created January 2, 2025 17:18
import aiohttp
import asyncio
import random
from collections import defaultdict
import time
from typing import List, Dict
import logging

class ProxyRotator:
    def __init__(self, proxy_list: List[str], requests_per_proxy: int = 100):
        self.proxies = proxy_list
        self.proxy_usage = defaultdict(int)
        self.proxy_last_use = defaultdict(float)
        self.requests_per_proxy = requests_per_proxy
        
    async def get_proxy(self) -> str:
        now = time.time()
        # Reset a proxy's usage count once its 24-hour window has elapsed,
        # so each proxy gets a fresh request budget per day.
        for p in self.proxies:
            if now - self.proxy_last_use[p] > 86400:
                self.proxy_usage[p] = 0

        available_proxies = [
            p for p in self.proxies
            if self.proxy_usage[p] < self.requests_per_proxy
        ]
        if not available_proxies:
            raise RuntimeError("No available proxies")

        proxy = random.choice(available_proxies)
        self.proxy_usage[proxy] += 1
        self.proxy_last_use[proxy] = now
        return proxy

class ResilientScraper:
    def __init__(self, proxy_rotator: ProxyRotator):
        self.proxy_rotator = proxy_rotator
        self.session = None

    async def get_session(self) -> aiohttp.ClientSession:
        if self.session is None:
            self.session = aiohttp.ClientSession()
        return self.session

    async def close(self):
        # Close the shared session so aiohttp can release its connections
        if self.session is not None:
            await self.session.close()
            self.session = None
        
    async def fetch_url(self, url: str, retries: int = 3) -> Dict:
        session = await self.get_session()
        
        for attempt in range(retries):
            try:
                proxy = await self.proxy_rotator.get_proxy()
                
                async with session.get(
                    url,
                    proxy=proxy,
                    headers={
                        'User-Agent': self._get_random_ua(),
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Language': 'en-US,en;q=0.5',
                    },
                    timeout=aiohttp.ClientTimeout(total=30)
                ) as response:
                    if response.status == 200:
                        return {
                            'url': url,
                            'content': await response.text(),
                            'status': response.status
                        }
                    
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                await asyncio.sleep(2 ** attempt)  # Exponential backoff
                
        return {'url': url, 'content': None, 'status': 'failed'}

    def _get_random_ua(self) -> str:
        # Add a list of common user agents
        uas = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ]
        return random.choice(uas)

# Docker compose file for proxy containers
docker_compose = """
version: '3'
services:
  proxy1:
    image: tor-proxy
    ports:
      - "9050:9050"
  proxy2:
    image: tor-proxy
    ports:
      - "9051:9050"
  proxy3:
    image: tor-proxy
    ports:
      - "9052:9050"
"""

# Dockerfile for Tor proxy
dockerfile = """
FROM alpine:latest
RUN apk add --no-cache tor
COPY torrc /etc/tor/torrc
EXPOSE 9050
CMD ["tor", "-f", "/etc/tor/torrc"]
"""

# Example usage:
async def main():
    # List of proxy addresses from your Docker containers.
    # Note: aiohttp's per-request `proxy=` argument only supports HTTP(S)
    # proxies; for SOCKS endpoints like these, see the aiohttp-socks note below.
    proxies = [
        "socks5://localhost:9050",
        "socks5://localhost:9051",
        "socks5://localhost:9052",
    ]
    
    rotator = ProxyRotator(proxies)
    scraper = ResilientScraper(rotator)
    
    urls = [f"https://example.com/page/{i}" for i in range(100)]
    tasks = [scraper.fetch_url(url) for url in urls]
    
    results = await asyncio.gather(*tasks)
    await scraper.close()
    return results

if __name__ == "__main__":
    asyncio.run(main())
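
Note: aiohttp's per-request proxy= argument only supports HTTP(S) proxies, so the socks5:// URLs above will not work with session.get(..., proxy=...) as-is. Below is a minimal sketch of one workaround, assuming the third-party aiohttp-socks package (pip install aiohttp-socks) and a Tor SOCKS listener on localhost:9050; the helper name and URL are illustrative assumptions, not part of the gist.

# Sketch only: route a single request through a SOCKS5 proxy with aiohttp-socks.
import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector

async def fetch_via_socks(url: str, proxy_url: str = "socks5://localhost:9050") -> str:
    # ProxyConnector is bound to a session, so one session is created per call here;
    # a pool of per-proxy sessions would avoid recreating connections every time.
    connector = ProxyConnector.from_url(proxy_url)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
            return await resp.text()

# Example: asyncio.run(fetch_via_socks("https://check.torproject.org"))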

To use this effectively:

  1. Create a network of proxy containers:
# Create a custom Tor proxy network
docker network create proxy-network

# Run multiple Tor proxy containers
for i in {1..5}; do
    docker run -d --name tor-proxy-$i --network proxy-network \
    -p 905$i:9050 dperson/torproxy
done
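
The loop above publishes host ports 9051-9055 (one per container), so the proxies list in main() should point at those ports. Before scraping, a quick sanity check that each port is actually listening can save debugging time; a minimal sketch using only the standard library:

# Sketch: confirm each published Tor proxy port accepts TCP connections.
# The port range assumes the `for i in {1..5}` loop above (-p 905$i:9050).
import asyncio

async def check_port(host: str, port: int, timeout: float = 5.0) -> bool:
    try:
        _reader, writer = await asyncio.wait_for(asyncio.open_connection(host, port), timeout)
        writer.close()
        await writer.wait_closed()
        return True
    except (OSError, asyncio.TimeoutError):
        return False

async def check_all():
    for port in range(9051, 9056):
        ok = await check_port("localhost", port)
        print(f"localhost:{port} -> {'up' if ok else 'down'}")

asyncio.run(check_all())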
  2. Additional strategies you could implement:
  • Use residential proxy services (such as Luminati or Oxylabs)
  • Implement request queueing with Redis (a sketch follows this list)
  • Add IP rotation scheduling
  • Use Selenium with proxy-enabled browser profiles
  • Implement cookie/session management
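
To illustrate the Redis queueing idea, here is a minimal sketch using the redis-py asyncio client; the queue name scrape:urls and the localhost:6379 address are assumptions for illustration only:

# Sketch: a simple URL queue backed by Redis (pip install redis).
import asyncio
import redis.asyncio as redis

async def enqueue(r: redis.Redis, urls) -> None:
    # Producers push onto the left; workers pop from the right (FIFO).
    for url in urls:
        await r.lpush("scrape:urls", url)

async def drain(r: redis.Redis) -> None:
    while True:
        item = await r.brpop("scrape:urls", timeout=5)
        if item is None:  # queue is empty
            break
        _key, url = item
        print(f"would fetch {url.decode()}")  # hand off to ResilientScraper here

async def queue_demo():
    r = redis.from_url("redis://localhost:6379")
    await enqueue(r, [f"https://example.com/page/{i}" for i in range(10)])
    await drain(r)
    await r.aclose()

# asyncio.run(queue_demo())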

Here's a simple Docker Compose setup for multiple proxy containers:

version: '3'
services:
  scraper:
    build: .
    depends_on:
      - proxy1
      - proxy2
      - proxy3
    environment:
      - PROXY_ADDRESSES=proxy1:9050,proxy2:9050,proxy3:9050

  proxy1:
    image: dperson/torproxy
    restart: always

  proxy2:
    image: dperson/torproxy
    restart: always

  proxy3:
    image: dperson/torproxy
    restart: always
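
Inside the scraper container, the PROXY_ADDRESSES variable from this compose file can be turned into the proxy list for ProxyRotator. A minimal sketch (prefixing socks5:// is an assumption to match the earlier examples; swap the scheme if your containers expose an HTTP proxy instead):

# Sketch: build the ProxyRotator list from the PROXY_ADDRESSES environment variable.
import os

def proxies_from_env():
    raw = os.environ.get("PROXY_ADDRESSES", "")
    # e.g. "proxy1:9050,proxy2:9050,proxy3:9050" -> ["socks5://proxy1:9050", ...]
    return [f"socks5://{addr.strip()}" for addr in raw.split(",") if addr.strip()]

# rotator = ProxyRotator(proxies_from_env())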