@Bertus-W
Created July 28, 2024 16:17
Scrapy Playwright RAM Management
import logging
from asyncio import Lock

from playwright.async_api import async_playwright
from scrapy import signals
from scrapy.http import HtmlResponse

logger = logging.getLogger(__name__)


class PlaywrightMiddleware:
    """Downloader middleware that routes requests through a shared Playwright
    browser and restarts it every `restart_limit` requests to keep RAM in check."""

    def __init__(self, crawler):
        self.crawler = crawler
        self.playwright = None
        self.browser = None
        self.context = None
        self.page_counter = 0
        self.restart_limit = 500  # restart the browser after this many requests
        self.lock = Lock()        # serialises counter updates and browser restarts

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls(crawler)
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware
    async def init_browser(self):
        # Start Playwright and launch a single headless Firefox with one shared context.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.firefox.launch(headless=True)
        self.context = await self.browser.new_context()

    async def close_browser(self):
        # Tear everything down in reverse order of creation.
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def restart_browser(self):
        logger.debug("Closing browser")
        await self.close_browser()
        logger.debug("Launching fresh browser")
        await self.init_browser()
    async def spider_opened(self, spider):
        # Scrapy accepts coroutine signal handlers, so the browser is started
        # and stopped on the already-running event loop.
        await self.init_browser()

    async def spider_closed(self, spider):
        await self.close_browser()
    async def process_request(self, request, spider):
        async with self.lock:
            self.page_counter += 1
            if self.page_counter >= self.restart_limit:
                self.page_counter = 0
                spider.logger.info("Restarting browser instance...")
                # Pause the engine so no new downloads start while the browser
                # is being swapped out.
                self.crawler.engine.pause()
                await self.restart_browser()
                self.crawler.engine.unpause()
                spider.logger.info("Browser restart complete, resuming requests")

        # Fetch the page in a fresh tab and hand the HTML back to Scrapy.
        page = await self.context.new_page()
        try:
            response = await page.goto(request.url)
            content = await response.text()
        finally:
            await page.close()
        return HtmlResponse(
            url=request.url,
            body=content,
            encoding='utf-8',
            request=request,
        )
    def process_exception(self, request, exception, spider):
        # Re-queue the request in case of any exception; dont_filter is needed
        # so the dupefilter does not drop the already-seen fingerprint.
        return request.replace(dont_filter=True)
@Bertus-W

This downloader middleware for Scrapy + Playwright restarts the browser after a fixed number of requests (restart_limit, 500 by default). This seems to help mitigate the RAM filling up over long crawls.
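
To try it out, the middleware needs to be enabled in the project settings, and the crawl must run on the asyncio reactor because Playwright's async API needs a running asyncio event loop. A minimal sketch, assuming the class above lives in myproject/middlewares.py (the module path and the priority value 543 are placeholders):

# settings.py (sketch)
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

DOWNLOADER_MIDDLEWARES = {
    # placeholder module path; adjust to wherever PlaywrightMiddleware is defined
    "myproject.middlewares.PlaywrightMiddleware": 543,
}

With this enabled, every request is fetched by Playwright and handed back to the spider as a normal HtmlResponse, so existing parse callbacks keep working unchanged.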
