Created
July 28, 2024 16:17
-
-
Save Bertus-W/1029cdfec88ccb2057564c0834a702e2 to your computer and use it in GitHub Desktop.
Scrapy Playwright RAM Management
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import asyncio | |
| from asyncio import Lock | |
| from playwright.async_api import async_playwright | |
| from scrapy import signals | |
| from scrapy.http import HtmlResponse | |
class PlaywrightMiddleware:
    """Scrapy downloader middleware that renders pages with Playwright.

    To keep the Playwright browser's memory footprint from growing without
    bound, the browser is torn down and relaunched after every
    ``restart_limit`` rendered requests.  Requires Scrapy's asyncio reactor
    (``TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'``),
    which scrapy-playwright setups need anyway.
    """

    def __init__(self, crawler):
        self.crawler = crawler
        self.playwright = None    # playwright driver handle (None until started)
        self.browser = None       # launched browser instance
        self.context = None       # shared browser context used for every page
        self.page_counter = 0     # requests rendered since the last restart
        self.restart_limit = 500  # restart the browser after this many requests
        # Serializes the counter update and the restart so no request uses a
        # half-torn-down browser.
        self.lock = Lock()

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory: build the middleware and wire lifecycle signals."""
        middleware = cls(crawler)
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    async def init_browser(self):
        """Start Playwright and launch a headless Firefox with a fresh context."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.firefox.launch(headless=True)
        self.context = await self.browser.new_context()

    async def close_browser(self):
        """Tear down the context, browser and driver, clearing each handle.

        Clearing the handles prevents a double-close if a later restart
        attempt fails partway through.
        """
        if self.context:
            await self.context.close()
            self.context = None
        if self.browser:
            await self.browser.close()
            self.browser = None
        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

    async def restart_browser(self):
        """Replace the current browser stack with a freshly launched one."""
        await self.close_browser()
        await self.init_browser()

    async def spider_opened(self, spider):
        # Coroutine signal handler: Scrapy awaits it under the asyncio reactor.
        # The original used asyncio.get_event_loop().run_until_complete(),
        # which raises RuntimeError when the event loop is already running.
        await self.init_browser()

    async def spider_closed(self, spider):
        await self.close_browser()

    async def process_request(self, request, spider):
        """Render ``request.url`` in Playwright and return an ``HtmlResponse``."""
        async with self.lock:
            self.page_counter += 1
            if self.page_counter >= self.restart_limit:
                self.page_counter = 0
                spider.logger.info("Restarting browser instance...")
                self.crawler.engine.pause()
                try:
                    await self.restart_browser()
                finally:
                    # Always unpause, even if the restart fails — otherwise
                    # the whole crawl stalls forever.
                    self.crawler.engine.unpause()
                spider.logger.info("Browser restart complete, resuming requests")
            page = await self.context.new_page()
            try:
                response = await page.goto(request.url)
                content = await response.text()
            finally:
                # Close the page even when navigation raises; leaking a page
                # per failed request defeats the RAM management this
                # middleware exists for.
                await page.close()
            return HtmlResponse(
                url=request.url,
                body=content,
                encoding='utf-8',
                request=request,
            )

    def process_exception(self, request, exception, spider):
        """Re-queue the failed request via the middleware contract.

        Returning a Request from ``process_exception`` tells Scrapy to
        reschedule it; calling ``engine.crawl()`` directly relied on a
        non-public API whose signature changed across Scrapy versions.
        """
        # Allow the retry past the dupefilter, matching the original
        # direct-crawl behavior of unconditionally re-queueing.
        request.dont_filter = True
        return request
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This Scrapy + Playwright downloader middleware restarts the browser after a configurable number of requests (500 by default). Periodically relaunching the browser helps keep Playwright's RAM usage from growing unbounded over a long crawl.