scalable scraping with puppeteer-cluster.
// Code structure:
// - app.js
// - router.js
// - controller.js
// - service.js
// - implementation (the Puppeteer task callback)
// This is hosted on AWS Fargate. Below is a rough worst-case calculation of the resource usage.
// Fargate configuration:
// - 8 GB RAM per container
// - 25 (max) concurrent requests per container
// - ~300 MB RAM consumed per request (a rough worst-case figure from googling, since we use a combination of the cluster and new Chrome instances)
// Now,
// total_request_each_container_can_handle = (8 GB * 1000 MB/GB) / 300 MB
// total_request_each_container_can_handle = ~26 requests
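// The same worst-case arithmetic as a small helper. The numbers are the rough assumptions
// above (8 GB per container, ~300 MB per request), not measured values.
const estimateRequestsPerContainer = (ramGb, mbPerRequest) =>
  Math.floor((ramGb * 1000) / mbPerRequest);

// estimateRequestsPerContainer(8, 300) === 26, so capping each container at 25 concurrent
// requests leaves a small safety margin.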
// app.js
// Initialize the Puppeteer Cluster once on server start.
const express = require('express');
const { Cluster } = require('puppeteer-cluster');
const routes = require('./router'); // assumed path, matching the code structure above

const app = express();
const port = process.env.PORT || 3000; // assumed default port
app.use(express.json()); // parse JSON bodies so req.body.url is available in the controller

(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: 3, // adjustable
    puppeteerOptions: {
      headless: true,
    },
  });

  app.use('/api',
    (req, res, next) => {
      req.cluster = cluster; // req.cluster is available to every request going through the "/api" route
      next();
    },
    routes);

  // Start the Express server
  app.listen(port, () => {
    console.log(`Server is running on http://localhost:${port}`);
  });

  // Handle graceful shutdown
  process.on('SIGINT', async () => {
    await cluster.idle();
    await cluster.close();
    process.exit();
  });
})();
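// On ECS/Fargate a stopping task typically receives SIGTERM first (SIGKILL follows after the
// stop timeout), so the same cleanup is worth registering for SIGTERM as well. This handler
// belongs inside the IIFE above, next to the SIGINT handler, so that `cluster` is in scope.
process.on('SIGTERM', async () => {
  await cluster.idle();  // let in-flight tasks finish
  await cluster.close(); // shut down all browser pages
  process.exit(0);
});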
// router.js
Router.post("/", scraperController.initTask); // requests reach this handler through the "/api" prefix mounted in app.js
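// A minimal sketch of how router.js might be wired up around the line above. The require
// paths and the module.exports pattern are assumptions for illustration, not from the gist.
const express = require("express");
const scraperController = require("./controller");

const Router = express.Router();
Router.post("/", scraperController.initTask);

module.exports = Router;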
// controller.js
initTask = async (req, res) => {
  const cluster = req.cluster;
  const url = req.body.url;
  const result = await this.scraperService.initTask({ cluster, url }); // pass the cluster down to the service
  res.json({ message: "Success", data: result });
};
// service.js
async initTask({ cluster, url }) {
  const data = {
    url,
  };
  const scrapedData = await cluster.execute(data, myPuppeteerTask); // .execute() hands the callback (myPuppeteerTask) a "page", i.e. a fresh tab
  return scrapedData;
}
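// The controller and service methods above use class-field syntax and "this.scraperService",
// which implies a class-based setup. A minimal wiring sketch, assuming plain classes with
// CommonJS exports; the instantiation shown here is an assumption, not part of the gist.

// service.js
class ScraperService {
  async initTask({ cluster, url }) {
    /* as shown above */
  }
}
module.exports = ScraperService;

// controller.js
const ScraperService = require("./service");

class ScraperController {
  scraperService = new ScraperService();

  initTask = async (req, res) => {
    /* as shown above */
  };
}
module.exports = new ScraperController();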
// Implementation of the task callback used above; the cluster hands us a "page" (a new tab) to work with.
myPuppeteerTask = async ({ page, data }) => {
  const { url } = data;
  await page.goto(url);
  // Do puppeteer things
  return "Task Done"; // Or return whatever you need.
};
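// Example call against the endpoint above, assuming the server runs locally on port 3000
// (the port and the target URL are assumptions for illustration). Needs Node 18+ for global fetch.
fetch("http://localhost:3000/api", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ url: "https://example.com" }),
})
  .then((res) => res.json())
  .then(({ message, data }) => console.log(message, data)); // -> "Success" "Task Done"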