Skip to content

Instantly share code, notes, and snippets.

@MinSomai
Last active February 23, 2025 15:49
Show Gist options
  • Save MinSomai/4dfe6324e7718e4b6ac1fdb59f661674 to your computer and use it in GitHub Desktop.
Save MinSomai/4dfe6324e7718e4b6ac1fdb59f661674 to your computer and use it in GitHub Desktop.
scalable scraping with puppeteer-cluster.
// Code structure
// app.js
// router.js
// controller.js
// service.js
// implementation
// This is hosted on AWS Fargate. Below is a rough worst-case calculation of the resource usage.
// Fargate configuration:
// - 8GB RAM per container
// - 25 (max) concurrent requests per container.
// - 300MB RAM consumption per request (a rough worst case from research; we use a combination of the cluster and new Chrome instances).
// now,
// total_requests_each_container_can_handle = (8 GB * 1000 MB/GB) / 300 MB
// total_requests_each_container_can_handle = ~26 requests
// Initialize the Puppeteer cluster once at server start and share it with
// every request routed through "/api" (attached as req.cluster).
const { Cluster } = require('puppeteer-cluster');

(async () => {
  const cluster = await Cluster.launch({
    // CONCURRENCY_PAGE: one shared browser; each task gets its own tab.
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: 3, // adjustable
    puppeteerOptions: {
      headless: true,
    },
  });

  app.use(
    '/api',
    (req, res, next) => {
      req.cluster = cluster; // req.cluster is available for every request going through "/api" route
      next();
    },
    routes,
  );

  // Start the Express server
  app.listen(port, () => {
    console.log(`Server is running on http://localhost:${port}`);
  });

  // Handle graceful shutdown. Fargate/ECS stops tasks with SIGTERM (SIGINT
  // only arrives on Ctrl+C locally), so listen for both. cluster.idle()
  // waits for in-flight scrape jobs before the browser is closed.
  const shutdown = async (signal) => {
    console.log(`Received ${signal}, shutting down...`);
    await cluster.idle();
    await cluster.close();
    process.exit(0);
  };
  process.on('SIGINT', () => shutdown('SIGINT'));
  process.on('SIGTERM', () => shutdown('SIGTERM'));
})().catch((err) => {
  // Without this, a failed Cluster.launch would be an unhandled rejection
  // and the process would linger with no working server.
  console.error('Failed to start server:', err);
  process.exit(1);
});
// Router: POST "/" is mounted under "/api" in app.js, so by the time
// initTask runs, the "/api" middleware has populated req.cluster.
Router.post("/", scraperController.initTask); // this goes through "/api" route
// Controller: pulls the shared cluster off the request (attached by the
// "/api" middleware), delegates the scrape to the service, and replies JSON.
initTask = async (req, res) => {
  const cluster = req.cluster;
  const url = req.body.url;

  // Reject early when the client didn't supply a URL to scrape.
  if (!url) {
    return res.status(400).json({ message: "Missing 'url' in request body" });
  }

  try {
    const result = await this.scraperService.initTask({ cluster, url }); // pass cluster to service
    res.json({ message: "Success", data: result });
  } catch (err) {
    // Without this catch, a failed scrape becomes an unhandled promise
    // rejection and the HTTP request hangs until the client times out.
    console.error("Scrape task failed:", err);
    res.status(500).json({ message: "Scrape failed" });
  }
};
// Service
async initTask({ cluster, url }) {
const data = {
url
}
const scrapedData = await cluster.execute(data, myPuppeteerTask); // .execute() provides a "page", which is a new tab to the callback (myPuppeteerTask)
return scrapedData;
}
// Implementation of the callback passed to cluster.execute() above.
// Receives a fresh tab ("page") and the payload ("data") for this job.
myPuppeteerTask = async ({ page, data }) => {
  const { url } = data;
  await page.goto(url);
  // Do puppeteer things
  return "Task Done"; // Or return what you want.
}; // BUG FIX: was `})` — a stray closing paren with no matching opener (syntax error).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment