scalable scraping with puppeteer-cluster.
// Code structure:
// - app.js
// - router.js
// - controller.js
// - service.js
// - implementation (the Puppeteer task callback)
// This is hosted on AWS Fargate. Below is a rough worst-case calculation of the resource usage.
// Fargate configuration:
// - 8 GB RAM per container
// - 25 (max) concurrent requests per container
// - ~300 MB RAM consumed per request (a rough worst-case figure from googling, since we use a combination of the cluster and new Chrome instances)
// Now,
// total_request_each_container_can_handle = (8 GB * 1000 MB/GB) / 300 MB
// total_request_each_container_can_handle = ~26 requests
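// The same worst-case arithmetic as a small helper. The numbers are the rough assumptions
// above (8 GB per container, ~300 MB per request), not measured values.
const estimateRequestsPerContainer = (ramGb, mbPerRequest) =>
  Math.floor((ramGb * 1000) / mbPerRequest);

// estimateRequestsPerContainer(8, 300) === 26, so capping each container at 25 concurrent
// requests leaves a small safety margin.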
// app.js
// Initialize the Puppeteer Cluster once on server start.
const express = require('express');
const { Cluster } = require('puppeteer-cluster');
const routes = require('./router'); // assumed path, matching the code structure above

const app = express();
const port = process.env.PORT || 3000; // assumed default port
app.use(express.json()); // parse JSON bodies so req.body.url is available in the controller

(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: 3, // adjustable
    puppeteerOptions: {
      headless: true,
    },
  });

  app.use('/api',
    (req, res, next) => {
      req.cluster = cluster; // req.cluster is available to every request going through the "/api" route
      next();
    },
    routes);

  // Start the Express server
  app.listen(port, () => {
    console.log(`Server is running on http://localhost:${port}`);
  });

  // Handle graceful shutdown
  process.on('SIGINT', async () => {
    await cluster.idle();
    await cluster.close();
    process.exit();
  });
})();
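// On ECS/Fargate a stopping task typically receives SIGTERM first (SIGKILL follows after the
// stop timeout), so the same cleanup is worth registering for SIGTERM as well. This handler
// belongs inside the IIFE above, next to the SIGINT handler, so that `cluster` is in scope.
process.on('SIGTERM', async () => {
  await cluster.idle();  // let in-flight tasks finish
  await cluster.close(); // shut down all browser pages
  process.exit(0);
});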
// router.js
Router.post("/", scraperController.initTask); // requests reach this handler through the "/api" prefix mounted in app.js
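// A minimal sketch of how router.js might be wired up around the line above. The require
// paths and the module.exports pattern are assumptions for illustration, not from the gist.
const express = require("express");
const scraperController = require("./controller");

const Router = express.Router();
Router.post("/", scraperController.initTask);

module.exports = Router;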
// controller.js
initTask = async (req, res) => {
  const cluster = req.cluster;
  const url = req.body.url;
  const result = await this.scraperService.initTask({ cluster, url }); // pass the cluster down to the service
  res.json({ message: "Success", data: result });
};
// service.js
async initTask({ cluster, url }) {
  const data = {
    url,
  };
  const scrapedData = await cluster.execute(data, myPuppeteerTask); // .execute() hands the callback (myPuppeteerTask) a "page", i.e. a fresh tab
  return scrapedData;
}
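// The controller and service methods above use class-field syntax and "this.scraperService",
// which implies a class-based setup. A minimal wiring sketch, assuming plain classes with
// CommonJS exports; the instantiation shown here is an assumption, not part of the gist.

// service.js
class ScraperService {
  async initTask({ cluster, url }) {
    /* as shown above */
  }
}
module.exports = ScraperService;

// controller.js
const ScraperService = require("./service");

class ScraperController {
  scraperService = new ScraperService();

  initTask = async (req, res) => {
    /* as shown above */
  };
}
module.exports = new ScraperController();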
// Implementation of the task callback used above; the cluster hands us a "page" (a new tab) to work with.
myPuppeteerTask = async ({ page, data }) => {
  const { url } = data;
  await page.goto(url);
  // Do puppeteer things
  return "Task Done"; // Or return whatever you need.
};
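// Example call against the endpoint above, assuming the server runs locally on port 3000
// (the port and the target URL are assumptions for illustration). Needs Node 18+ for global fetch.
fetch("http://localhost:3000/api", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ url: "https://example.com" }),
})
  .then((res) => res.json())
  .then(({ message, data }) => console.log(message, data)); // -> "Success" "Task Done"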