@ColeMurray
Created November 18, 2024 02:12
script for crawling a webpage and following links
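For reference, a hypothetical invocation (the filename crawl.js and the URL are placeholders; the flags correspond to the argparse options defined in the script, and puppeteer-extra, puppeteer-extra-plugin-stealth, and argparse must be installed):

node crawl.js https://example.com --output output --restrict-domain --max-pages 50

Each crawled page's text is written into the output directory as <md5-of-url>.txt.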
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const fs = require('fs').promises;
const path = require('path');
const { URL } = require('url');
const crypto = require('crypto');
const argparse = require('argparse');
puppeteer.use(StealthPlugin());
const MAX_CONCURRENT_SCRAPES = 10;
const MAX_PAGES_PER_SITE = 100;
class Metrics {
  constructor() {
    this.urlsProcessed = 0;
    this.urlsFailed = 0;
    this.newUrlsFound = 0;
    this.processingTime = 0;
  }

  logMetrics() {
    const avgTime = this.processingTime / (this.urlsProcessed + this.urlsFailed) || 0;
    console.log(`Metrics - Processed: ${this.urlsProcessed}, Failed: ${this.urlsFailed}, ` +
      `New URLs: ${this.newUrlsFound}, Avg. Processing Time: ${avgTime.toFixed(2)}s`);
  }
}

// Normalize a URL for deduplication: strip a leading "www." and drop the
// query string and fragment, keeping only protocol, hostname, and path.
function normalizeUrl(url) {
  const parsedUrl = new URL(url);
  return `${parsedUrl.protocol}//${parsedUrl.hostname.replace(/^www\./, '')}${parsedUrl.pathname}`;
}

async function extractLinks(page, baseUrl) {
  const links = await page.evaluate(() => {
    const anchors = Array.from(document.querySelectorAll('a'));
    return anchors.map(a => a.href).filter(href => href.startsWith('http'));
  });
  return new Set(links.map(link => normalizeUrl(new URL(link, baseUrl).href)));
}

async function extractAllText(page) {
  return page.evaluate(() => {
    // This function runs in the context of the browser
    function extractText(node) {
      if (node.nodeType === Node.TEXT_NODE) {
        return node.textContent;
      }
      if (node.nodeType !== Node.ELEMENT_NODE) {
        return '';
      }
      const tagName = node.tagName.toLowerCase();
      if (['script', 'style', 'noscript'].includes(tagName)) {
        return '';
      }
      let text = '';
      for (const child of node.childNodes) {
        text += extractText(child);
      }
      // Preserve block boundaries as newlines
      if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br'].includes(tagName)) {
        text += '\n';
      }
      return text;
    }
    // Collapse runs of spaces and tabs, but keep the newlines added above
    // so block-level structure survives in the saved text file.
    return extractText(document.body)
      .replace(/[ \t]+/g, ' ')
      .replace(/\n\s*\n+/g, '\n')
      .trim();
  });
}

async function scrapePage(page, url, outputDir, visited, metrics) {
  if (visited.has(url)) return new Set();
  console.log(`Crawling: ${url}`);
  visited.add(url);
  try {
    const startTime = Date.now();
    await page.goto(url, { waitUntil: 'networkidle0', timeout: 30000 });
    // Auto-scroll to the bottom so lazily loaded content is rendered
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 100;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, 100);
      });
    });
    // Extract all text content and save it under an MD5 hash of the URL
    const textContent = await extractAllText(page);
    const filename = `${crypto.createHash('md5').update(url).digest('hex')}.txt`;
    const filepath = path.join(outputDir, filename);
    await fs.writeFile(filepath, textContent);
    const newLinks = await extractLinks(page, url);
    const processingTime = (Date.now() - startTime) / 1000;
    metrics.urlsProcessed++;
    metrics.processingTime += processingTime;
    metrics.newUrlsFound += newLinks.size;
    console.log(`Successfully processed ${url} in ${processingTime.toFixed(2)}s`);
    return newLinks;
  } catch (error) {
    console.error(`Error processing ${url}: ${error.message}`);
    metrics.urlsFailed++;
    return new Set();
  }
}

async function crawler(startUrl, outputDir, maxPages, restrictDomain) {
  const metrics = new Metrics();
  const visited = new Set();
  const toVisit = new Set([startUrl]);
  const baseDomain = new URL(startUrl).hostname.replace(/^www\./, '');
  await fs.mkdir(outputDir, { recursive: true });
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  while (toVisit.size > 0 && visited.size < maxPages) {
    // Pull up to MAX_CONCURRENT_SCRAPES URLs from the frontier for this batch,
    // skipping other domains when --restrict-domain is set.
    const batch = [];
    for (const url of toVisit) {
      if (batch.length >= MAX_CONCURRENT_SCRAPES ||
          visited.size + batch.length >= maxPages) break;
      toVisit.delete(url);
      if (restrictDomain && new URL(url).hostname.replace(/^www\./, '') !== baseDomain) {
        continue;
      }
      batch.push(url);
    }
    // Scrape the batch concurrently, each URL in its own page
    const results = await Promise.all(batch.map(async (url) => {
      const page = await browser.newPage();
      try {
        return await scrapePage(page, url, outputDir, visited, metrics);
      } finally {
        await page.close();
      }
    }));
    for (const newLinks of results) {
      for (const link of newLinks) {
        if (!visited.has(link)) {
          toVisit.add(link);
        }
      }
    }
  }
  await browser.close();
  metrics.logMetrics();
}

function main() {
  const parser = new argparse.ArgumentParser({ description: 'Web Crawler with Puppeteer and Stealth' });
  parser.add_argument('url', { help: 'The URL to start crawling from' });
  parser.add_argument('--output', { help: 'Output directory for crawled pages', default: 'output' });
  parser.add_argument('--restrict-domain', { action: 'store_true', help: 'Restrict crawling to the initial domain' });
  // argparse for Node (v2+) expects a callable for `type`, not the string 'int'
  parser.add_argument('--max-pages', { type: Number, default: 1000, help: 'Maximum number of pages to crawl' });
  const args = parser.parse_args();
  crawler(args.url, args.output, args.max_pages, args.restrict_domain)
    .then(() => console.log('Crawling complete'))
    .catch(error => console.error('An error occurred:', error));
}

if (require.main === module) {
  main();
}