@ColeMurray
Created November 18, 2024 02:12
script for crawling a webpage and following links
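For reference, a hypothetical invocation (the filename crawl.js and the URL are placeholders; the flags correspond to the argparse options defined in the script, and puppeteer-extra, puppeteer-extra-plugin-stealth, and argparse must be installed):

node crawl.js https://example.com --output output --restrict-domain --max-pages 50

Each crawled page's text is written into the output directory as <md5-of-url>.txt.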
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const fs = require('fs').promises;
const path = require('path');
const { URL } = require('url');
const crypto = require('crypto');
const argparse = require('argparse');
puppeteer.use(StealthPlugin());
const MAX_CONCURRENT_SCRAPES = 10;
const MAX_PAGES_PER_SITE = 100;
class Metrics {
  constructor() {
    this.urlsProcessed = 0;
    this.urlsFailed = 0;
    this.newUrlsFound = 0;
    this.processingTime = 0;
  }

  logMetrics() {
    const avgTime = this.processingTime / (this.urlsProcessed + this.urlsFailed) || 0;
    console.log(`Metrics - Processed: ${this.urlsProcessed}, Failed: ${this.urlsFailed}, ` +
      `New URLs: ${this.newUrlsFound}, Avg. Processing Time: ${avgTime.toFixed(2)}s`);
  }
}

// Normalize a URL for deduplication: strip a leading "www." and drop the
// query string and fragment, keeping only protocol, hostname, and path.
function normalizeUrl(url) {
  const parsedUrl = new URL(url);
  return `${parsedUrl.protocol}//${parsedUrl.hostname.replace(/^www\./, '')}${parsedUrl.pathname}`;
}

async function extractLinks(page, baseUrl) {
  const links = await page.evaluate(() => {
    const anchors = Array.from(document.querySelectorAll('a'));
    return anchors.map(a => a.href).filter(href => href.startsWith('http'));
  });
  return new Set(links.map(link => normalizeUrl(new URL(link, baseUrl).href)));
}

async function extractAllText(page) {
  return page.evaluate(() => {
    // This function runs in the context of the browser
    function extractText(node) {
      if (node.nodeType === Node.TEXT_NODE) {
        return node.textContent;
      }
      if (node.nodeType !== Node.ELEMENT_NODE) {
        return '';
      }
      const tagName = node.tagName.toLowerCase();
      if (['script', 'style', 'noscript'].includes(tagName)) {
        return '';
      }
      let text = '';
      for (const child of node.childNodes) {
        text += extractText(child);
      }
      // Preserve block boundaries as newlines
      if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br'].includes(tagName)) {
        text += '\n';
      }
      return text;
    }
    // Collapse runs of spaces and tabs, but keep the newlines added above
    // so block-level structure survives in the saved text file.
    return extractText(document.body)
      .replace(/[ \t]+/g, ' ')
      .replace(/\n\s*\n+/g, '\n')
      .trim();
  });
}

async function scrapePage(page, url, outputDir, visited, metrics) {
  if (visited.has(url)) return new Set();
  console.log(`Crawling: ${url}`);
  visited.add(url);
  try {
    const startTime = Date.now();
    await page.goto(url, { waitUntil: 'networkidle0', timeout: 30000 });
    // Auto-scroll to the bottom so lazily loaded content is rendered
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 100;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, 100);
      });
    });
    // Extract all text content and save it under an MD5 hash of the URL
    const textContent = await extractAllText(page);
    const filename = `${crypto.createHash('md5').update(url).digest('hex')}.txt`;
    const filepath = path.join(outputDir, filename);
    await fs.writeFile(filepath, textContent);
    const newLinks = await extractLinks(page, url);
    const processingTime = (Date.now() - startTime) / 1000;
    metrics.urlsProcessed++;
    metrics.processingTime += processingTime;
    metrics.newUrlsFound += newLinks.size;
    console.log(`Successfully processed ${url} in ${processingTime.toFixed(2)}s`);
    return newLinks;
  } catch (error) {
    console.error(`Error processing ${url}: ${error.message}`);
    metrics.urlsFailed++;
    return new Set();
  }
}

async function crawler(startUrl, outputDir, maxPages, restrictDomain) {
  const metrics = new Metrics();
  const visited = new Set();
  const toVisit = new Set([startUrl]);
  const baseDomain = new URL(startUrl).hostname.replace(/^www\./, '');
  await fs.mkdir(outputDir, { recursive: true });
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  while (toVisit.size > 0 && visited.size < maxPages) {
    // Pull up to MAX_CONCURRENT_SCRAPES URLs from the frontier for this batch,
    // skipping other domains when --restrict-domain is set.
    const batch = [];
    for (const url of toVisit) {
      if (batch.length >= MAX_CONCURRENT_SCRAPES ||
          visited.size + batch.length >= maxPages) break;
      toVisit.delete(url);
      if (restrictDomain && new URL(url).hostname.replace(/^www\./, '') !== baseDomain) {
        continue;
      }
      batch.push(url);
    }
    // Scrape the batch concurrently, each URL in its own page
    const results = await Promise.all(batch.map(async (url) => {
      const page = await browser.newPage();
      try {
        return await scrapePage(page, url, outputDir, visited, metrics);
      } finally {
        await page.close();
      }
    }));
    for (const newLinks of results) {
      for (const link of newLinks) {
        if (!visited.has(link)) {
          toVisit.add(link);
        }
      }
    }
  }
  await browser.close();
  metrics.logMetrics();
}

function main() {
  const parser = new argparse.ArgumentParser({ description: 'Web Crawler with Puppeteer and Stealth' });
  parser.add_argument('url', { help: 'The URL to start crawling from' });
  parser.add_argument('--output', { help: 'Output directory for crawled pages', default: 'output' });
  parser.add_argument('--restrict-domain', { action: 'store_true', help: 'Restrict crawling to the initial domain' });
  // argparse for Node (v2+) expects a callable for `type`, not the string 'int'
  parser.add_argument('--max-pages', { type: Number, default: 1000, help: 'Maximum number of pages to crawl' });
  const args = parser.parse_args();
  crawler(args.url, args.output, args.max_pages, args.restrict_domain)
    .then(() => console.log('Crawling complete'))
    .catch(error => console.error('An error occurred:', error));
}

if (require.main === module) {
  main();
}