script for crawling a webpage and following links
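A minimal usage sketch, assuming the script below is saved as crawl.js (the filename is hypothetical) and its dependencies are installed from npm; https://example.com stands in for the real start URL:

npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth argparse
node crawl.js https://example.com --output output --restrict-domain --max-pages 50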
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const fs = require('fs').promises;
const path = require('path');
const { URL } = require('url');
const crypto = require('crypto');
const argparse = require('argparse');

puppeteer.use(StealthPlugin());
// Concurrency cap for the crawl; the page limit is supplied via --max-pages.
const MAX_CONCURRENT_SCRAPES = 10;
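// Tracks simple crawl statistics (success/failure counts, new links found,
// cumulative processing time) and prints a one-line summary.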
class Metrics {
  constructor() {
    this.urlsProcessed = 0;
    this.urlsFailed = 0;
    this.newUrlsFound = 0;
    this.processingTime = 0;
  }

  logMetrics() {
    const avgTime = this.processingTime / (this.urlsProcessed + this.urlsFailed) || 0;
    console.log(`Metrics - Processed: ${this.urlsProcessed}, Failed: ${this.urlsFailed}, ` +
      `New URLs: ${this.newUrlsFound}, Avg. Processing Time: ${avgTime.toFixed(2)}s`);
  }
}
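// Canonicalize a URL for deduplication: strip a leading "www." and drop the
// query string and fragment, keeping only protocol, host, and path.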
function normalizeUrl(url) {
  const parsedUrl = new URL(url);
  return `${parsedUrl.protocol}//${parsedUrl.hostname.replace(/^www\./, '')}${parsedUrl.pathname}`;
}
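// Collect every absolute http(s) link on the page, normalized for dedup.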
async function extractLinks(page, baseUrl) {
  const links = await page.evaluate(() => {
    const anchors = Array.from(document.querySelectorAll('a'));
    return anchors.map(a => a.href).filter(href => href.startsWith('http'));
  });
  return new Set(links.map(link => normalizeUrl(new URL(link, baseUrl).href)));
}
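// Pull the visible text out of the DOM, skipping script/style/noscript and
// inserting newlines after block-level elements so the output stays readable.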
async function extractAllText(page) {
  return page.evaluate(() => {
    // This function runs in the context of the browser
    function extractText(node) {
      if (node.nodeType === Node.TEXT_NODE) {
        return node.textContent;
      }
      if (node.nodeType !== Node.ELEMENT_NODE) {
        return '';
      }
      const tagName = node.tagName.toLowerCase();
      if (['script', 'style', 'noscript'].includes(tagName)) {
        return '';
      }
      let text = '';
      for (let child of node.childNodes) {
        text += extractText(child);
      }
      if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'].includes(tagName)) {
        text += '\n';
      } else if (tagName === 'br') {
        text += '\n';
      }
      return text;
    }
    // Collapse runs of spaces and tabs, but preserve the newlines that mark
    // block boundaries (a bare /\s+/g replace would flatten them away).
    return extractText(document.body).replace(/[ \t]+/g, ' ').replace(/\s*\n\s*/g, '\n').trim();
  });
}
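// Visit a single URL: scroll to the bottom to trigger lazy loading, dump the
// page text to an MD5-named file in outputDir, and return the outgoing links.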
async function scrapePage(page, url, outputDir, visited, metrics) {
  if (visited.has(url)) return new Set();
  console.log(`Crawling: ${url}`);
  visited.add(url);

  try {
    const startTime = Date.now();
    await page.goto(url, { waitUntil: 'networkidle0', timeout: 30000 });

    // Auto-scroll
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 100;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, 100);
      });
    });

    // Extract all text content
    const textContent = await extractAllText(page);
    const filename = `${crypto.createHash('md5').update(url).digest('hex')}.txt`;
    const filepath = path.join(outputDir, filename);
    await fs.writeFile(filepath, textContent);

    const newLinks = await extractLinks(page, url);
    const processingTime = (Date.now() - startTime) / 1000;
    metrics.urlsProcessed++;
    metrics.processingTime += processingTime;
    metrics.newUrlsFound += newLinks.size;
    console.log(`Successfully processed ${url} in ${processingTime.toFixed(2)}s`);
    return newLinks;
  } catch (error) {
    console.error(`Error processing ${url}: ${error.message}`);
    metrics.urlsFailed++;
    return new Set();
  }
}
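// Crawl loop: keeps up to MAX_CONCURRENT_SCRAPES pages in flight at once and
// stops once maxPages URLs have been visited or the frontier is exhausted.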
async function crawler(startUrl, outputDir, maxPages, restrictDomain) {
  const metrics = new Metrics();
  const visited = new Set();
  // Normalize the start URL so it deduplicates cleanly against extracted links.
  const toVisit = new Set([normalizeUrl(startUrl)]);
  const baseDomain = new URL(startUrl).hostname.replace(/^www\./, '');

  await fs.mkdir(outputDir, { recursive: true });

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  // Set of in-flight scrape promises, capped at MAX_CONCURRENT_SCRAPES.
  const inFlight = new Set();

  while ((toVisit.size > 0 || inFlight.size > 0) && visited.size < maxPages) {
    // Launch scrapes while there are queued URLs and free slots.
    while (toVisit.size > 0 &&
           inFlight.size < MAX_CONCURRENT_SCRAPES &&
           visited.size + inFlight.size < maxPages) {
      const url = toVisit.values().next().value;
      toVisit.delete(url);
      if (visited.has(url) ||
          (restrictDomain && new URL(url).hostname.replace(/^www\./, '') !== baseDomain)) {
        continue;
      }
      const task = (async () => {
        const page = await browser.newPage();
        try {
          const newLinks = await scrapePage(page, url, outputDir, visited, metrics);
          for (const link of newLinks) {
            if (!visited.has(link)) {
              toVisit.add(link);
            }
          }
        } finally {
          await page.close();
        }
      })().finally(() => inFlight.delete(task));
      inFlight.add(task);
    }
    // Wait for at least one scrape to settle before topping the pool back up.
    if (inFlight.size > 0) {
      await Promise.race(inFlight);
    }
  }

  // Let any remaining scrapes finish before shutting the browser down.
  await Promise.all(inFlight);
  await browser.close();
  metrics.logMetrics();
}
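// CLI entry point. Uses the argparse npm package, a port of Python's argparse,
// so flag names with dashes become underscored properties (args.max_pages).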
function main() {
  const parser = new argparse.ArgumentParser({ description: 'Web Crawler with Puppeteer and Stealth' });
  parser.add_argument('url', { help: 'The URL to start crawling from' });
  parser.add_argument('--output', { help: 'Output directory for crawled pages', default: 'output' });
  parser.add_argument('--restrict-domain', { action: 'store_true', help: 'Restrict crawling to the initial domain' });
  parser.add_argument('--max-pages', { type: 'int', default: 1000, help: 'Maximum number of pages to crawl' });
  const args = parser.parse_args();

  crawler(args.url, args.output, args.max_pages, args.restrict_domain)
    .then(() => console.log('Crawling complete'))
    .catch(error => console.error('An error occurred:', error));
}

if (require.main === module) {
  main();
}