Crawl all internal URLs of a website with the Crawler npm package
/*
Author: Evyatar Meged
Collaborator: syedshabbir
Source: https://stackoverflow.com/questions/50154133/how-to-crawl-all-the-internal-urls-of-a-website-using-crawler
*/
const Crawler = require('crawler');

let obselete = []; // URLs that have already been queued, so they are not crawled twice
let c = new Crawler();

function crawlAllUrls(url) {
    console.log(`Crawling ${url}`);
    c.queue({
        uri: url,
        callback: function (err, res, done) {
            if (err) {
                console.error(err);
                return done();
            }
            let $ = res.$; // server-side jQuery (cheerio) loaded with the response body
            try {
                let urls = $('a');
                Object.keys(urls).forEach((item) => {
                    if (urls[item].type === 'tag') {
                        let href = urls[item].attribs.href;
                        if (href && !obselete.includes(href)) {
                            href = href.trim();
                            obselete.push(href);
                            // Slow down the crawl: wait 5 seconds before queueing the next URL
                            setTimeout(function () {
                                // Absolute links are crawled as-is; relative links are resolved against the start URL
                                href.startsWith(url) ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`);
                            }, 5000);
                        }
                    }
                });
            } catch (e) {
                console.error(`Encountered an error crawling ${url}. Aborting crawl.`);
            }
            done();
        }
    });
}

crawlAllUrls('https://github.com/evyatarmeged/');
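
Note: the setTimeout above is a crude throttle. If you would rather let the crawler package pace requests itself, node-crawler accepts rateLimit and maxConnections options in its constructor. A minimal sketch, assuming a 5-second gap between requests (with this in place, the setTimeout wrapper could be dropped):

const Crawler = require('crawler');

// Sketch only: throttle requests through the package instead of setTimeout.
// rateLimit is the minimum delay (ms) between requests; maxConnections: 1
// runs them one at a time so the delay is actually respected.
const throttled = new Crawler({
    maxConnections: 1,
    rateLimit: 5000
});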