Created
July 14, 2017 22:31
-
-
Save manviny/8c73b539facf6771ef702c429829d75b to your computer and use it in GitHub Desktop.
SEO Site Crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * @author Eric Tucker <[email protected]>
 * Written for Node.js.
 * Requires PhantomJS and Nightmare.
 *
 * Run from the command line as:
 *   node seo-scraper.js www.YOUR-WEBSITE.com
 *
 */
var Nightmare = require('nightmare');
var nightmare = new Nightmare();
var fs = require('fs');

// The start page comes from the first command line argument.
var siteArg = process.argv[2];
if (!siteArg) {
  // Without this guard a missing argument surfaces as an opaque TypeError.
  console.error('Usage: node seo-scraper.js www.YOUR-WEBSITE.com');
  process.exit(1);
}

// Only prepend a scheme when the argument has none; checking for "http://"
// alone would turn "https://example.com" into "http://https://example.com".
// Trailing slashes are dropped because the crawler strips this root from
// absolute links and later re-joins it with the remaining path.
var siteRoot = (/^https?:\/\//i.test(siteArg) ? siteArg : 'http://' + siteArg).replace(/\/+$/, '');
console.log(siteRoot);
// pages.csv: one row per crawled page. The stream is opened in append mode
// and the header row is written as soon as the stream is created.
var pageStream = fs.createWriteStream('pages.csv', {
  flags: 'a',
  encoding: 'utf8'
});
pageStream.write('"Page","Title","Keywords","Description","H1","H2","H3","H4","H5"\n');

// anchors.csv: one row per anchor tag found on any crawled page.
var anchorStream = fs.createWriteStream('anchors.csv', {
  flags: 'a',
  encoding: 'utf8'
});
anchorStream.write('"Page","Link","Inner Text/HTML"\n');

// Crawl bookkeeping: paths already visited and paths still queued.
var pagesCrawled = [];
var pagesNotCrawled = [];
/**
 * Collects SEO-relevant data from the currently loaded document.
 *
 * This function is passed to Nightmare's `evaluate` and reads the global
 * `document`, so it must stay self-contained: every helper lives inside the
 * function body. The file header names PhantomJS as the runtime, so the
 * syntax is kept to ES5 (presumably required there - confirm before
 * modernising).
 *
 * All text values are returned with embedded double quotes doubled so the
 * caller can place them directly inside quoted CSV fields.
 *
 * @param {string} site - Site root; removed from `document.URL` to get the page path.
 * @returns {{url: string, title: string, keywords: string, description: string,
 *            anchors: Array<{url: string, inner: string}>,
 *            hTags: {h1: string[], h2: string[], h3: string[], h4: string[], h5: string[]}}}
 *          Missing meta tags and missing heading levels are reported as '**NONE**'.
 */
function crawlPage(site) {
  var NONE_MARKER = '**NONE**';
  var HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5'];

  // Doubles embedded quotes so the text is safe inside a quoted CSV field.
  function escapeQuotes(text) {
    return String(text).replace(/"/gm, '""');
  }

  // Returns the content of the first element named `name` that carries a
  // `content` attribute, or the marker when the page has no such element.
  // Indexing [0] unconditionally throws on pages without the meta tag.
  function metaContent(name) {
    var nodes = document.getElementsByName(name);
    for (var n = 0; n < nodes.length; n++) {
      var content = nodes[n].getAttribute('content');
      if (content !== null && content !== undefined) {
        return escapeQuotes(content);
      }
    }
    return NONE_MARKER;
  }

  var anchorTags = document.getElementsByTagName('a');
  var anchors = [];
  for (var i = 0; i < anchorTags.length; i++) {
    var innerText = anchorTags[i].innerText.replace(/\r?\n|\r/g, '').trim();
    var row = { url: anchorTags[i].href.trim() };
    // Anchors without visible text (e.g. image links) fall back to their
    // markup, flattened onto a single line.
    row.inner = innerText != ''
      ? escapeQuotes(innerText)
      : escapeQuotes(anchorTags[i].innerHTML.trim().replace(/\s\s|\t|\r?\n|\r/gm, ' '));
    anchors.push(row);
  }

  var hTags = {};
  for (var h = 0; h < HEADING_TAGS.length; h++) {
    var tag = HEADING_TAGS[h];
    var tagCollection = document.getElementsByTagName(tag);
    hTags[tag] = [];
    if (tagCollection.length != 0) {
      for (var t = 0; t < tagCollection.length; t++) {
        hTags[tag].push(escapeQuotes(tagCollection[t].innerHTML.trim().replace(/\s\s/gm, '')));
      }
    } else {
      hTags[tag].push(NONE_MARKER);
    }
  }

  return {
    url: document.URL.replace(site, ''),
    title: escapeQuotes(document.title),
    keywords: metaContent('keywords'),
    description: metaContent('description'),
    anchors: anchors,
    hTags: hTags
  };
}
/**
 * Returns a new array holding the first occurrence of every entry, skipping
 * blank entries and entries that contain '.pdf' or '#'.
 * The receiver itself is not modified.
 *
 * @returns {Array} De-duplicated, filtered copy in original order.
 */
Array.prototype.getUnique = function() {
  var kept = [];
  for (var idx = 0; idx < this.length; idx += 1) {
    var candidate = this[idx];
    // Already collected.
    if (kept.indexOf(candidate) !== -1) {
      continue;
    }
    // Blank entry.
    if (candidate == '') {
      continue;
    }
    // File download.
    if (candidate.indexOf('.pdf') !== -1) {
      continue;
    }
    // In-page fragment link.
    if (candidate.indexOf('#') !== -1) {
      continue;
    }
    kept.push(candidate);
  }
  return kept;
};
/**
 * Removes, in place, the first occurrence of each entry of `arrayToRemove`
 * from this array. Entries that are not present are ignored.
 *
 * @param {Array} arrayToRemove - Values to take out of this array.
 * @returns {Array} This same (mutated) array, to allow chaining.
 */
Array.prototype.removeArray = function(arrayToRemove) {
  var target = this;
  var cursor = 0;
  while (cursor < arrayToRemove.length) {
    var position = target.indexOf(arrayToRemove[cursor]);
    if (position !== -1) {
      target.splice(position, 1);
    }
    cursor += 1;
  }
  return target;
};
/**
 * Scrapes the page currently loaded in this Nightmare instance, appends the
 * results to pages.csv / anchors.csv, queues newly found same-site links, and
 * then navigates to the next queued page and repeats until the queue is empty.
 *
 * Reads and updates the file-level `pagesCrawled` / `pagesNotCrawled` arrays
 * and writes to the file-level `pageStream` / `anchorStream`.
 *
 * @returns {*} Whatever the `evaluate(...).run(...)` chain returns.
 */
Nightmare.prototype.seoScrape = function() {
  return this.evaluate(crawlPage, function(value) {
    // Nothing to record if the in-page scrape produced no result.
    if (!value) {
      console.log('No data returned for the current page; skipping it.');
      return;
    }

    var colDelim = '","';
    console.log('crawling ' + value.url);
    if (pagesCrawled.indexOf(value.url) === -1) {
      pagesCrawled.push(value.url);
    }

    for (var i = 0; i < value.anchors.length; i++) {
      var anchorUrl = value.anchors[i].url;

      // Only links that START with the site root belong to our start domain.
      // A plain "contains" test also matches external links that embed our
      // URL, e.g. share links carrying it in a query string.
      if (anchorUrl.indexOf(siteRoot) === 0) {
        // Drop a trailing slash for queueing only; the copy written to the
        // CSV below is left exactly as found on the page.
        var crawlUrl = anchorUrl;
        if (crawlUrl.length > 1 && crawlUrl.charAt(crawlUrl.length - 1) === '/') {
          crawlUrl = crawlUrl.substring(0, crawlUrl.length - 1);
        }
        pagesNotCrawled.push(crawlUrl.replace(siteRoot, ''));
      }

      // Add the anchor tag info to our csv
      anchorStream.write('"' + value.url + colDelim + anchorUrl + colDelim + value.anchors[i].inner + '"\n');
    }

    // The row ends with a closing quote and a newline. Ending it with
    // quote-newline-quote would put a stray quote at the start of the next row.
    pageStream.write(
      '"' + value.url + colDelim
      + value.title + colDelim
      + value.keywords + colDelim
      + value.description + colDelim
      + value.hTags.h1.join(' | ') + colDelim
      + value.hTags.h2.join(' | ') + colDelim
      + value.hTags.h3.join(' | ') + colDelim
      + value.hTags.h4.join(' | ') + colDelim
      + value.hTags.h5.join(' | ') + '"\n'
    );

    // De-duplicate the queue, then drop everything that was already visited.
    pagesNotCrawled = pagesNotCrawled.getUnique();
    pagesNotCrawled = pagesNotCrawled.removeArray(pagesCrawled);
  }, siteRoot).run(function(err, nightmare) {
    if (err) {
      console.log('Error while crawling: ' + err);
    }
    if (pagesNotCrawled.length != 0 && nightmare) {
      // Take the page off the queue right away. Otherwise a page whose scrape
      // never updates the queue would be picked again on every pass.
      var nextPage = pagesNotCrawled.shift();
      console.log('Navigating to ' + nextPage);
      pagesCrawled.push(nextPage);
      nightmare.goto(siteRoot + nextPage).seoScrape();
    } else {
      console.log('Your Site Has Been Crawled!');
    }
  });
};
// Kick off the crawl: load the start page, then scrape it. seoScrape keeps
// navigating to queued pages on its own from there.
nightmare
  .goto(siteRoot)
  .seoScrape();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment