|
|
@@ -0,0 +1,105 @@ |
|
|
var Browser = require('zombie'), |
|
|
url = require('url'), |
|
|
fs = require('fs'), |
|
|
$q = require('Q'), |
|
|
saveDir = __dirname + '/_snapshots'; |
|
|
|
|
|
|
|
|
var scriptTagRegex = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi; |
|
|
|
|
|
var stripScriptTags = function(html) { |
|
|
return html.replace(scriptTagRegex, ''); |
|
|
} |
|
|
|
|
|
var mkdirParent = function(dirPath, mode, callback) { |
|
|
//Call the standard fs.mkdir |
|
|
fs.mkdir(dirPath, mode, function(error) { |
|
|
//When it fail in this way, do the custom steps |
|
|
if (error && error.errno === 34) { |
|
|
//Create all the parents recursively |
|
|
fs.mkdirParent(path.dirname(dirPath), mode, callback); |
|
|
//And then the directory |
|
|
fs.mkdirParent(dirPath, mode, callback); |
|
|
} |
|
|
//Manually run the callback since we used our own callback to do all these |
|
|
callback && callback(error); |
|
|
}); |
|
|
}; |
|
|
|
|
|
|
|
|
var saveSnapshot = function(uri, body) { |
|
|
|
|
|
var lastIdx = uri.lastIndexOf('#!/'); |
|
|
|
|
|
if (lastIdx < 0) { |
|
|
// If we're using html5mode |
|
|
path = url.parse(uri).pathname; |
|
|
} else { |
|
|
// If we're using hashbang mode |
|
|
path = |
|
|
uri.substring(lastIdx + 2, uri.length); |
|
|
} |
|
|
|
|
|
if (path === '/') path = "/index.html"; |
|
|
|
|
|
if (path.indexOf('.html') == -1) |
|
|
path += ".html"; |
|
|
|
|
|
var filename = saveDir + path; |
|
|
console.log("Saving ", uri, " to ", filename); |
|
|
var dirname = require("path").dirname(filename); |
|
|
mkdirParent(dirname); |
|
|
fs.open(filename, 'w', function(e, fd) { |
|
|
if (e) return; |
|
|
fs.write(fd, body); |
|
|
}); |
|
|
|
|
|
|
|
|
}; |
|
|
|
|
|
var browserOpts = { |
|
|
waitFor: "100ms", |
|
|
loadCSS: false, |
|
|
waitDuration: "100ms" |
|
|
} |
|
|
|
|
|
var browser = new Browser(browserOpts); |
|
|
|
|
|
var crawlPage = function(idx, arr) { |
|
|
// location = window.location |
|
|
if (idx < arr.length) { |
|
|
var uri = arr[idx]; |
|
|
console.time("voy"); |
|
|
var promise = browser.visit(uri) |
|
|
.then(function() { |
|
|
console.timeEnd("voy"); |
|
|
var intervalId = setInterval(function() { |
|
|
console.log("checking status") |
|
|
var status = browser.body.getAttribute('data-status'); |
|
|
console.log(status); |
|
|
if (status === "ready") { |
|
|
clearInterval(intervalId); |
|
|
// Turn links into absolute links |
|
|
// and save them, if we need to |
|
|
// and we haven't already crawled them |
|
|
var links = browser.queryAll('a'); |
|
|
links.forEach(function(link) { |
|
|
var href = link.getAttribute('href'); |
|
|
var absUrl = url.resolve(uri, href); |
|
|
link.setAttribute('href', absUrl); |
|
|
if (arr.indexOf(absUrl) < 0) { |
|
|
arr.push(absUrl); |
|
|
} |
|
|
}); |
|
|
|
|
|
// Save |
|
|
saveSnapshot(uri, browser.html()); |
|
|
// Call again on the next iteration |
|
|
crawlPage(idx+1, arr); |
|
|
} |
|
|
}, 500); |
|
|
var d = $q.defer(); |
|
|
}); |
|
|
} |
|
|
} |
|
|
crawlPage(0, ["http://localhost:4000/#!/"]); |