Skip to content

Instantly share code, notes, and snippets.

@wlepinski
Forked from joseraya/snapshot-crawler.js
Created February 3, 2014 12:45

Revisions

  1. @joseraya joseraya created this gist Jan 21, 2014.
    105 changes: 105 additions & 0 deletions snapshot-crawler.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,105 @@
    var Browser = require('zombie'),
    url = require('url'),
    fs = require('fs'),
    // Bound as `nodePath` rather than `path`: other code in this file
    // assigns a string to a variable called `path`.
    nodePath = require('path'),
    $q = require('Q'),
    saveDir = __dirname + '/_snapshots';


    // Matches a whole <script ...>...</script> element, case-insensitively,
    // everywhere it occurs in the input (flags "gi").
    var scriptTagRegex = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;

    /**
     * Return a copy of `html` with every <script> element removed.
     *
     * @param {string} html - Markup to clean.
     * @returns {string} The markup with all matches of `scriptTagRegex`
     *     replaced by the empty string.
     */
    var stripScriptTags = function(html) {
        var withoutScripts = html.replace(scriptTagRegex, '');
        return withoutScripts;
    };

    /**
     * Create the directory `dirPath`, first creating any parent directories
     * that do not exist yet (the behaviour of `mkdir -p`).
     *
     * @param {string} dirPath - Directory to create.
     * @param {number|string} [mode] - Passed unchanged to `fs.mkdir` for every
     *     directory created. May be omitted, in which case the second
     *     argument may be the callback.
     * @param {function(?Error)} [callback] - Invoked exactly once. Receives
     *     `null` when the directory exists on return (newly created or
     *     already present), otherwise the error that stopped creation.
     */
    var mkdirParent = function(dirPath, mode, callback) {
        // Support the two-argument form mkdirParent(dirPath, callback).
        if (typeof mode === 'function') {
            callback = mode;
            mode = undefined;
        }
        // Callers may omit the callback entirely; fs.mkdir still needs one.
        var done = typeof callback === 'function' ? callback : function() {};

        fs.mkdir(dirPath, mode, function(error) {
            if (!error) {
                return done(null);
            }

            if (error.code === 'EEXIST') {
                // Something already lives at dirPath. That is only a success
                // when it is a directory; a regular file is a real failure.
                return fs.stat(dirPath, function(statError, stats) {
                    done(!statError && stats.isDirectory() ? null : error);
                });
            }

            var parentDir = nodePath.dirname(dirPath);
            // Only a missing parent (ENOENT) can be fixed by recursing. Stop at
            // the filesystem root, where dirname() returns its own argument.
            if (error.code !== 'ENOENT' || parentDir === dirPath) {
                return done(error);
            }

            // Create the parent chain first and retry only once it exists.
            mkdirParent(parentDir, mode, function(parentError) {
                if (parentError) {
                    return done(parentError);
                }
                fs.mkdir(dirPath, mode, function(retryError) {
                    // EEXIST here means someone else created it in between.
                    if (retryError && retryError.code !== 'EEXIST') {
                        return done(retryError);
                    }
                    done(null);
                });
            });
        });
    };


    /**
     * Write the rendered HTML of a crawled page to a file under `saveDir`.
     *
     * The file location is derived from the URI:
     *   - if the URI contains "#!/", everything after the "#!" is used;
     *   - otherwise the URL's pathname is used;
     *   - "/" becomes "/index.html";
     *   - ".html" is appended when the path does not already contain it.
     *
     * The target directory is created first and the file is written only
     * after that has finished. Failures are logged, not thrown.
     *
     * @param {string} uri - Address of the page that was rendered.
     * @param {string} body - HTML to store.
     */
    var saveSnapshot = function(uri, body) {
        // Declared locally on purpose: assigning an undeclared `path` here
        // would create (or clobber) a global of that name.
        var relPath;
        var hashbangIdx = uri.lastIndexOf('#!/');

        if (hashbangIdx < 0) {
            // If we're using html5mode. pathname can be null (e.g. for a
            // "mailto:" URI), so fall back to the site root.
            relPath = url.parse(uri).pathname || '/';
        } else {
            // If we're using hashbang mode: skip "#!" but keep the slash.
            relPath = uri.substring(hashbangIdx + 2);
        }

        if (relPath === '/') {
            relPath = "/index.html";
        }
        if (relPath.indexOf('.html') === -1) {
            relPath += ".html";
        }

        var filename = saveDir + relPath;

        // The path comes from crawled links; refuse anything whose ".."
        // segments would land outside the snapshot directory.
        var snapshotRoot = nodePath.resolve(saveDir) + nodePath.sep;
        if (nodePath.resolve(filename).indexOf(snapshotRoot) !== 0) {
            console.error("Refusing to save ", uri, " outside of ", saveDir);
            return;
        }

        console.log("Saving ", uri, " to ", filename);
        var dirname = nodePath.dirname(filename);

        mkdirParent(dirname, undefined, function(mkdirError) {
            // An already existing directory is fine; anything else is fatal
            // for this snapshot.
            if (mkdirError && mkdirError.code !== 'EEXIST') {
                console.error("Could not create ", dirname, ": ", mkdirError.message);
                return;
            }
            // writeFile opens, writes and closes the file, so no file
            // descriptor is left open.
            fs.writeFile(filename, body, function(writeError) {
                if (writeError) {
                    console.error("Could not write ", filename, ": ", writeError.message);
                }
            });
        });
    };

    // Options handed to the Zombie `Browser` constructor below.
    // NOTE(review): the names suggest wait timings and a switch for CSS
    // loading; confirm their exact meaning against the Zombie version in use.
    var browserOpts = {};
    browserOpts.waitFor = "100ms";
    browserOpts.loadCSS = false;
    browserOpts.waitDuration = "100ms";

    // Single browser instance reused for every page that gets crawled.
    var browser = new Browser(browserOpts);

    /**
     * Crawl the page at `arr[idx]`, save a snapshot of it, then move on to
     * `arr[idx + 1]` until the queue is exhausted.
     *
     * After the page has loaded, its <body> is polled every 500 ms until its
     * `data-status` attribute equals "ready". Then every <a> on the page is
     * rewritten to an absolute URL, links on the same host that are not yet in
     * `arr` are appended to it, and the page HTML is passed to `saveSnapshot`.
     *
     * A page that fails to load, or that never becomes ready, is logged and
     * skipped so that the rest of the queue is still processed.
     *
     * @param {number} idx - Index in `arr` of the page to crawl.
     * @param {string[]} arr - Crawl queue; mutated in place as new links are
     *     discovered.
     */
    var crawlPage = function(idx, arr) {
        if (idx >= arr.length) {
            return;
        }

        var POLL_INTERVAL_MS = 500;
        // Give up on a page after 120 polls (60 s) instead of polling forever.
        var MAX_POLLS = 120;

        var uri = arr[idx];
        // Every queued URL was accepted because it shares this host, so the
        // current page's host is also the host of the site being crawled.
        var siteHost = url.parse(uri).host;

        var crawlNext = function() {
            crawlPage(idx + 1, arr);
        };

        console.time("voy");
        browser.visit(uri).then(function() {
            console.timeEnd("voy");
            var polls = 0;
            var intervalId = setInterval(function() {
                console.log("checking status");
                var body = browser.body;
                var status = body ? body.getAttribute('data-status') : null;
                console.log(status);

                if (status !== "ready") {
                    polls += 1;
                    if (polls >= MAX_POLLS) {
                        clearInterval(intervalId);
                        console.error("Gave up waiting for ", uri, " to become ready");
                        crawlNext();
                    }
                    return;
                }

                clearInterval(intervalId);

                // Turn links into absolute links and queue the ones we still
                // need to crawl.
                browser.queryAll('a').forEach(function(link) {
                    var href = link.getAttribute('href');
                    // Anchors without an href cannot be resolved.
                    if (typeof href !== 'string') {
                        return;
                    }
                    var absUrl = url.resolve(uri, href);
                    link.setAttribute('href', absUrl);
                    // Stay on the crawled site: external pages and
                    // "mailto:"-style links are rewritten but never queued.
                    if (url.parse(absUrl).host === siteHost && arr.indexOf(absUrl) < 0) {
                        arr.push(absUrl);
                    }
                });

                // Save
                saveSnapshot(uri, browser.html());
                // Call again on the next iteration
                crawlNext();
            }, POLL_INTERVAL_MS);
        }, function(visitError) {
            console.timeEnd("voy");
            console.error("Failed to load ", uri, ": ", visitError && visitError.message);
            crawlNext();
        });
    };
    // Start the crawl at the first entry of a queue holding only the seed URL.
    var seedUrls = ["http://localhost:4000/#!/"];
    crawlPage(0, seedUrls);