Last active
December 4, 2018 05:19
-
-
Save franperezlopez/a5d2d6c9b99d501ea4195a36928f7c0c to your computer and use it in GitHub Desktop.
nightmare + vo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Nightmare provides the Electron-backed browser instance; vo runs the generator workflows below.
const Nightmare = require("nightmare");
const vo = require("vo");
/**
 * Loads `url` in a hidden Nightmare (Electron) instance, optionally scrapes one
 * data item and a list of URLs from the page, and hands the item to `validate`.
 *
 * @param url          Address passed to `nightmare.goto`.
 * @param dataScraper  Function evaluated inside the loaded page; its result becomes
 *                     the scraped item. Skipped when null.
 * @param urlScraper   Called with the Nightmare instance; whatever it returns is run
 *                     through `vo` and the result is kept as the URL list. Skipped when null.
 * @param injectJQuery When true, the local jQuery script is injected into the page
 *                     before any scraping happens.
 * @param validate     Called with the scraped item when both the item and the
 *                     callback are non-null.
 * @returns The `vo(...).catch(...).then(...)` chain. Errors raised by the workflow
 *          are logged, not rethrown. The final callback returns `{ item, urls }`
 *          (NOTE(review): this assumes vo's `.then` forwards the callback's return
 *          value the way a Promise does; confirm against the vo version in use).
 */
function scrape(url: string, dataScraper: Function | null, urlScraper: Function | null,
                injectJQuery: boolean = true, validate: Function | null = null)
{
    let item: unknown = null;
    let urls: unknown = null;
    // do not show window, do not load images (reduce time to ready())
    const nightmare = Nightmare({show: false, pollInterval: 800, webPreferences: {images: false}});

    function* workflow() {
        try {
            // load url in Electron instance
            yield nightmare.goto(url);
            if (injectJQuery) {
                // injects jQuery library into loaded page (inserts script tag inside DOM);
                // forward slash so the path resolves on POSIX systems as well as Windows
                yield nightmare.inject("js", "client/jquery.js");
            }
            if (dataScraper != null) {
                // evaluate javascript function into loaded page (client side)
                item = yield nightmare.evaluate(dataScraper);
            }
            if (urlScraper != null) {
                // executes url workflow and returns result
                urls = yield vo(urlScraper(nightmare));
            }
        } finally {
            // destroys Electron instance, also when one of the steps above failed,
            // so a failed scrape does not leave the browser process running
            yield nightmare.end();
        }
    }

    return vo(workflow)
        .catch((error: unknown) => { console.log(error); })
        .then(() => {
            // an item scraped before a later step failed is still validated
            if (item != null && validate != null) {
                validate(item);
            }
            return { item, urls };
        });
}
/**
 * Generator workflow that collects the `href` of every `.listing a` element,
 * clicking `.pagination a.next` after each page until that link is absent.
 *
 * Every `yield` hands a Nightmare action to whatever is driving the generator
 * and resumes with the value sent back for that action.
 *
 * @param nightmare Object exposing Nightmare's `evaluate` and `click` actions.
 *                  The evaluated functions use `$` / `jQuery`, so jQuery must be
 *                  available inside the loaded page.
 * @returns The URLs collected from all visited pages, in visiting order.
 */
function* urlScraper(nightmare: {
    evaluate(fn: () => unknown): unknown;
    click(selector: string): unknown;
}): Generator<unknown, string[], unknown> {
    let urls: string[] = [];
    let hasNextPage = true;
    while (hasNextPage) {
        // evaluates javascript function into loaded page (client side)
        // tip: do not return jQuery objects to the node.js side
        const pageUrls: unknown = yield nightmare.evaluate(function () {
            return jQuery.map($(".listing a"), (anchor: { href: string }) => anchor.href);
        });
        // only merge real arrays, so a missing result cannot add a bogus entry to the list
        if (Array.isArray(pageUrls)) {
            urls = urls.concat(pageUrls);
        }
        // evaluates javascript function into loaded page (client side)
        const nextLinkFound: unknown = yield nightmare.evaluate(function () {
            return $(".pagination a.next").length > 0;
        });
        hasNextPage = Boolean(nextLinkFound);
        if (hasNextPage) {
            // triggers click event into loaded page to load next page
            yield nightmare.click(".pagination a.next");
        }
    }
    return urls;
}
// Example run: scrape the text of the `.email` element(s) and the listing URLs.
// The data scraper body runs inside the page, where `$` is expected to be jQuery
// (the fourth argument asks scrape to inject it). The selector must be a quoted
// string, otherwise the Function constructor throws a SyntaxError.
scrape("http://anywebyouwanttoscrape.com", Function("return $('.email').text();"), urlScraper, true, null);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment