Last active
March 5, 2020 12:05
-
-
Save mtrunkat/61ab1486254316138b4e1f813d5642ac to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Crawler page function: scrapes every post on the current listing page into
 * a `link -> score` map and merges it with the map accumulated on the
 * previously crawled pages.
 *
 * The score is `101 - rank`, so the post ranked 1 gets a score of 100.
 *
 * When the page has a "more" link, the next page is enqueued and output for
 * the current page is skipped, so that only the last page outputs the fully
 * merged result.
 *
 * NOTE(review): kept in ES5 syntax (`var`, function expressions) to match the
 * original; presumably this runs inside the crawler's own browser context --
 * confirm the runtime before modernizing.
 *
 * @param {Object} context - Crawler-provided context. Reads `context.jQuery`
 *     and `context.request.referrer`; calls `context.enqueuePage()` and
 *     `context.skipOutput()`.
 * @returns {Object} Map of story link to score, including the entries
 *     collected on previous pages.
 */
function pageFunction(context) {
    var RANK_CEILING = 101; // score = RANK_CEILING - rank, i.e. rank 1 -> 100

    var $ = context.jQuery;
    var posts = $('.athing').toArray(); // All posts as array of DOM elements
    var $moreLink = $('.morelink'); // Link to next page

    // If the crawler is scraping the 2nd, 3rd, ... page then
    // context.request.referrer.pageFunctionResult contains the result from
    // the previous pages. Fall back to an empty map when there is no referrer
    // or when the referrer carries no usable result object.
    var referrer = context.request.referrer;
    var prevResult = referrer ? referrer.pageFunctionResult : null;
    if (prevResult === null || typeof prevResult !== 'object') {
        prevResult = {};
    }

    // Merge into a fresh object so the referrer's stored result is not
    // mutated as a side effect of scraping this page.
    var merged = {};
    Object.keys(prevResult).forEach(function (key) {
        merged[key] = prevResult[key];
    });

    if ($moreLink.length) {
        // Enqueue next page to be crawled.
        context.enqueuePage({ url: $moreLink.prop('href') });

        // There is another page, so don't output the result of this page;
        // it gets merged into the next page's result instead.
        context.skipOutput();
    }

    // Scrape the data and merge them with previous results.
    return posts.reduce(function (result, el) {
        var $el = $(el);
        var link = $el.find('.storylink').attr('href');
        var rank = parseInt($el.find('.rank').text(), 10);

        // Skip rows without a story link or a numeric rank; otherwise they
        // would be recorded under the key "undefined" or with a NaN score.
        if (!link || isNaN(rank)) {
            return result;
        }

        result[link] = RANK_CEILING - rank;
        return result;
    }, merged);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment