Created
December 4, 2016 14:43
-
-
Save rubenvereecken/c66020024f0b658c86ee04ad29e34d8c to your computer and use it in GitHub Desktop.
Scrape www.gratisrijbewijsonline.be for pretty PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// For use with phantomjs
//
// Assuming links.txt is a list of URLs from http://www.gratisrijbewijsonline.be
// Scrape each URL using
//
// for link in $(<links.txt)
// do
//   phantomjs extract.js $link
// done
// Loads a single lesson page, strips everything that is not lesson content,
// keeps each heading together with its table on one printed page, restyles
// the page for print and renders it to '<page title>.pdf'.
//
// NOTE: PhantomJS runs an ES5-only engine, hence `var` and function
// expressions. Functions passed to page.evaluate() are serialized and run
// inside the web page, so they cannot see variables from this script and
// assume the page itself exposes jQuery as `$`.
var page = require('webpage').create();
var system = require('system');

// system.args[0] is the script name; the URL to scrape is the first argument.
var address = system.args[1];

// The property is spelled `viewportSize`; the previous `viewPortSize` was
// silently ignored, so the intended viewport was never applied.
page.viewportSize = { width: 825, height: 600 };
page.paperSize = {
  format: 'A4',
  margin: { top: '15px', bottom: '15px' }
};

if (!address) {
  console.log('Usage: phantomjs ' + system.args[0] + ' <url>');
  phantom.exit(1);
} else {
  page.open(address, function (status) {
    if (status !== 'success') {
      console.log('Unable to load the address!');
      phantom.exit(1);
      return;
    }

    // Cleanup duty: drop navigation, embeds, promo boxes and the exercises.
    page.evaluate(function () {
      $('header').remove();
      $('p').filter(function (i, el) {
        return el.textContent.indexOf('Bekijk eerst') === 0;
      }).remove();
      $('iframe').remove();
      // .first() instead of [0] so a page without such an <hr> does not throw.
      $('main > hr').first().remove();
      $('.box-blue').remove();
      $('footer').remove();

      // Remove the 'Oefenvragen' heading and every sibling that follows it.
      var exercises = $('h2').filter(function (i, el) {
        return el.textContent === 'Oefenvragen';
      });
      exercises.nextAll().remove();
      exercises.remove();

      $('a').filter(function (i, el) {
        return el.href.indexOf('derijprof') >= 0;
      }).remove();

      // NOTE(review): 'mb-10' is a tag selector and most likely matches
      // nothing; '.mb-10' (class) was probably intended. Left unchanged because
      // the effect of removing those elements has not been verified.
      $('mb-10').remove();

      // Drop rules that are last in their parent or directly followed by
      // another rule.
      $('hr').each(function (i, el) {
        var following = $(el).next();
        if (!following.length || following.get(0).tagName === 'HR') {
          $(el).remove();
        }
      });
    });

    // This bit makes sure learning boxes are not interrupted by page breaks,
    // so a title, an image and the corresponding explanation are guaranteed
    // to be on the same page.
    page.evaluate(function () {
      // Lower-cased tag name of the first matched element, '' for an empty set.
      function tagOf(selection) {
        return selection.length ? selection.get(0).tagName.toLowerCase() : '';
      }

      // Wraps the given adjacent elements, in place, in a div that the print
      // layout may not split across pages.
      function keepTogether(group) {
        group.wrapAll('<div style="page-break-inside: avoid"></div>');
      }

      // h2 + h3 + table
      $('h2').each(function (i, el) {
        var subtitle = $(el).next();
        var lesson = subtitle.next();
        if (tagOf(subtitle) !== 'h3' || tagOf(lesson) !== 'table') {
          return;
        }
        keepTogether($(el).add(subtitle).add(lesson));
      });

      // (h2 | h3) + table
      $('h2, h3').each(function (i, el) {
        var lesson = $(el).next();
        if (tagOf(lesson) !== 'table') {
          return;
        }
        keepTogether($(el).add(lesson));
      });
    });

    // Restyling
    page.evaluate(function () {
      $('html').css('zoom', 0.68);
      $('body').css('background', 'none');
      $('main').css('margin-top', '0px').css('border-radius', '0px');
    });

    var fullTitle = page.evaluate(function () {
      return $('h1').get(0).textContent;
    });
    console.log(fullTitle);

    // A path separator in the title would make render() target a directory
    // that does not exist, so replace it in the file name only.
    var fileName = String(fullTitle).replace(/[\/\\]/g, '-') + '.pdf';

    // Short delay before rendering; presumably to let the page settle after
    // the DOM and style changes above.
    window.setTimeout(function () {
      page.render(fileName);
      phantom.exit();
    }, 200);
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment