Skip to content

Instantly share code, notes, and snippets.

@rubenvereecken
Created December 4, 2016 14:43
Show Gist options
  • Save rubenvereecken/c66020024f0b658c86ee04ad29e34d8c to your computer and use it in GitHub Desktop.
Save rubenvereecken/c66020024f0b658c86ee04ad29e34d8c to your computer and use it in GitHub Desktop.
Scrape www.gratisrijbewijsonline.be for pretty PDFs
// For use with phantomjs
//
// Assuming links.txt is a list of URLs from http://www.gratisrijbewijsonline.be,
// scrape each URL using
//
// for link in $(<links.txt)
// do
// phantomjs extract.js $link
// done
// PhantomJS modules: `webpage` creates the headless browser page that is
// loaded and rendered, `system` exposes the command-line arguments.
var page = require('webpage').create();
var system = require('system');

// URL to scrape, taken from the first argument after the script name
// (`phantomjs extract.js <url>`).
var address = system.args[1];

// Size of the virtual browser window used for layout. PhantomJS spells this
// property `viewportSize`; the former `viewPortSize` spelling only created an
// unused property, so the requested size was never applied.
page.viewportSize = { width: 825, height: 600 };

// Render onto A4 paper with a small top and bottom margin.
page.paperSize = {
  format: 'A4',
  margin: { top: '15px', bottom: '15px' }
};
// NOTE: the four functions below are handed to page.evaluate(), which runs
// them inside the loaded web page, not in this script. They must therefore
// not reference any variable from this file. They assume the page itself
// provides jQuery as `$` -- TODO confirm if the site ever changes.

/**
 * Runs in the page: removes everything that should not end up in the PDF
 * (header, footer, iframes, promo boxes, the practice-question section,
 * links to 'derijprof' and superfluous horizontal rules).
 */
function removeClutter() {
  $('header').remove();
  $('p').filter(function (i, el) {
    return el.textContent.indexOf('Bekijk eerst') === 0;
  }).remove();
  $('iframe').remove();
  // Only the first rule directly under <main> is dropped. Going through
  // jQuery (instead of `[0].remove()`) makes this a no-op rather than a
  // TypeError when the page has no such <hr>.
  $('main > hr').eq(0).remove();
  $('.box-blue').remove();
  $('footer').remove();

  // Drop the 'Oefenvragen' heading together with every sibling after it.
  var practice = $('h2').filter(function (i, el) {
    return el.textContent === 'Oefenvragen';
  });
  practice.nextAll().remove();
  practice.remove();

  $('a').filter(function (i, el) {
    return el.href.indexOf('derijprof') >= 0;
  }).remove();

  // NOTE(review): this selects <mb-10> *elements*, which presumably never
  // match; a class selector ('.mb-10') may have been intended. Left as-is
  // because enabling it could remove real content -- confirm against the site.
  $('mb-10').remove();

  // Remove rules that are last in their parent or directly followed by
  // another rule, so no dangling or doubled separators remain.
  $('hr').each(function (i, el) {
    var following = $(el).next();
    if (!following.length || following.is('hr')) {
      $(el).remove();
    }
  });
}

/**
 * Runs in the page: makes sure learning boxes are not interrupted by page
 * breaks, so a title, an image and the corresponding explanation are
 * guaranteed to be on the same page.
 */
function groupLearningBoxes() {
  // Moves the given adjacent siblings (jQuery objects, in document order)
  // into a new <div> that may not be split across pages. The wrapper is
  // inserted in front of the first part *before* the parts are moved, so the
  // content keeps its position even when that part is the first child of its
  // parent (there is no previous sibling to insert after in that case).
  function keepTogether(parts) {
    var box = $(document.createElement('div'));
    box.css('page-break-inside', 'avoid');
    // box.css('border', '1px black solid'); // For debugging
    box.insertBefore(parts[0]);
    for (var k = 0; k < parts.length; k++) {
      box.append(parts[k]);
    }
  }

  // <h2> + <h3> + <table> triples. `.is()` simply yields false when there is
  // no next sibling, where reading `.get(0).tagName` would throw.
  $('h2').each(function (i, el) {
    var title = $(el);
    var subtitle = title.next();
    if (!subtitle.is('h3')) return;
    var lesson = subtitle.next();
    if (!lesson.is('table')) return;
    keepTogether([title, subtitle, lesson]);
  });

  // Any heading directly followed by its <table>.
  $('h2, h3').each(function (i, el) {
    var heading = $(el);
    var lesson = heading.next();
    if (!lesson.is('table')) return;
    keepTogether([heading, lesson]);
  });
}

/** Runs in the page: print-friendly restyling (smaller scale, no background). */
function restyle() {
  $('html').css('zoom', 0.68);
  $('body').css('background', 'none');
  $('main').css('margin-top', '0px').css('border-radius', '0px');
}

/**
 * Runs in the page: returns the text of the first <h1>, falling back to the
 * document title when the page has no <h1>.
 */
function readTitle() {
  var heading = $('h1').get(0);
  return heading ? heading.textContent : document.title;
}

/**
 * Turns a page title into a PDF file name: characters that are not allowed in
 * file names (path separators, control characters, ...) become '-', and an
 * empty title falls back to 'untitled'.
 */
function toPdfFileName(title) {
  var safe = String(title || '')
    .replace(/[\/\\:*?"<>|\u0000-\u001f]/g, '-')
    .trim();
  return (safe || 'untitled') + '.pdf';
}

if (!address) {
  console.log('Usage: phantomjs extract.js <url>');
  phantom.exit(1);
} else {
  page.open(address, function (status) {
    if (status !== 'success') {
      console.log('Unable to load the address!');
      phantom.exit(1);
      return;
    }

    // Cleanup duty
    page.evaluate(removeClutter);
    page.evaluate(groupLearningBoxes);
    // Restyling
    page.evaluate(restyle);

    var fullTitle = page.evaluate(readTitle);
    console.log(fullTitle);

    // Render after a short delay; presumably this lets the page settle after
    // the DOM and style changes above.
    window.setTimeout(function () {
      page.render(toPdfFileName(fullTitle));
      phantom.exit();
    }, 200);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment