Last active
June 14, 2023 12:39
-
-
Save TimvanScherpenzeel/1eed0682d68bf126801b1aeb3895c15d to your computer and use it in GitHub Desktop.
Website scraping using Puppeteer and Node.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Native | |
const fs = require('fs'); | |
const path = require('path'); | |
// Vendor | |
const { ArgumentParser } = require('argparse'); | |
const fetch = require('node-fetch'); | |
const mkdirp = require('mkdirp'); | |
const puppeteer = require('puppeteer'); | |
// Argument parser | |
/**
 * Declares the command-line options of the scraper and parses the arguments
 * of the current process.
 *
 * Options:
 *   -i / --input   (required) webpage to save
 *   -o / --output  (optional) directory to save into, defaults to 'output'
 *
 * @returns {object} the value returned by `ArgumentParser#parseArgs()`
 */
const createParserArguments = () => {
  const cli = new ArgumentParser({ addHelp: true });

  const inputOption = {
    required: true,
    help: 'Webpage you would like to save',
  };
  const outputOption = {
    required: false,
    defaultValue: 'output',
    help: 'Output directory you would like to save to',
  };

  cli.addArgument(['-i', '--input'], inputOption);
  cli.addArgument(['-o', '--output'], outputOption);

  return cli.parseArgs();
};

// Parsed once at start-up; the scraper below reads `args.input` and `args.output`
const args = createParserArguments();
// Scraper | |
/**
 * Opens `args.input` in a Puppeteer-controlled browser and mirrors the page to
 * disk: the base document is written to `${args.output}/index.html`, and every
 * intercepted request that lives under the input URL is fetched again with
 * node-fetch and written below `args.output` using the same relative folder.
 *
 * Failures of individual downloads are logged and do not abort the run. The
 * browser is always closed, even if navigation fails.
 */
(async () => {
  /**
   * Fetches `url` and writes the response body to `filePath`.
   * Network and file-system errors are logged instead of being thrown so that
   * one broken asset cannot crash the whole scrape.
   *
   * @param {string} url - absolute URL to download
   * @param {string} filePath - destination file; its directory must already exist
   * @returns {Promise<void>} settles once the write has been queued or the error logged
   */
  const saveUrlToFile = (url, filePath) =>
    fetch(url)
      .then(response => response.buffer())
      .then(buffer => {
        console.log(`Wrote to ${filePath}`);

        const fileStream = fs.createWriteStream(filePath);
        // Without a listener a stream 'error' event is thrown as an uncaught exception
        fileStream.on('error', streamError => console.error(streamError));
        fileStream.write(buffer);
        fileStream.end();
      })
      .catch(fetchError => console.error(fetchError));

  const browser = await puppeteer.launch({
    headless: false,
  });

  try {
    const page = await browser.newPage();
    await page.setRequestInterception(true);

    // Write base index.html
    mkdirp(`${args.output}`, error => {
      if (error) {
        console.error(error);
        return;
      }

      saveUrlToFile(args.input, `${args.output}/index.html`);
    });

    // Write all assets to their correct folders
    page.on('request', interceptedRequest => {
      try {
        const request = interceptedRequest.url();
        const requestDir = path.parse(decodeURI(request)).dir;

        // construct path from after input url
        // http://experience.example.com/img/ to /img/
        const pathname = requestDir.split(args.input)[1];

        // strip versioning from files because they don't work in a filesystem
        // main.min.js?v=1481729814779 to main.min.js
        const filename = path.parse(request).base.split('?')[0];

        // Only handle root files or files in the website file system (so available on the domain)
        const isRootFile = args.input.replace(/\/$/, '') === requestDir;

        if (pathname !== undefined || isRootFile) {
          // Files in the root appear as undefined but should be handled
          const decodedPath = pathname ? pathname : '';
          const directory = `${args.output}/${decodedPath}`;

          mkdirp(directory, error => {
            if (error) {
              console.error(error);
              return;
            }

            saveUrlToFile(request, `${directory}/${filename}`);
          });
        }
      } catch (error) {
        // decodeURI throws a URIError on malformed escape sequences; skip that asset
        console.error(error);
      } finally {
        // Always release the intercepted request, otherwise the page load stalls
        interceptedRequest.continue().catch(continueError => console.error(continueError));
      }
    });

    // Wait for network idle as part of the navigation itself. Calling
    // page.waitForNavigation() after goto() has resolved waits for a second
    // navigation that never happens and ends in a timeout.
    await page.goto(args.input, { waitUntil: 'networkidle0' });
  } finally {
    await browser.close();
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "scraper",
  "version": "0.0.1",
  "description": "",
  "main": "index.js",
  "author": "Tim van Scherpenzeel",
  "license": "MIT",
  "dependencies": {
    "argparse": "^1.0.10",
    "fs-extra": "^6.0.1",
    "mkdirp": "^0.5.1",
    "node-fetch": "^2.1.2",
    "puppeteer": "^1.4.0"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Excellent!