Last active
March 13, 2021 04:44
-
-
Save hellonearthis/e1a418e7e84399a92b8177cfb92bda81 to your computer and use it in GitHub Desktop.
node + puppeteer scraping and processing of a paginated table.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "pup_scrapper", | |
"version": "1.0.0", | |
"description": "puppeteer vs govt", | |
"main": "ps.js", | |
"scripts": { | |
"start": "node ps.js" | |
}, | |
"author": "", | |
"license": "ISC", | |
"dependencies": { | |
"d3": "^6.6.0", | |
"jsonfile": "^5.0.0", | |
"lodash": "^4.17.21", | |
"puppeteer": "^8.0.0" | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
https://www.youtube.com/watch?v=IvaJ5n5xFqU looping through page content | |
for testing the scraper without bugging the site: | |
https://stackoverflow.com/questions/47587352/opening-local-html-file-using-puppeteer | |
await page.goto('file://C:/Users/compoundeye/test.html'); | |
to view https://www.youtube.com/watch?v=crKJ2hGcQ3Q&list=PLw5h0DiJ-9PDTSsOmwZ0DhzPt2yQ6RY9z | |
*/ | |
const _ = require("lodash"); | |
const jsonfile = require("jsonfile"); | |
const puppeteer = require("puppeteer"); | |
/**
 * Turns the inner HTML of one petition table row into a plain record.
 *
 * The row is split on `</td>`; the first six cells yield their trimmed inner
 * HTML, and the seventh yields everything from its first `<a` onwards (the
 * link to the signature image). A cell without an opening `<td ...>` tag, or a
 * seventh cell without a link, falls back to its trimmed content instead of
 * producing a mis-sliced string.
 *
 * @param {string} rowHtml - innerHTML of a `<tr>` element.
 * @returns {?{sheetURL: string, surname: string, givenNames: string,
 *   originalAddress: string, consistentTownSuburb: string,
 *   consistentCityRegion: string, sigIMG: string}} the parsed record, or
 *   `null` when the row has fewer than seven cell chunks.
 */
function parsePetitionRow(rowHtml) {
  const cells = String(rowHtml).trim().split('</td>');
  if (cells.length < 7) {
    return null;
  }

  // Drop the opening `<td ...>` tag (with or without attributes) and trim.
  const cellContent = (cell) => {
    const tagStart = cell.indexOf('<td');
    if (tagStart === -1) {
      return cell.trim();
    }
    const tagEnd = cell.indexOf('>', tagStart);
    return (tagEnd === -1 ? cell : cell.slice(tagEnd + 1)).trim();
  };

  const linkStart = cells[6].indexOf('<a');
  const sigIMG = linkStart === -1 ? cellContent(cells[6]) : cells[6].slice(linkStart).trim();

  return {
    sheetURL: cellContent(cells[0]),
    surname: cellContent(cells[1]),
    givenNames: cellContent(cells[2]),
    originalAddress: cellContent(cells[3]),
    consistentTownSuburb: cellContent(cells[4]),
    consistentCityRegion: cellContent(cells[5]),
    sigIMG,
  };
}

/**
 * Scrapes a paginated table with Puppeteer and writes all rows to a JSON file.
 *
 * Pages `baseURL + 0` through `baseURL + (pageCount - 1)` are loaded one after
 * another; every `div.view-content > table > tbody > tr` row is parsed with
 * `parsePetitionRow` and collected. After the last page the collected records
 * are written to `outFile`. Errors are logged rather than rethrown, and the
 * browser is always closed so the Node process can exit.
 *
 * @param {object} [options]
 * @param {string} [options.baseURL] - URL prefix the page number is appended to.
 * @param {number} [options.pageCount=600] - number of pages to read, starting at 0.
 * @param {string} [options.outFile='teJson.json'] - path of the JSON output file.
 * @param {number} [options.delayMs=1000] - pause between page loads, in milliseconds.
 * @returns {Promise<void>}
 */
async function run({
  baseURL = 'https://nzhistory.govt.nz/politics/womens-suffrage/petition?page=',
  pageCount = 600,
  outFile = 'teJson.json',
  delayMs = 1000,
} = {}) {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      timeout: 0,
    });
    const page = await browser.newPage();
    // A browser user-agent starts with the "Mozilla/" product token.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36');
    await page.setViewport({ width: 1280, height: 1080 });
    page.setDefaultNavigationTimeout(0);

    const records = []; // rows collected across all pages
    for (let pageNum = 0; pageNum < pageCount; pageNum++) {
      const url = `${baseURL}${pageNum}`;
      console.log(`reading ${url} \n`);
      await page.goto(url, { waitUntil: 'networkidle2' });

      // Runs inside the page: hand back the raw HTML of each table row.
      const rows = await page.evaluate(() =>
        Array.from(
          document.querySelectorAll('div.view-content > table > tbody > tr'),
          (tr) => tr.innerHTML,
        ),
      );

      for (const rowHtml of rows) {
        const record = parsePetitionRow(rowHtml);
        if (record === null) {
          // One malformed row must not abort the scrape and discard everything collected so far.
          console.warn(`skipping malformed row on page ${pageNum}`);
          continue;
        }
        records.push(record);
      }

      // Pause between requests so the site is not hammered.
      await new Promise((resolve) => setTimeout(resolve, delayMs));
      console.log(`Next page ${pageNum + 1}`);
    }

    // Await the write so "FINISHED!" is only printed once the file is on disk.
    await jsonfile.writeFile(outFile, records);
    console.log('FINISHED!');
  } catch (e) {
    console.log('the error ', e);
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.error('failed to close browser ', closeErr);
      }
    }
  }
}
// Script entry point: start the scrape. run() catches and logs its own
// errors, so the returned promise is deliberately not chained.
void run();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment