Last active
January 15, 2021 09:55
-
-
Save sweemeng/8ff600643c7c76b4a5a6f12a4cb8b00a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer') | |
const fs = require('fs') | |
/**
 * Build the relative output path for a food's JSON file.
 *
 * Every "/", ",", "(" and ")" is removed and every space becomes "_";
 * the result is wrapped as "food_data/<name>.json".
 *
 * @param {string} name - Food name as scraped from the page.
 * @returns {string} Relative path of the JSON file to write.
 */
const toFoodFileName = (name) => {
  const safeName = name.replace(/[/,()]/g, '').replaceAll(' ', '_');
  return `food_data/${safeName}.json`;
};

/**
 * Scrape one food detail page and write the result to
 * "food_data/<sanitised food name>.json".
 *
 * Launches its own headless browser, reads the name, image, food group,
 * optional unit weight and the nutrient table out of `#divToBePrinted`,
 * then stores them together with the source URL and the scrape time.
 *
 * @param {string} url - Address of the food detail page.
 * @returns {Promise<void>} Resolves once the file is written and the browser is closed.
 * @throws If navigation fails or an expected element is missing from the page.
 */
const extractor = async (url) => {
  const browser = await puppeteer.launch({
    headless: true,
    slowMo: 1500,
  });
  try {
    // A Date (not the numeric Date.now()) so the timeZone option below applies.
    const scrapeTime = new Date();
    const page = await browser.newPage();
    await page.goto(url);
    await page.setViewport({ width: 1848, height: 949 });

    // This callback is executed inside the page, so it must not reference
    // Node-side variables such as `url` or `scrapeTime`.
    const scraped = await page.evaluate(() => {
      const cellText = (row, column) =>
        row.querySelector(`td:nth-child(${column})`).textContent;

      const name = document.querySelector('#divToBePrinted h3').textContent;
      const image = document.querySelector('#divToBePrinted img').src;
      const foodGroup = document.querySelector('#divToBePrinted table tr td:nth-child(3)').textContent;
      const foodTable = document.querySelector('#divToBePrinted #tableDetailNutrient');

      // The fourth header column is only present for some foods.
      let unitWeight = null;
      if (foodTable.querySelector('thead > tr').childElementCount === 4) {
        // Whitespace is collapsed with standard methods; the original called the
        // non-standard String.prototype.clean, which throws unless the page itself
        // defines it. Presumably this normalisation was the intent - TODO confirm.
        unitWeight = foodTable
          .querySelector('thead > tr > th:nth-child(4)')
          .textContent.replace(/\s+/g, ' ')
          .trim()
          .slice(2);
      }

      const data = Array.from(foodTable.querySelectorAll('tbody > tr'), (row) => {
        // Single-cell rows carry only a label.
        if (row.childElementCount === 1) {
          return { name: row.querySelector('td').textContent };
        }
        const entry = {
          name: cellText(row, 1),
          unit: cellText(row, 2),
          value_per_100g: cellText(row, 3),
        };
        if (row.childElementCount === 4) {
          entry.one_tbsp = cellText(row, 4);
        }
        return entry;
      });

      return { name, image, foodGroup, unitWeight, data };
    });

    const results = {
      ...scraped,
      source: url,
      date: scrapeTime.toLocaleString('en-GB', { timeZone: 'Asia/Kuala_Lumpur' }),
    };

    // Make sure the target directory exists before writing into it.
    fs.mkdirSync('food_data', { recursive: true });
    fs.writeFileSync(toFoodFileName(results.name), JSON.stringify(results, null, 2));
  } finally {
    // Always release the browser, even when scraping fails.
    await browser.close();
  }
};
/**
 * Run `extractor` over every URL, one at a time and in list order,
 * logging before and after each page.
 *
 * @param {string[]} urllist - Food detail page URLs to process.
 * @returns {Promise<void>} Resolves after the last URL has been processed;
 *   rejects with the first error thrown by `extractor`.
 */
const runExtractor = async (urllist) => {
  for (const url of urllist) {
    console.log(`processing ${url}`);
    // Awaited inside the loop on purpose: pages are scraped sequentially.
    await extractor(url);
    console.log(`done${url}`);
  }
};
// Entry point: load the list of food page URLs from "food_links.json"
// and scrape each of them in turn.
const rawlist = fs.readFileSync("food_links.json");
const urllist = JSON.parse(rawlist);
// Do not leave the promise floating: report the failure and signal it
// through the process exit code.
runExtractor(urllist).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const fs = require('fs'); | |
/**
 * Walk every page of the food listing table and save the detail-page
 * links of all foods to "food_links.json".
 *
 * The "next" pagination button is clicked until it carries the
 * "disabled" class, collecting the first-column links on each page.
 */
(async () => {
  const NEXT_BUTTON = '#tab1 > #tblDataProduct_wrapper > .bottom > .dataTables_paginate > .next';

  const browser = await puppeteer.launch({
    headless: true,
    slowMo: 1500,
  });
  try {
    const page = await browser.newPage();
    const results = [];
    await page.goto('https://myfcd.moh.gov.my/myfcdcurrent/');
    await page.setViewport({ width: 1848, height: 949 });

    while (true) {
      // Runs inside the page: collect the href of every first-column link.
      const foods = await page.evaluate(() => {
        const foodRows = document.querySelectorAll('#tblDataProduct > tbody > tr > td:first-child > a');
        return Array.from(foodRows, (anchor) => anchor.href);
      });
      results.push(...foods);

      await page.waitForSelector(NEXT_BUTTON);
      const nextButton = await page.$(NEXT_BUTTON);
      const nextClasses = await page.evaluate((el) => el.className.split(" "), nextButton);
      // A disabled "next" button marks the last page.
      if (nextClasses.includes("disabled")) {
        break;
      }
      await page.click(NEXT_BUTTON);
    }

    fs.writeFileSync('food_links.json', JSON.stringify(results, null, 2));
  } finally {
    // Always release the browser, even when crawling fails.
    await browser.close();
  }
})().catch((err) => {
  // Do not leave the promise floating: report the failure and signal it
  // through the process exit code.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment