Created
May 7, 2021 16:07
-
-
Save MrOrz/31c881733fc9b9e9aa491940f8b02691 to your computer and use it in GitHub Desktop.
165 news crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * @OnlyCurrentDoc
 */

/**
 * Name of the sheet (tab) in the active spreadsheet that getLastDate() reads
 * from and appendTableData() writes to.
 */
const DATA_SHEET = '165 民眾通報假投資/博弈詐騙網站';
/**
 * Reads the date stored in column 6 of the last populated row of the data sheet.
 *
 * NOTE(review): appendTableData() writes `publishDate` into column 5, while this
 * function reads column 6 — confirm that column 6 of the sheet really holds the
 * date to resume from (e.g. via a formula column).
 *
 * @returns {Date} The stored date, or the Unix epoch (`new Date(0)`) when the
 *   sheet has no rows yet or the cell is empty.
 */
function getLastDate() {
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(DATA_SHEET);
  const lastRow = sheet.getLastRow();

  // An empty sheet reports row 0, which is not a valid range start; treat it
  // the same way as an empty cell so the first run fetches everything.
  if (lastRow < 1) return new Date(0);

  // `||` (not `??`) on purpose: an empty cell yields '' which must fall back to 0.
  return new Date(sheet.getRange(lastRow, 6).getValue() || 0);
}
/**
 * Downloads the 165 news list and keeps the "民眾通報假投資" articles published
 * after `fromDate`, oldest first.
 *
 * @param {Date} fromDate Only articles whose `publishDate` is strictly later are kept.
 * @returns {Object[]} Matching list entries, sorted by ascending `publishDate`.
 */
function fetchArticles(fromDate) {
  const resp = UrlFetchApp.fetch('https://165.npa.gov.tw/api/article/list/news', {method: 'get'});
  const byPublishDate = (a1, a2) => new Date(a1.publishDate) - new Date(a2.publishDate);

  return JSON.parse(resp.getContentText())
    .filter(article =>
      // Entries without a string title cannot match; skip them instead of crashing.
      typeof article.title === 'string' &&
      article.title.includes('民眾通報假投資') &&
      new Date(article.publishDate) > fromDate)
    // filter() returned a fresh array, so the in-place sort touches nothing shared.
    .sort(byPublishDate);
}
/**
 * Downloads the detail record of a single 165 news article.
 *
 * @param {number|string} articleId Identifier of the article, as found in the list entries.
 * @returns {Object} The parsed JSON body of the detail endpoint.
 */
function fetchArticle(articleId) {
  // Encode the id so that characters such as '/' or '?' cannot alter the request path.
  const url = `https://165.npa.gov.tw/api/article/detail/news/${encodeURIComponent(articleId)}`;
  const resp = UrlFetchApp.fetch(url, {method: 'get'});
  return JSON.parse(resp.getContentText());
}
/**
 * Extracts table rows from an article's HTML body.
 *
 * The HTML is wrapped in a `<body>` element and parsed with XmlService. Every
 * `<tr>` descendant except the first (the header) becomes one record built
 * from its first three `<td>` children.
 *
 * @param {string} htmlString HTML fragment; `null`/`undefined` is treated as empty.
 * @returns {{name: string, url: string, count: number}[]} One record per data
 *   row. `count` is the numeric conversion of the third cell (NaN when it is
 *   not numeric).
 */
function getTableDataFromHTML(htmlString) {
  const html = htmlString == null ? '' : String(htmlString);

  // Entities such as &nbsp; break XML parsing, so blank out each well-formed
  // one. Matching only `&name;` / `&#123;` keeps a bare '&' from swallowing
  // everything up to some distant ';'. Any '&' left afterwards is a stray
  // ampersand and is escaped so the parser accepts it.
  const sanitizedBody = html.replace(/&#?\w+;/g, ' ').replace(/&/g, '&amp;');
  const document = XmlService.parse(`<body>${sanitizedBody}</body>`);

  const isElementNamed = (content, tagName) =>
    content.getType() === XmlService.ContentTypes.ELEMENT &&
    content.asElement().getName() === tagName;

  return document
    .getDescendants()
    .filter(descendant => isElementNamed(descendant, 'tr'))
    .slice(1) // Skip header
    .map(row => {
      const [name, url, count] = row
        .asElement()
        .getAllContent()
        .filter(content => isElementNamed(content, 'td'))
        .map(td => td.getValue());
      return { name, url, count: Number(count) };
    });
}
/**
 * Appends one sheet row per table record below the last populated row of the
 * data sheet. Columns written, in order: article id, name, url, count,
 * article publish date.
 *
 * @param {{id: *, publishDate: *}} article List entry the records belong to.
 * @param {{name: *, url: *, count: *}[]} tableData Records to append; nothing
 *   is written when it is empty.
 */
function appendTableData(article, tableData) {
  // Inserting zero rows / addressing a zero-row range is rejected by the
  // spreadsheet service, so there is nothing to do for an empty table.
  if (tableData.length === 0) return;

  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(DATA_SHEET);
  const lastRow = sheet.getLastRow();
  const rows = tableData.map(({name, url, count}) => [article.id, name, url, count, article.publishDate]);

  sheet.insertRowsAfter(lastRow, rows.length);
  sheet.getRange(lastRow + 1, 1, rows.length, 5).setValues(rows);
}
/**
 * Entry point: finds the articles published after the date stored in the
 * sheet, then for each one (oldest first) downloads its detail record,
 * extracts the table rows from its content and appends them to the sheet.
 */
function main() {
  const lastDate = getLastDate();
  const articles = fetchArticles(lastDate);
  console.log(`Fetched ${articles.length} new article(s):`, articles.map(({id, title}) => `#${id}: ${title}`));

  for (const a of articles) {
    console.log(`Processing article #${a.id}`);
    const article = fetchArticle(a.id);
    const tableData = getTableDataFromHTML(article.content);

    // An article without data rows has nothing to append; skip it so that one
    // table-less article does not abort the remaining articles.
    if (tableData.length === 0) {
      console.log(`Article #${a.id} has no table rows, skipped`);
      continue;
    }

    appendTableData(a, tableData);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment