Created
October 12, 2018 20:25
-
-
Save luizpvas/a7a5a096892d666ff61c1cbc925c8f74 to your computer and use it in GitHub Desktop.
Extração de comentários do Podcast Hipsters.tech
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer');
// Ids of the episodes. Each ID must be passed to the URL in the format of
// hipsters.tech?page_id=<page_id>.
//
// I grabbed these ids calling
// `Array.from(document.querySelectorAll('#page_id option')).map(option => option.value)`
// in any episode page. The episodes are listed in a select input, which was really handy.
//
// The list is never reassigned, so it is declared with `const`.
const pageIds = [
  "1940", "1932", "1915", "1923", "1910", "1901", "1893", "1884", "1877", "1868",
  "1864", "1854", "1847", "1823", "1800", "1791", "1785", "1763", "1757", "1749",
  "1740", "1736", "1729", "1718", "1708", "1697", "1689", "1684", "1676", "1661",
  "1646", "1642", "1635", "1623", "1614", "1591", "1586", "1571", "1548", "1526",
  "1508", "1475", "1468", "1445", "1416", "1404", "1391", "1371", "1355", "1339",
  "1326", "1298", "1281", "1262", "1245", "1067", "1040", "1141", "1129", "1115",
  "1005", "1058", "1045", "1030", "1015", "1007", "984", "989", "963", "948",
  "939", "930", "923", "917", "887", "878", "871", "866", "859", "835",
  "827", "816", "806", "793", "778", "670", "680", "689", "718", "685",
  "683", "667", "639", "638", "631", "614", "566", "579", "471", "558",
  "469", "467", "521", "509", "491", "449", "394", "428", "412", "373",
  "372", "330", "332", "315", "307", "271", "263", "223",
];
// Run the crawler for each page, one episode at a time. A failure on one
// episode is logged (with its id) and does not stop the remaining episodes.
(async () => {
  for (const [index, pageId] of pageIds.entries()) {
    // Report 1-based progress so the last episode reads "N / N".
    console.log(`Scraping page ${index + 1} / ${pageIds.length}`);
    try {
      await scrapePage(pageId);
    } catch (err) {
      console.error(`Failed to scrape episode ${pageId}:`, err);
    }
  }
})();
/**
 * Resolve after `ms` milliseconds.
 *
 * Used instead of Puppeteer's own timed wait so the script does not depend on
 * which timed-wait helper the installed Puppeteer version provides.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Read the Disqus iframe URL, title, category tags and participants from an
 * episode page that is already loaded in `page`.
 *
 * @param {object} page - Puppeteer page showing the episode.
 * @returns {Promise<{disqusUrl: string, title: string, tags: string[], participants: string[]}>}
 */
async function readEpisodeMetadata(page) {
  // Scroll to the comments element so Disqus renders the iframe.
  await page.evaluate(() => {
    const commentsContainer = document.querySelector('#comments');
    // getBoundingClientRect() is viewport-relative; add the current scroll
    // offset to get the absolute position scrollTo() expects.
    const top = commentsContainer.getBoundingClientRect().top + window.pageYOffset;
    window.scrollTo(0, top);
  });

  // Wait for the iframe to be present in the DOM.
  await page.waitForSelector('#comments iframe');

  // This callback runs inside the browser page, so it must be self-contained.
  return page.evaluate(() => {
    // The iframe's URL is where the comments are extracted from.
    const disqusUrl = document.querySelector('#comments iframe').getAttribute('src');
    const title = document.querySelector('.post-title h1').innerText;

    // Keep only the links that point at a category.
    const tags = Array.from(document.querySelectorAll('.post-meta li>a'))
      .filter((link) => link.href.includes('category'))
      .map((link) => link.innerText);

    // The second <ul> in the post body is taken as the participants list;
    // each entry keeps only the text before the first comma.
    // If the page has no such list, fall back to an empty array instead of
    // failing the whole episode.
    const participantList = document.querySelectorAll('.entry-content ul')[1];
    const participants = participantList
      ? Array.from(participantList.querySelectorAll('li')).map((li) => li.innerText.split(',')[0])
      : [];

    return { disqusUrl, title, tags, participants };
  });
}

/**
 * Keep clicking the "load more" button on the Disqus page until it is no
 * longer present/visible, waiting between clicks so new posts can render.
 *
 * @param {object} page - Puppeteer page showing the Disqus thread.
 * @returns {Promise<void>}
 */
async function loadAllComments(page) {
  const pollIntervalMs = 5000;
  let hasMoreComments = true;

  while (hasMoreComments) {
    console.log('Loading comments...');
    await sleep(pollIntervalMs);

    hasMoreComments = await page.evaluate(() => {
      const loadMoreButton = document.querySelector('[data-action="more-posts"]');
      // offsetParent is null for elements that are not rendered.
      if (loadMoreButton && loadMoreButton.offsetParent) {
        loadMoreButton.click();
        return true;
      }
      return false;
    });
  }
}

/**
 * Extract every rendered post from the Disqus thread.
 *
 * A post missing the profile link, author or message element yields `null`
 * for that field rather than aborting the extraction of the whole thread
 * (presumably guest or removed comments -- not verified against Disqus markup).
 *
 * @param {object} page - Puppeteer page showing the Disqus thread.
 * @returns {Promise<Array<{postId: string, username: ?string, authorName: ?string, comment: ?string}>>}
 */
function readComments(page) {
  // This callback runs inside the browser page, so it must be self-contained.
  return page.evaluate(() => {
    const textOf = (element) => (element ? element.innerText : null);

    return Array.from(document.querySelectorAll('li.post')).map((post) => {
      const profileLink = post.querySelector('a[data-username]');
      return {
        postId: post.id,
        username: profileLink ? profileLink.getAttribute('data-username') : null,
        authorName: textOf(post.querySelector('.author')),
        comment: textOf(post.querySelector('.post-message')),
      };
    });
  });
}

/**
 * Scrape one episode: open the episode page, collect its metadata, open the
 * Disqus thread, load all comments and write the result as JSON to
 * `./episodes/<pageId>`.
 *
 * The browser is always closed, even when a step fails, so a failed episode
 * does not leave a browser process behind.
 *
 * @param {string} pageId - Episode id used in `hipsters.tech?page_id=<pageId>`.
 * @returns {Promise<void>}
 * @throws Propagates any navigation, selector-timeout or file-system error.
 */
async function scrapePage(pageId) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1024, height: 800 });

    console.log(`Visiting episode ${pageId}...`);
    await page.goto(`https://hipsters.tech?page_id=${pageId}`);

    const { disqusUrl, title, tags, participants } = await readEpisodeMetadata(page);

    // Visit the Disqus page and wait for the posts to be rendered.
    await page.goto(disqusUrl);
    await page.waitForSelector('.post-list');
    await loadAllComments(page);

    // Grab the comments + authors.
    const comments = await readComments(page);
    console.log(`Found ${comments.length} comments for the episode ${title}`);

    // Make sure the output directory exists before writing into it.
    const outputDir = './episodes';
    fs.mkdirSync(outputDir, { recursive: true });
    fs.writeFileSync(
      path.join(outputDir, String(pageId)),
      JSON.stringify({ title, comments, tags, participants })
    );

    console.log("ALL DONE!");
  } finally {
    await browser.close();
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment