Last active
August 12, 2020 13:03
-
-
Save magician11/a979906401591440bd6140bd14260578 to your computer and use it in GitHub Desktop.
How to grab the page source from any dynamically generated webpage and then process it.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const CDP = require('chrome-remote-interface');
const chromeLauncher = require('chrome-launcher');
const cheerio = require('cheerio');

/**
 * Launches headless Chrome, loads one SF Cinema City showtime page, switches
 * the page language to English, grabs the rendered page source and prints the
 * cinema name plus every movie title found in it.
 *
 * Errors are logged to the console; the CDP connection and the Chrome process
 * are released whether or not the scrape succeeded.
 */
(async function() {
  const timeout = ms => new Promise(resolve => setTimeout(resolve, ms));

  // Declared outside the try so the finally block can release whatever was
  // successfully acquired before a failure.
  let chrome;
  let protocol;

  try {
    chrome = await chromeLauncher.launch({
      chromeFlags: ['--disable-gpu', '--headless']
    });
    protocol = await CDP({ port: chrome.port });

    // See API docs: https://chromedevtools.github.io/devtools-protocol/
    const { Page, Runtime, DOM } = protocol;
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);

    // Subscribe to the load event before navigating so it cannot be missed.
    const pageLoaded = Page.loadEventFired();
    await Page.navigate({
      url: 'https://www.sfcinemacity.com/showtime/cinema/9936'
    });

    // wait until the page says it's loaded...
    await pageLoaded;
    console.log('Page loaded! Now waiting a few seconds for all the JS to load...');
    await timeout(3000); // give the JS some time to load

    // first set the language to English
    console.log('Selecting English..');
    await Runtime.evaluate({
      expression:
        "document.querySelector('.lang-switcher li:nth-of-type(2) a').click()"
    });

    // get the page source
    const rootNode = await DOM.getDocument({ depth: -1 });
    const pageSource = await DOM.getOuterHTML({
      nodeId: rootNode.root.nodeId
    });

    // load the page source into cheerio
    console.log('Processing page source...');
    const $ = cheerio.load(pageSource.outerHTML);

    // perform queries
    console.log('Getting movie times for', $('.showtime-cinema-name').text());
    $('.showtime-box').each((i, movieElement) => {
      console.log(
        $(movieElement)
          .find('.movie-detail .name')
          .text()
      );
    });
  } catch (err) {
    console.log(err);
  } finally {
    // Always shut down the browser, even when the scrape failed part-way.
    try {
      if (protocol) {
        await protocol.close();
      }
    } finally {
      if (chrome) {
        await chrome.kill();
      }
    }
  }
})();
Thanks, that's a great starting point!
Is there a way to wait until the process is done? I added a `return Promise.resolve()` at the end and `return Page.loadEventFired...`, but that doesn't seem to work.
Example that waits until return:
const CDP = require('chrome-remote-interface')
/**
 * Connects to Chrome over the DevTools protocol, loads http://example.com,
 * waits for the load event plus a fixed 3 second grace period, and logs its
 * progress. The returned promise settles only after the connection is closed,
 * so callers can simply `await x()`.
 *
 * @returns {Promise<void>} resolves once all work and cleanup have finished
 */
async function x () {
  let protocol
  try {
    protocol = await CDP()
    const timeout = ms => new Promise(resolve => setTimeout(resolve, ms))
    // See API docs: https://chromedevtools.github.io/devtools-protocol/
    const { Page, Runtime, DOM } = protocol
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()])
    // Subscribe to the load event before navigating so it cannot be missed.
    const pageLoaded = Page.loadEventFired()
    await Page.navigate({ url: 'http://example.com' })
    // wait until the page says it's loaded...
    await pageLoaded
    console.log('Page loaded! Now waiting a few seconds for all the JS to load...')
    await timeout(3000) // give the JS some time to load
    console.log('Processing page source...')
    console.log('Doing some fancy stuff here ...')
    console.log('All done.')
  } finally {
    // Single cleanup point: runs on success and on failure alike.
    if (protocol) {
      await protocol.close()
    }
  }
}
// Entry point: log 'start', run x() to completion, then log 'end'.
;(async () => {
  console.log('start')
  await x()
  console.log('end')
})()
Yes, using async/await is the way to go. One of the modules I wrote does it that way. Check out this file. Source code copied below:
const CDP = require('chrome-remote-interface');
const chromeLauncher = require('chrome-launcher');
const cheerio = require('cheerio');
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - delay in milliseconds, passed straight to setTimeout
 * @returns {Promise<void>}
 */
const timeout = ms =>
  new Promise(done => {
    setTimeout(done, ms);
  });
/**
 * Extracts the showtime data from the rendered HTML of a cinema page.
 *
 * @param {string} html - full page source of the showtime page
 * @param {*} movieTheatreId - id copied verbatim into the result
 * @returns {{date: string, movieTheatreName: string, movieTheatreId: *, movies: Array}}
 */
const parseShowtimesHtml = (html, movieTheatreId) => {
  const $ = cheerio.load(html);

  const movieTheatreData = {
    date: $('.slick-slide.selected .date').text(),
    movieTheatreName: $('.showtime-cinema-name').text(),
    movieTheatreId,
    movies: []
  };

  // for each movie showing on this day at this movie theatre..
  $('.showtime-box').each((movieIndex, movieNode) => {
    // collate all the cinemas it's showing at (the showtimes and language per cinema)
    const cinemas = [];
    $(movieNode)
      .find('.showtime-item')
      .each((cinemaIndex, cinemaNode) => {
        cinemas.push({
          // Takes the second space-separated word of the first list item and
          // drops its last character.
          // NOTE(review): throws if that text contains no space — confirm
          // the markup always provides one.
          language: $(cinemaNode)
            .find('.right-section .list-item')
            .first()
            .text()
            .split(' ')[1]
            .slice(0, -1),
          // All time slots joined into one comma-separated string.
          times: $(cinemaNode)
            .find('.time-list .time-item')
            .map((index, el) => $(el).text())
            .get()
            .join()
        });
      });

    // then finally capture the title, the rating, and the cinema showtimes collated above
    movieTheatreData.movies.push({
      movieTitle: $(movieNode)
        .find('.movie-detail .name')
        .text(),
      // Text following 'Rate: ' in the first detail item; undefined when
      // that marker is absent.
      rating: $(movieNode)
        .find('.movie-detail .movie-detail-list .list-item')
        .first()
        .text()
        .split('Rate: ')[1],
      cinemas
    });
  });

  return movieTheatreData;
};

/**
 * Scrapes the showtimes of one SF Cinema City movie theatre using headless
 * Chrome: loads the cinema page, switches it to English, clicks the requested
 * day, then parses the rendered page source.
 *
 * @param {*} movieTheatreId - id interpolated into the showtime page URL
 * @param {number} [dayOffset=0] - value of the `data-slick-index` attribute of
 *   the date element to click
 * @returns {Promise<Object>} date, theatre name, theatre id and a `movies`
 *   array (title, rating, and per-cinema language/times)
 * @throws {Error} when any step of the scrape fails; the message starts with
 *   'Error scraping movie data from SF Cinema City: '
 */
const getShowtimes = async (movieTheatreId, dayOffset = 0) => {
  // Declared outside the try so the finally block can release whatever was
  // successfully acquired before a failure.
  let chrome;
  let protocol;

  try {
    // First scrape the showtime data using Google Chrome from the SF Cinemacity website
    chrome = await chromeLauncher.launch({
      chromeFlags: ['--disable-gpu', '--headless', '--no-sandbox']
    });
    protocol = await CDP({ port: chrome.port });

    // See API docs: https://chromedevtools.github.io/devtools-protocol/
    const { Page, Runtime, DOM } = protocol;
    await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()]);

    // Subscribe to the load event before navigating so it cannot be missed.
    const pageLoaded = Page.loadEventFired();
    await Page.navigate({
      url: `https://www.sfcinemacity.com/showtime/cinema/${movieTheatreId}`
    });

    // wait until the page says it's loaded...
    await pageLoaded;
    await timeout(3000); // give the JS some time to load

    // first set the language option to English, to convert the content to English
    await Runtime.evaluate({
      expression:
        "document.querySelector('.lang-switcher li:nth-of-type(2) a').click()"
    });

    // click the date we want to get showtimes for
    await Runtime.evaluate({
      expression: `document.querySelector('[data-slick-index="${
        dayOffset
      }"]').click()`
    });

    // get the page source
    const rootNode = await DOM.getDocument({ depth: -1 });
    const pageSource = await DOM.getOuterHTML({
      nodeId: rootNode.root.nodeId
    });

    return parseShowtimesHtml(pageSource.outerHTML, movieTheatreId);
  } catch (err) {
    // The original called an undefined `reject` here, which raised a
    // ReferenceError and hid the real failure; throw a proper Error instead.
    throw new Error(`Error scraping movie data from SF Cinema City: ${err}`, {
      cause: err
    });
  } finally {
    // Always shut down the browser, even when the scrape failed part-way.
    try {
      if (protocol) {
        await protocol.close();
      }
    } finally {
      if (chrome) {
        await chrome.kill();
      }
    }
  }
};
// Public API of this module: only the scraper entry point is exported.
module.exports = { getShowtimes };
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Details on how this all works here: https://golightlyplus.com/grab-page-source-dynamically-generated-webpage-process/