Skip to content

Instantly share code, notes, and snippets.

@tkssharma
Created December 28, 2020 10:28
Show Gist options
  • Save tkssharma/402643c03e0d041e23eeedc5d7b20140 to your computer and use it in GitHub Desktop.
Save tkssharma/402643c03e0d041e23eeedc5d7b20140 to your computer and use it in GitHub Desktop.
const { startBrowser } = require('./browser');
const cheerio = require('cheerio');
/**
 * Opens the book listing page, follows every link returned by extractData and
 * collects the record that extractPageData builds for each linked page.
 *
 * The collected records are logged to the console and also returned. The
 * browser is closed whether or not scraping succeeds, so a failed navigation
 * or selector wait does not leave the browser process running.
 *
 * @returns {Promise<Object[]>} One record per visited link, in link order.
 * @throws Re-throws any error raised while navigating or extracting data.
 */
async function scrapeData() {
  const baseUrl = 'http://books.toscrape.com/';
  const books = [];
  const browser = await startBrowser();
  try {
    const page = await browser.newPage();
    await page.goto(baseUrl, { waitUntil: 'networkidle0' });
    await page.waitForSelector('.page_inner');
    const content = await page.content();
    const links = await extractData(content);
    for (const link of links) {
      // An entry without an href would otherwise be requested as ".../undefined".
      if (!link) {
        continue;
      }
      // Resolve against the base so relative and absolute hrefs both work.
      await page.goto(new URL(link, baseUrl).href);
      await page.waitForSelector('#content_inner');
      const pageContent = await page.content();
      const bookPayload = await extractPageData(pageContent);
      books.push(bookPayload);
    }
    console.log(books);
  } finally {
    // Always release the browser, including when a step above throws.
    await browser.close();
  }
  return books;
}
/**
 * Builds a book record from the HTML of a single product page.
 *
 * Every `.product_page` element is visited; if there are several, values from
 * a later element overwrite those from an earlier one.
 *
 * @param {string} html - Markup of the product page.
 * @returns {Promise<{image?: (string|null), title?: string, price?: string, description?: string}>}
 *   The collected fields, or an empty object when no `.product_page` element
 *   is found. `image` is `null` when no thumbnail image source is present.
 */
async function extractPageData(html) {
  const bookData = {};
  const $ = cheerio.load(html);
  $('.product_page').each((index, element) => {
    const productPage = $(element);
    const productMain = productPage.find('.product_main');

    // attr() yields undefined when the image or its src is absent; guard
    // before trimming so a page without a thumbnail does not throw.
    const imageSrc = productPage.find('.thumbnail').find('img').attr('src');
    bookData.image = imageSrc ? imageSrc.trim() : null;

    bookData.title = productMain.find('h1').text().trim();
    bookData.price = productMain.find('.price_color').text().trim();

    // NOTE(review): this is the combined text of every <p> under the product
    // page, not of one specific paragraph - confirm the intended selector.
    bookData.description = productPage.find('p').text().trim();
  });
  return bookData;
}
/**
 * Collects the link target of every `.product_pod` element in the given HTML.
 *
 * For each pod the `href` of its first matching anchor is taken. Pods with no
 * anchor or no `href` are skipped so the result never contains `undefined`.
 *
 * @param {string} html - Markup of a listing page.
 * @returns {Promise<string[]>} The href values in document order.
 */
async function extractData(html) {
  const $ = cheerio.load(html);
  const links = [];
  $('.product_pod').each((index, element) => {
    const href = $(element).find('a').attr('href');
    if (href) {
      links.push(href);
    }
  });
  return links;
}
// Run the scraper; report a failure and exit non-zero rather than leaving the
// returned promise without a rejection handler.
scrapeData().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment