Created
December 29, 2018 05:23
-
-
Save ShaunLWM/65ec009b65463fe10d8fa45f4a48220a to your computer and use it in GitHub Desktop.
pcgamesdownload scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const request = require('request'); | |
const cheerio = require('cheerio'); | |
const async = require('async'); | |
const fs = require('fs'); | |
const JobManager = require('./JobManager'); | |
// Crawl the paginated game listing one page at a time, keep a copy of every
// listing page on disk, and queue each post URL found for detailed scraping.

// Hard-coded upper bound of the listing (the loop visits pages 1..296).
// NOTE(review): presumably the page count of the site when this was written — confirm.
const LAST_PAGE = 296;
// Pause between two requests so the site is not hammered.
const PAGE_DELAY_MS = 2000;
// How many times a single listing page is tried before it is skipped.
const MAX_ATTEMPTS_PER_PAGE = 3;
const PAGES_DIR = './pages';

const job = new JobManager();
let page = 1;
// Failed attempts for the page currently being fetched.
let attempts = 0;

// The page snapshots below are written into this directory; create it up front
// so the first write cannot fail with ENOENT.
fs.mkdirSync(PAGES_DIR, { recursive: true });

async.whilst(
  function () {
    return page <= LAST_PAGE;
  },
  function (callback) {
    const pageUrl = `https://pcgames-download.com/page/${page}/`;

    // Every outcome waits PAGE_DELAY_MS before the next iteration, including
    // failures, so a broken page can never turn into a tight request loop.
    const scheduleNext = (printSeparator) => {
      setTimeout(() => {
        if (printSeparator) {
          console.log('------------------------------');
        }
        callback(null);
      }, PAGE_DELAY_MS);
    };

    // Log a failed attempt; retry the same page a bounded number of times,
    // then move on so one permanently broken page cannot stall the crawl.
    const handleFailure = (banner, detail) => {
      console.log(banner);
      console.log(pageUrl);
      console.error(detail);
      console.log('----------------');
      attempts++;
      if (attempts >= MAX_ATTEMPTS_PER_PAGE) {
        console.error(`Skipping page ${page} after ${attempts} failed attempts`);
        page++;
        attempts = 0;
      }
      scheduleNext(false);
    };

    requestPage(pageUrl, (error, response, body) => {
      if (error) {
        return handleFailure('----- ERROR ------', error);
      }
      if (response.statusCode !== 200) {
        return handleFailure('----- ERROR RESPONSE ------', response.statusCode);
      }

      fs.writeFileSync(`${PAGES_DIR}/${page}.html`, body);

      const $ = cheerio.load(body);
      $('.post-container').each((index, element) => {
        const url = $(element).find('.post-title > a').attr('href');
        // A post without a title link yields undefined; do not queue that.
        if (typeof url !== 'string' || url === '') {
          return;
        }
        console.log(url);
        job.addPageLink(url);
      });

      page++;
      attempts = 0;
      return scheduleNext(true);
    });
  },
  function (err) {
    if (err) {
      console.error(err);
    }
    console.log('Done');
  }
);
/**
 * Fetch `url` through `request`, sending a desktop Chrome User-Agent header.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {Function} callback - Passed straight to `request`; the crawl loop
 *   uses it as `(error, response, body)`.
 * @param {number} [timeoutMs=30000] - Milliseconds after which `request`
 *   aborts and reports an error through `callback`.
 * @returns {*} Whatever `request` returns for this call.
 */
function requestPage(url, callback, timeoutMs = 30000) {
  const options = {
    url,
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    },
    // Pages are fetched strictly one after another, so a connection that never
    // answers would otherwise stall the whole crawl.
    timeout: timeoutMs
  };
  return request(options, callback);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const Queue = require('bull'); | |
const request = require('request'); | |
const cheerio = require('cheerio'); | |
const fs = require('fs'); | |
const Arena = require('bull-arena'); | |
const express = require('express'); | |
// Serve the bull-arena dashboard at the root of a small express app so the
// 'links queue' (the queue name JobManager creates below) can be inspected
// in a browser. No connection settings are given for the queue here.
const app = express();

const arena = Arena({
  queues: [
    { name: 'links queue', hostId: 'Downloader' }
  ]
});

app.use('/', arena);

app.listen(8081, function onListening() {
  console.log('>> [jm] queue server listening on port 8081');
});
// Links pointing back at the scraped site (http or https) are navigation, not
// download links, and are left out of the results.
const INTERNAL_LINK_PATTERN = /^https?:\/\/pcgames-download/;
const LINKS_FILE = './links.json';
const INDIVIDUAL_PAGES_DIR = './pages/individual';
const REQUEST_TIMEOUT_MS = 30000;

// Collect the href of every anchor inside `.post` that leaves the scraped site.
function collectExternalLinks($) {
  const links = [];
  $('.post').find('a').each((i, element) => {
    const href = $(element).attr('href');
    if (typeof href === 'string' && !INTERNAL_LINK_PATTERN.test(href)) {
      links.push(href);
    }
  });
  return links;
}

// Derive the file name for a saved post from its URL: the fourth path segment
// when present, otherwise the last non-empty segment, otherwise 'index'.
function pageSlug(url) {
  const segments = new URL(url).pathname.split('/');
  if (segments[3]) {
    return segments[3];
  }
  const named = segments.filter(Boolean);
  return named.length > 0 ? named[named.length - 1] : 'index';
}

// Read the accumulated results; a missing or empty file means "no results yet".
function loadLinkRecords() {
  let raw;
  try {
    raw = fs.readFileSync(LINKS_FILE, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      return [];
    }
    throw err;
  }
  if (raw.trim() === '') {
    return [];
  }
  const records = JSON.parse(raw);
  if (!Array.isArray(records)) {
    throw new Error(`${LINKS_FILE} must contain a JSON array`);
  }
  return records;
}

// Store the raw post HTML and append its { title, links } record to LINKS_FILE.
function saveGamePage(url, body, record) {
  // Load first so a corrupt results file aborts before anything is written.
  const records = loadLinkRecords();
  fs.mkdirSync(INDIVIDUAL_PAGES_DIR, { recursive: true });
  fs.writeFileSync(`${INDIVIDUAL_PAGES_DIR}/${pageSlug(url)}.html`, body);
  records.push(record);
  fs.writeFileSync(LINKS_FILE, JSON.stringify(records, null, 2));
}

/**
 * Owns the 'links queue' Bull queue. Every queued job carries the URL of one
 * post; processing a job downloads that post, extracts its outbound links and,
 * when there are any, saves the page and appends a record to links.json.
 */
class JobManager {
  /**
   * Create the queue and register the processor for post URLs.
   */
  constructor() {
    this.pageQueue = new Queue('links queue');
    this.pageQueue.process((job, done) => {
      const url = job.data.url;
      console.log(`Processing: ${url}`);
      this.requestPage(url, (error, response, body) => {
        // Failures are handed to done() so the queue records the job as
        // failed instead of completed.
        if (error) {
          console.log(url);
          console.error(error);
          return done(error);
        }
        if (response.statusCode !== 200) {
          console.log(url);
          console.error(`statusCode ${response.statusCode}`);
          return done(new Error(`statusCode ${response.statusCode} for ${url}`));
        }
        try {
          const $ = cheerio.load(body);
          const title = $('.post-title').text();
          const links = collectExternalLinks($);
          if (links.length > 0) {
            saveGamePage(url, body, { title, links });
          }
        } catch (err) {
          // Parsing and file-system errors must not escape the request
          // callback, where they would take the whole process down.
          console.log(url);
          console.error(err);
          return done(err);
        }
        return done();
      });
    });
  }

  /**
   * Queue one post URL for processing.
   *
   * @param {string} url - Address of the post to scrape.
   * @returns {Promise<*>} Settles with the value the queue returns for the
   *   new job, or with `null` when `url` is unusable or queueing failed (the
   *   failure is logged, never thrown).
   */
  addPageLink(url) {
    if (typeof url !== 'string' || url === '') {
      console.error(`Skipping invalid page link: ${url}`);
      return Promise.resolve(null);
    }
    return Promise.resolve(this.pageQueue.add({ url })).catch((error) => {
      console.error(`Failed to queue ${url}`);
      console.error(error);
      return null;
    });
  }

  /**
   * Fetch `url` through `request`, sending a desktop Chrome User-Agent header.
   *
   * @param {string} url - Address of the page to fetch.
   * @param {Function} callback - Passed straight to `request`; invoked by the
   *   processor as `(error, response, body)`.
   * @returns {*} Whatever `request` returns for this call.
   */
  requestPage(url, callback) {
    const options = {
      url,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
      },
      // A job whose connection never answers would otherwise never finish.
      timeout: REQUEST_TIMEOUT_MS
    };
    return request(options, callback);
  }
}
module.exports = JobManager; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "pcgames-download", | |
"version": "1.0.0", | |
"main": "index.js", | |
"license": "MIT", | |
"dependencies": { | |
"async": "^2.6.1", | |
"bull": "^3.5.2", | |
"bull-arena": "^2.5.2", | |
"cheerio": "^1.0.0-rc.2", | |
"request": "^2.88.0" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment