Created
August 11, 2019 22:14
-
-
Save cchudant/40da3f41186453c0cd807e9bce5a5d15 to your computer and use it in GitHub Desktop.
Scrape nyaa comments and expose them as an RSS feed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const httpFetch = require('node-fetch') | |
const { JSDOM } = require('jsdom') | |
const express = require('express') | |
const morgan = require('morgan') | |
const { Feed } = require('feed') | |
const xmlEscape = require('xml-escape') | |
/**
 * Promise-based sleep.
 *
 * @param {number} t - delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<undefined>} resolves (with no value) once the timer fires.
 */
const wait = function (t) {
  return new Promise(function (resolve) {
    setTimeout(resolve, t)
  })
}
// Listening port: the PORT environment variable, or 8080 when it is not set.
const PORT = process.env.PORT === undefined ? 8080 : process.env.PORT

// Express application; every request goes through morgan's 'dev' logger.
const app = express()
app.use(morgan('dev'))

/// global

// Delay between two refreshes of the same uploader, in milliseconds.
const UPDATE_INTERVAL = 3 * 60 * 1000
// Nyaa user names whose uploads are watched for new comments.
const UPLOADERS = ['Punisher694']
// user name -> { state, logs } as stored by the latest successful refresh.
const cache = new Map()
/// limit network requests | |
// Tail of the request queue: the promise of the most recently scheduled request.
let lastNetRsq = Promise.resolve()

/**
 * Serialised wrapper around `httpFetch`: a request is only started once the
 * previously scheduled one has settled, so at most one is in flight at a time.
 *
 * @param {...*} args - forwarded unchanged to `httpFetch`.
 * @returns {Promise<*>} whatever `httpFetch(...args)` resolves or rejects with.
 */
function fetch(...args) {
  // Wait for the previous request to *settle*, not to succeed: its failure
  // belongs to its own caller and must not make every later request reject.
  const previousSettled = lastNetRsq.catch(() => {})
  const promise = previousSettled.then(() => {
    console.log('now requesting')
    return httpFetch(...args)
  })
  lastNetRsq = promise
  return promise
}
/// logic | |
/**
 * Build the per-torrent comment counters from a parsed listing page.
 *
 * Every `tr > td:nth-child(2)` cell is inspected. When the cell has two child
 * elements, the first one's text is read as the comment count and the second
 * one is the torrent link; with a single child that child is the torrent link
 * and the count is 0.
 *
 * @param {Document} document - parsed listing page.
 * @returns {Object<string, {title: string, link: string, comments: number}>}
 *   entries keyed by the torrent link's `href`.
 */
function commentsState(document) {
  const state = {}
  for (const cell of Array.from(document.querySelectorAll('tr > td:nth-child(2)'))) {
    const [acom, alink] = cell.children
    const anchor = alink || acom
    // A cell without any child element carries no torrent; skip it instead of
    // throwing and losing every other row of the page.
    if (!anchor) continue
    state[anchor.href] = {
      title: anchor.textContent,
      link: anchor.href,
      comments: alink ? +acom.textContent : 0
    }
  }
  return state
}
/**
 * List the torrents whose comment count grew between two states.
 *
 * @param {Object<string, {comments: number}>} lastState - previous counters,
 *   keyed by link; a link missing here counts as 0 comments.
 * @param {Object<string, {comments: number}>} newState - current counters.
 * @returns {Array<Object>} one object per torrent with a positive increase:
 *   the entry from `newState` with `comments` replaced by the increase.
 */
function commentsDiff(lastState, newState) {
  const grown = []
  for (const [link, entry] of Object.entries(newState)) {
    const { comments, ...fields } = entry
    const known = lastState[link]
    const increase = comments - ((known && known.comments) || 0)
    if (increase > 0) grown.push({ comments: increase, ...fields })
  }
  return grown
}
/**
 * Extract the comments of a parsed torrent page.
 *
 * @param {Document} document - parsed torrent page.
 * @returns {Array<{user: string, avatar: string, commentLink: string, date: Date, content: string}>}
 *   one object per `#collapse-comments > div > div` element, in page order.
 *   `commentLink` is the raw `href` attribute of the comment's permalink,
 *   `date` is built from the `data-timestamp` value multiplied by 1000, and
 *   `content` is the comment body as HTML.
 */
function comments(document) {
  const panels = Array.from(document.querySelectorAll('#collapse-comments > div > div'))
  return panels.map(panel => {
    const permalink = panel.querySelector('div.comment-details > a')
    const stamp = panel.querySelector('div.comment-details > a > small')
    return {
      user: panel.querySelector('a').textContent,
      avatar: panel.querySelector('img.avatar').src,
      // Read the attribute as written in the page. The resolved `.href` is
      // prefixed with the document URL, and cutting a fixed 'about:blank'
      // length off it breaks as soon as the DOM is built with another URL.
      commentLink: permalink.getAttribute('href') || '',
      date: new Date(stamp.dataset['timestamp'] * 1000),
      content: panel.querySelector('div.comment-content').innerHTML
    }
  })
}
/// network out | |
/**
 * Download a user's listing page (sorted by comments, descending) and turn it
 * into comment counters.
 *
 * Never rejects: every failure is reported through `status`.
 *
 * @param {string} user - nyaa user name.
 * @returns {Promise<{status: 'ok', state: Object} | {status: 'not-found'} | {status: 'error'}>}
 *   `'not-found'` for an HTTP 404, `'error'` for any other non-ok response and
 *   for network or parsing failures.
 */
async function getStateFor(user) {
  const link = `https://nyaa.si/user/${user}?s=comments&o=desc`
  console.log(`Requesting comments state for ${user}`)
  try {
    // The request and the DOM construction sit inside the try as well, so a
    // network failure yields { status: 'error' } instead of a rejection.
    const res = await fetch(link)
    if (!res.ok) {
      console.error(`Error for url /user/${user}?s=comments&o=desc: status ${res.status}`)
      return { status: res.status === 404 ? 'not-found' : 'error' }
    }
    const text = await res.text()
    const { document } = new JSDOM(text).window
    return { status: 'ok', state: commentsState(document) }
  } catch (e) {
    console.error(`Error for user ${user}`, e)
    return { status: 'error' }
  }
}
/**
 * Download a torrent page and extract its comments.
 *
 * Never rejects: every failure is reported through `status`.
 *
 * @param {string} partialUrl - path appended to `https://nyaa.si`.
 * @returns {Promise<{status: 'ok', comments: Array} | {status: 'not-found'} | {status: 'error'}>}
 *   `'not-found'` for an HTTP 404, `'error'` for any other non-ok response and
 *   for network or parsing failures.
 */
async function getComments(partialUrl) {
  const link = `https://nyaa.si${partialUrl}`
  console.log(`Requesting comments for ${partialUrl}`)
  try {
    // The request and the DOM construction sit inside the try as well, so a
    // network failure yields { status: 'error' } instead of a rejection.
    const res = await fetch(link)
    if (!res.ok) {
      console.error(`Error for url ${partialUrl}: status ${res.status}`)
      return { status: res.status === 404 ? 'not-found' : 'error' }
    }
    const text = await res.text()
    const { document } = new JSDOM(text).window
    return { status: 'ok', comments: comments(document) }
  } catch (e) {
    console.error(`Error for url ${partialUrl}`, e)
    return { status: 'error' }
  }
}
/// update state | |
/**
 * Render the logged comments of one uploader as a feed document.
 *
 * Despite the name, the returned document is what `feed.atom1()` produces.
 *
 * @param {string} user - uploader whose feed is rendered; used for the feed
 *   title, id, link and description.
 * @param {Array<{title: string, link: string, avatar: string, content: string, date: Date, user: string, commentLink: string}>} logs
 *   comment entries; `user` inside an entry is the comment's author.
 * @returns {string} result of `feed.atom1()`.
 */
function logsRss(user, logs) {
  const profile = `https://nyaa.si/user/${user}`
  const feed = new Feed({
    title: `Nyaa.si ${user} comment feed`,
    id: profile,
    link: profile,
    description: `Comment feed for nyaa.si user ${user}`
  })

  for (const entry of logs) {
    const commenter = entry.user
    feed.addItem({
      title: `${commenter} on ${entry.title}`,
      image: xmlEscape(entry.avatar), // the `feed` lib is baad
      date: entry.date,
      link: `https://nyaa.si${entry.link}${entry.commentLink}`,
      author: [{
        name: commenter,
        // NOTE(review): this literal looks like an e-mail obfuscation
        // placeholder rather than a real address — confirm the intended value.
        email: '[email protected]',
        link: `https://nyaa.si/user/${commenter}`
      }],
      content: entry.content,
      description: entry.content
    })
  }

  return feed.atom1()
}
/**
 * Refresh the cached comment counters and comment log of one uploader.
 *
 * Steps: fetch the current counters, diff them against the cached ones, fetch
 * the comment page of every torrent whose count grew, merge the comments into
 * the log (newest first) and store `{ state, logs }` in `cache`.
 *
 * The cache is left untouched when the counters cannot be fetched.
 *
 * @param {string} user - nyaa user name; also the key used in `cache`.
 * @returns {Promise<void>}
 */
async function updateState(user) {
  const { state: cached = {}, logs = [] } = cache.get(user) || {}

  const { status, state } = await getStateFor(user)
  // Only an 'ok' result carries a `state`; 'error' and 'not-found' are
  // non-empty strings too, so a plain truthiness check would let them through.
  if (status !== 'ok') return

  const diff = commentsDiff(cached, state)
  const fetched = await Promise.all(
    diff.map(obj => getComments(obj.link).then(res => [obj, res]))
  )

  const nextState = { ...state }
  const newLogs = []
  for (const [obj, res] of fetched) {
    // `obj.comments` is the count increase, not the comments themselves.
    const { comments: increase, ...torrFields } = obj
    if (res.status !== 'ok') {
      // Keep the old counter for this torrent so the next refresh still sees
      // a difference and retries, instead of silently losing the comments.
      if (cached[obj.link]) nextState[obj.link] = cached[obj.link]
      else delete nextState[obj.link]
      continue
    }
    for (const comm of res.comments) newLogs.push({ ...torrFields, ...comm })
  }

  // A comment page always lists every comment of the torrent, including the
  // ones logged by earlier refreshes. Keep one entry per (torrent, author,
  // timestamp); fresh entries come first so they win over older copies.
  const seen = new Set()
  const merged = [...newLogs, ...logs]
    .filter(({ link, user: author, date }) => {
      const key = `${link}|${author}|${Number(date)}`
      if (seen.has(key)) return false
      seen.add(key)
      return true
    })
    .sort(({ date: d1 }, { date: d2 }) => d2 - d1)

  cache.set(user, { state: nextState, logs: merged })
}
// Start the periodic refresh of every uploader. Uploaders are started one
// after the other, UPDATE_INTERVAL / UPLOADERS.length apart, so their
// refreshes are spread over the interval instead of firing together.
UPLOADERS.reduce(async (promise, user) => {
  await promise
  const fn = async () => {
    console.log(`Updating state for ${user}...`)
    try {
      await updateState(user)
      console.log(`Updated state for ${user}.`)
    } catch (e) {
      // `fn` is invoked by timers, nobody awaits it: report the failure here
      // so it does not surface as an unhandled promise rejection.
      console.error(`Update failed for ${user}`, e)
    }
  }
  setImmediate(fn) // first refresh right away
  setInterval(fn, UPDATE_INTERVAL) // then once per interval
  await wait(UPDATE_INTERVAL / UPLOADERS.length)
}, Promise.resolve())
/// network in | |
// GET /user/:user — serve the cached comment feed of a tracked uploader.
// The handler is synchronous on purpose: it awaits nothing, and a synchronous
// throw is routed to Express' error handling whereas a rejected async handler
// would leave the request hanging.
app.get('/user/:user', (req, res) => {
  const { user } = req.params
  const { logs } = cache.get(user) || {}
  if (!logs) {
    // Tracked uploader whose first refresh has not completed yet -> 503;
    // any other name has no feed at all -> 404.
    return res.sendStatus(UPLOADERS.includes(user) ? 503 : 404)
  }
  // logsRss returns the output of feed.atom1(), i.e. an Atom document.
  res.set('Content-Type', 'application/atom+xml')
  res.send(logsRss(user, logs))
})

app.listen(PORT, () => console.log(`App running on port ${PORT}!`))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment