@avnersorek
Created August 20, 2025 08:17
A Reddit subreddit scraper that reads posts and comments and outputs a text file.
/**
The script is designed to be respectful of Reddit's API limits while comprehensively collecting both posts
and comments for offline analysis or archival purposes.
I have used it to scrape subreddits and feed the text into NotebookLM.
Downloading a subreddit takes around 30 minutes (depending on how busy it is) because of the rate-limit delays.
1. Scrapes recent posts from a specified subreddit (last 30 days) using Reddit's JSON API
2. Fetches comments for each post along with the post content
3. Saves everything to a text file named {subreddit}_posts.txt with formatted content including:
• Post title, author, URL, score, and text
  • Top-level comments with their authors
4. Handles rate limiting with delays (2 seconds between pages, 10-30 seconds on rate limit errors)
5. Paginates through results to get all posts from the past 30 days
6. Takes subreddit name as a command-line argument (defaults to 'nodejs' if none provided)
**/
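// Example usage (a minimal sketch, assuming this file is saved locally as scrape.js):
//   node scrape.js programming
// This would write programming_posts.txt into the same directory as the script.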
const axios = require('axios');
const fs = require('fs');
const path = require('path');
// Function to calculate the timestamp for 30 days ago
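// Reddit's created_utc values are Unix timestamps in seconds, hence the division by 1000 below.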
function getThirtyDaysAgoTimestamp() {
  const d = new Date();
  d.setDate(d.getDate() - 30);
  return d.getTime() / 1000;
}
// A simple delay function
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
// Function to fetch the top-level comments for a post
async function getComments(permalink) {
  const commentsUrl = `https://www.reddit.com${permalink}.json`;
  try {
    const response = await axios.get(commentsUrl, {
      headers: {
        'User-Agent': 'MyRedditScraper/1.0'
      }
    });
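    // Appending .json to a post permalink returns a two-element array:
    // element [0] is a listing containing the post itself, element [1] is the comment listing.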
    const commentsData = response.data[1].data.children;
    let comments = '';
    if (commentsData && commentsData.length > 0) {
      commentsData.forEach(comment => {
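        // Kind 't1' marks a regular comment; other kinds (e.g. 'more' stubs for collapsed threads) are skipped,
        // and nested replies under comment.data.replies are not traversed.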
        if (comment.kind === 't1') {
          comments += ` Comment by u/${comment.data.author}:\n`;
          comments += ` ${comment.data.body}\n\n`;
        }
      });
    }
    return comments;
  } catch (error) {
    console.error(`Error fetching comments for ${permalink}:`, error.message);
    // Add a delay on error to avoid further rate-limiting
    if (error.response && error.response.status === 429) {
      console.log("Rate limit hit. Waiting for 10 seconds before retrying...");
      await delay(10000); // Wait for 10 seconds
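      // Note: this retries indefinitely; repeated 429 responses keep recursing until a request succeeds.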
      return getComments(permalink); // Retry the request
    }
    return '';
  }
}
// Main function to scrape the subreddit
async function scrapeSubreddit(subredditName) {
  const thirtyDaysAgo = getThirtyDaysAgoTimestamp();
  const fileName = path.join(__dirname, `${subredditName}_posts.txt`);
  // Clear the file if it already exists
  if (fs.existsSync(fileName)) {
    fs.unlinkSync(fileName);
  }
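  // Reddit listings are paginated with an 'after' cursor (the fullname of the last item on the previous page).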
  let after = null;
  let keepFetching = true;
  console.log(`Scraping r/${subredditName} and writing to ${fileName}...`);
  while (keepFetching) {
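    // /new.json lists posts newest-first; limit=100 is the maximum page size Reddit allows for listings.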
    let url = `https://www.reddit.com/r/${subredditName}/new.json?limit=100`;
    if (after) {
      url += `&after=${after}`;
    }
    try {
      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'MyRedditScraper/1.0'
        }
      });
      const posts = response.data.data.children;
      if (posts.length === 0) {
        break; // No more posts to fetch
      }
      for (const post of posts) {
        const postData = post.data;
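        // Posts arrive newest-first, so the first post older than 30 days means everything after it is older too.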
        if (postData.created_utc < thirtyDaysAgo) {
          keepFetching = false;
          break;
        }
        const commentsContent = await getComments(postData.permalink);
        const content = `--- Post Title: ${postData.title} ---\n` +
          `Author: ${postData.author}\n` +
          `URL: ${postData.url}\n` +
          `Score: ${postData.score}\n` +
          `Text: ${postData.selftext}\n\n` +
          `--- Comments ---\n` +
          commentsContent +
          `\n${'='.repeat(50)}\n\n`;
        fs.appendFileSync(fileName, content, 'utf-8');
      }
      after = response.data.data.after;
      if (!after) {
        break; // Reddit returns a null cursor when there are no further pages
      }
      // Add a short delay between pages to be more respectful of the API rate limits
      console.log("Page scraped. Waiting for 2 seconds before fetching the next page...");
      await delay(2000);
    } catch (error) {
      console.error(`An error occurred while scraping:`, error.message);
      if (error.response && error.response.status === 429) {
        console.log("Rate limit hit on main loop. Waiting for 30 seconds before retrying...");
        await delay(30000); // Wait for a longer period on the main loop
      } else {
        break; // Exit on other errors
      }
    }
  }
  console.log(`Scraping complete. The data has been saved to ${fileName}.`);
}
// Subreddit name comes from the first command-line argument, e.g. `node scrape.js javascript`;
// defaults to 'nodejs' when none is provided.
const subredditToScrape = process.argv[2] || 'nodejs';
scrapeSubreddit(subredditToScrape);