Created August 20, 2025 08:17
A Reddit subreddit scraper that reads posts and comments and outputs a text file.
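To run it locally, a quick sketch (assuming the script is saved as scrape.js, as in the usage example inside it, and that Node.js is available):

    npm install axios
    node scrape.js javascript

The output then lands in javascript_posts.txt next to the script.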
/**
The script is designed to be respectful of Reddit's API limits while comprehensively collecting both posts
and comments for offline analysis or archival purposes.
I have used it to scrape subreddits and throw the text into NotebookLM.
Downloading a subreddit takes around 30 minutes (depending on how busy it is), since the rate limits force frequent pauses.

1. Scrapes recent posts from a specified subreddit (last 30 days) using Reddit's JSON API
2. Fetches comments for each post along with the post content
3. Saves everything to a text file named {subreddit}_posts.txt with formatted content including
   (an illustrative layout sketch follows this comment block):
   • Post title, author, URL, score, and text
   • Top-level comments with their authors
4. Handles rate limiting with delays (2 seconds between pages, 10-30 seconds on rate limit errors)
5. Paginates through results to get all posts from the past 30 days
6. Takes the subreddit name as a command-line argument (defaults to 'nodejs' if none is provided)
**/
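// For reference, each record appended to {subreddit}_posts.txt looks roughly like this
// (illustrative placeholder values, not real data; the exact text comes from the template
// built in scrapeSubreddit below):
//
//   --- Post Title: Example post title ---
//   Author: example_user
//   URL: https://www.reddit.com/r/nodejs/comments/xxxxxx/example_post_title/
//   Score: 42
//   Text: Body of the self post, if any.
//
//   --- Comments ---
//    Comment by u/another_user:
//    Top-level comment text.
//
//   ==================================================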
const axios = require('axios');
const fs = require('fs');
const path = require('path');

// Function to calculate the timestamp for 30 days ago
function getThirtyDaysAgoTimestamp() {
  const d = new Date();
  d.setDate(d.getDate() - 30);
  return d.getTime() / 1000;
}

// A simple delay function
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));

// Function to fetch the top-level comments for a post
async function getComments(permalink) {
  const commentsUrl = `https://www.reddit.com${permalink}.json`;
  try {
    const response = await axios.get(commentsUrl, {
      headers: {
        'User-Agent': 'MyRedditScraper/1.0'
      }
    });
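    // The permalink .json endpoint returns an array of two listings:
    // index 0 is the post itself, index 1 is its comment tree.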
    const commentsData = response.data[1].data.children;
    let comments = '';
    if (commentsData && commentsData.length > 0) {
      commentsData.forEach(comment => {
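        // 't1' entries are comments; 'more' placeholders are skipped,
        // and nested replies are not descended into.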
        if (comment.kind === 't1') {
          comments += ` Comment by u/${comment.data.author}:\n`;
          comments += ` ${comment.data.body}\n\n`;
        }
      });
    }
    return comments;
  } catch (error) {
    console.error(`Error fetching comments for ${permalink}:`, error.message);
    // Add a delay on error to avoid further rate-limiting
    if (error.response && error.response.status === 429) {
      console.log("Rate limit hit. Waiting for 10 seconds before retrying...");
      await delay(10000); // Wait for 10 seconds
      return getComments(permalink); // Retry the request
    }
    return '';
  }
}
// Main function to scrape the subreddit
async function scrapeSubreddit(subredditName) {
  const thirtyDaysAgo = getThirtyDaysAgoTimestamp();
  const fileName = path.join(__dirname, `${subredditName}_posts.txt`);

  // Clear the file if it already exists
  if (fs.existsSync(fileName)) {
    fs.unlinkSync(fileName);
  }
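  // 'after' holds Reddit's pagination cursor (the fullname of the last post
  // on the previous page); null means start from the newest posts.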
  let after = null;
  let keepFetching = true;

  console.log(`Scraping r/${subredditName} and writing to ${fileName}...`);

  while (keepFetching) {
    let url = `https://www.reddit.com/r/${subredditName}/new.json?limit=100`;
    if (after) {
      url += `&after=${after}`;
    }

    try {
      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'MyRedditScraper/1.0'
        }
      });

      const posts = response.data.data.children;
      if (posts.length === 0) {
        break; // No more posts to fetch
      }

      for (const post of posts) {
        const postData = post.data;
        if (postData.created_utc < thirtyDaysAgo) {
          keepFetching = false;
          break;
        }

        const commentsContent = await getComments(postData.permalink);

        const content = `--- Post Title: ${postData.title} ---\n` +
          `Author: ${postData.author}\n` +
          `URL: ${postData.url}\n` +
          `Score: ${postData.score}\n` +
          `Text: ${postData.selftext}\n\n` +
          `--- Comments ---\n` +
          commentsContent +
          `\n${'='.repeat(50)}\n\n`;

        fs.appendFileSync(fileName, content, 'utf-8');
      }
      after = response.data.data.after;

      // Reddit returns a null cursor when there are no more pages; without this
      // check the loop would refetch the first page indefinitely.
      if (!after) {
        break;
      }

      // Add a short delay between pages to be more respectful of the API rate limits
      console.log("Page scraped. Waiting for 2 seconds before fetching the next page...");
      await delay(2000);
    } catch (error) {
      console.error(`An error occurred while scraping:`, error.message);
      if (error.response && error.response.status === 429) {
        console.log("Rate limit hit on main loop. Waiting for 30 seconds before retrying...");
        await delay(30000); // Wait for a longer period on the main loop
      } else {
        break; // Exit on other errors
      }
    }
  }

  console.log(`Scraping complete. The data has been saved to ${fileName}.`);
}
// Take the subreddit name from the command line (e.g., node scrape.js javascript);
// defaults to 'nodejs' if none is provided.
const subredditToScrape = process.argv[2] || 'nodejs';
scrapeSubreddit(subredditToScrape);