Created August 20, 2025 08:17
A Reddit subreddit scraper that reads posts and comments and outputs a text file.
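To run it locally, a quick sketch (assuming the script is saved as scrape.js, as in the usage example inside it, and that Node.js is available):

    npm install axios
    node scrape.js javascript

The output then lands in javascript_posts.txt next to the script.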
/**
The script is designed to be respectful of Reddit's API limits while comprehensively collecting both posts
and comments for offline analysis or archival purposes.
I have used it to scrape subreddits and throw the text into NotebookLM.
Downloading a subreddit takes around 30 minutes (depending on how busy it is), since the rate limits force frequent pauses.

1. Scrapes recent posts from a specified subreddit (last 30 days) using Reddit's JSON API
2. Fetches comments for each post along with the post content
3. Saves everything to a text file named {subreddit}_posts.txt with formatted content including
   (an illustrative layout sketch follows this comment block):
   • Post title, author, URL, score, and text
   • Top-level comments with their authors
4. Handles rate limiting with delays (2 seconds between pages, 10-30 seconds on rate limit errors)
5. Paginates through results to get all posts from the past 30 days
6. Takes the subreddit name as a command-line argument (defaults to 'nodejs' if none is provided)
**/
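// For reference, each record appended to {subreddit}_posts.txt looks roughly like this
// (illustrative placeholder values, not real data; the exact text comes from the template
// built in scrapeSubreddit below):
//
//   --- Post Title: Example post title ---
//   Author: example_user
//   URL: https://www.reddit.com/r/nodejs/comments/xxxxxx/example_post_title/
//   Score: 42
//   Text: Body of the self post, if any.
//
//   --- Comments ---
//    Comment by u/another_user:
//    Top-level comment text.
//
//   ==================================================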
const axios = require('axios');
const fs = require('fs');
const path = require('path');

// Function to calculate the timestamp for 30 days ago
function getThirtyDaysAgoTimestamp() {
  const d = new Date();
  d.setDate(d.getDate() - 30);
  return d.getTime() / 1000;
}

// A simple delay function
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));

// Function to fetch the top-level comments for a post
async function getComments(permalink) {
  const commentsUrl = `https://www.reddit.com${permalink}.json`;
  try {
    const response = await axios.get(commentsUrl, {
      headers: {
        'User-Agent': 'MyRedditScraper/1.0'
      }
    });
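    // The permalink .json endpoint returns an array of two listings:
    // index 0 is the post itself, index 1 is its comment tree.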
    const commentsData = response.data[1].data.children;
    let comments = '';
    if (commentsData && commentsData.length > 0) {
      commentsData.forEach(comment => {
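        // 't1' entries are comments; 'more' placeholders are skipped,
        // and nested replies are not descended into.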
        if (comment.kind === 't1') {
          comments += ` Comment by u/${comment.data.author}:\n`;
          comments += ` ${comment.data.body}\n\n`;
        }
      });
    }
    return comments;
  } catch (error) {
    console.error(`Error fetching comments for ${permalink}:`, error.message);
    // Add a delay on error to avoid further rate-limiting
    if (error.response && error.response.status === 429) {
      console.log("Rate limit hit. Waiting for 10 seconds before retrying...");
      await delay(10000); // Wait for 10 seconds
      return getComments(permalink); // Retry the request
    }
    return '';
  }
}
// Main function to scrape the subreddit
async function scrapeSubreddit(subredditName) {
  const thirtyDaysAgo = getThirtyDaysAgoTimestamp();
  const fileName = path.join(__dirname, `${subredditName}_posts.txt`);

  // Clear the file if it already exists
  if (fs.existsSync(fileName)) {
    fs.unlinkSync(fileName);
  }
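  // 'after' holds Reddit's pagination cursor (the fullname of the last post
  // on the previous page); null means start from the newest posts.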
  let after = null;
  let keepFetching = true;

  console.log(`Scraping r/${subredditName} and writing to ${fileName}...`);

  while (keepFetching) {
    let url = `https://www.reddit.com/r/${subredditName}/new.json?limit=100`;
    if (after) {
      url += `&after=${after}`;
    }

    try {
      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'MyRedditScraper/1.0'
        }
      });

      const posts = response.data.data.children;
      if (posts.length === 0) {
        break; // No more posts to fetch
      }

      for (const post of posts) {
        const postData = post.data;
        if (postData.created_utc < thirtyDaysAgo) {
          keepFetching = false;
          break;
        }

        const commentsContent = await getComments(postData.permalink);

        const content = `--- Post Title: ${postData.title} ---\n` +
          `Author: ${postData.author}\n` +
          `URL: ${postData.url}\n` +
          `Score: ${postData.score}\n` +
          `Text: ${postData.selftext}\n\n` +
          `--- Comments ---\n` +
          commentsContent +
          `\n${'='.repeat(50)}\n\n`;

        fs.appendFileSync(fileName, content, 'utf-8');
      }
      after = response.data.data.after;

      // Reddit returns a null cursor when there are no more pages; without this
      // check the loop would refetch the first page indefinitely.
      if (!after) {
        break;
      }

      // Add a short delay between pages to be more respectful of the API rate limits
      console.log("Page scraped. Waiting for 2 seconds before fetching the next page...");
      await delay(2000);
    } catch (error) {
      console.error(`An error occurred while scraping:`, error.message);
      if (error.response && error.response.status === 429) {
        console.log("Rate limit hit on main loop. Waiting for 30 seconds before retrying...");
        await delay(30000); // Wait for a longer period on the main loop
      } else {
        break; // Exit on other errors
      }
    }
  }

  console.log(`Scraping complete. The data has been saved to ${fileName}.`);
}
// Take the subreddit name from the command line (e.g., node scrape.js javascript);
// defaults to 'nodejs' if none is provided.
const subredditToScrape = process.argv[2] || 'nodejs';
scrapeSubreddit(subredditToScrape);