Skip to content

Instantly share code, notes, and snippets.

@blob42
Forked from cheesinglee/reddit_scraper.py
Last active August 26, 2018 20:06
Show Gist options
  • Select an option

  • Save blob42/2df0354d5faf3725200bf968374c8f80 to your computer and use it in GitHub Desktop.

Select an option

Save blob42/2df0354d5faf3725200bf968374c8f80 to your computer and use it in GitHub Desktop.
Python subreddit scraper
# -*- coding: utf-8 -*-
"""
Created on 01 Mar 2017
@inspired by: cheesinglee
GistID: 2df0354d5faf3725200bf968374c8f80
"""
import os
import sys
from csv import DictWriter
from datetime import datetime
from time import gmtime

import praw
# Subreddits to scrape; the output file name is built from this list.
SUBREDDITS = ['explorables']

# Submission attributes copied verbatim (strings lower-cased) into each row.
POST_KEYS = ['url', 'title', 'created_utc', 'score', 'subreddit', 'domain',
             'is_self', 'over_18', 'selftext', 'downs', 'ups']

# Keyword arguments for praw.Reddit().
# SECURITY: credentials must never be committed to source control. They are
# read from the environment instead; a variable that is not set yields None.
# Any secret that was previously published in this file should be treated as
# compromised and revoked.
AUTH_PARAMS = {
    'client_id': os.environ.get('REDDIT_CLIENT_ID'),
    'client_secret': os.environ.get('REDDIT_CLIENT_SECRET'),
    'password': os.environ.get('REDDIT_PASSWORD'),
    'username': os.environ.get('REDDIT_USERNAME', 'explorables'),
    'user_agent': 'linux:explorables:0.1 (by /u/explorables)',
}

# When True, process_post() also adds the author_* columns to each row.
SCRAPE_AUTHORS = False

# Cache of author info dicts keyed by author id (filled by get_author_info).
processed_users = {}
def get_author_info(a):
    """Return a flat dict of ``author_*`` fields describing author *a*.

    A falsy *a* (for example ``None``) produces a dict with the same keys,
    an empty ``author_name`` and ``None`` for every other field. Results for
    real authors are memoised in the module-level ``processed_users`` dict,
    keyed by ``a.id``, and the cached dict object itself is returned on
    later calls.
    """
    if not a:
        return {
            'author_name': '',
            'author_over_18': None,
            'author_is_mod': None,
            'author_is_gold': None,
            'author_created_year_utc': None,
            'author_created_mon_utc': None,
            'author_created_day_of_year_utc': None,
            'author_created_day_of_month_utc': None,
            'author_created_day_of_week_utc': None,
            'author_created_hour_utc': None,
            'author_created_min_utc': None,
            'author_created_sec_utc': None,
        }

    author_id = a.id
    if author_id in processed_users:
        return processed_users[author_id]

    info = {
        'author_name': a.name,
        'author_over_18': a.over_18,
        'author_is_mod': a.is_mod,
        'author_is_gold': a.is_gold,
    }
    # Break the account creation timestamp into its UTC calendar components.
    created = gmtime(a.created_utc)
    info.update(
        author_created_year_utc=created.tm_year,
        author_created_mon_utc=created.tm_mon,
        author_created_day_of_year_utc=created.tm_yday,
        author_created_day_of_month_utc=created.tm_mday,
        author_created_day_of_week_utc=created.tm_wday,
        author_created_hour_utc=created.tm_hour,
        author_created_min_utc=created.tm_min,
        author_created_sec_utc=created.tm_sec,
    )
    processed_users[author_id] = info
    return info
def _preview_image_url(post):
    """Return the source URL of the post's first preview image, or None."""
    preview = getattr(post, 'preview', None)
    try:
        return preview['images'][0]['source']['url']
    except (TypeError, KeyError, IndexError):
        # No preview attribute, or it lacks the expected nesting.
        return None


def process_post(post):
    """Flatten a submission into a dict suitable for ``DictWriter.writerow``.

    The row contains, in order: every attribute named in ``POST_KEYS``
    (values that have a ``lower()`` method are lower-cased), then
    ``has_thumbnail``, ``image_url``, ``n_comments`` and ``comments`` (a list
    of comment bodies). When ``SCRAPE_AUTHORS`` is true the ``author_*``
    fields from ``get_author_info`` are appended.

    ``image_url`` is always present (``None`` when there is no thumbnail or
    no usable preview data) so that every row has the same keys; a key that
    appears only on some rows makes ``DictWriter`` raise ``ValueError`` once
    the header has been fixed from an earlier row.

    Raises ``KeyError`` if the post lacks one of the ``POST_KEYS`` attributes.
    """
    d = {}
    postdict = vars(post)
    for key in POST_KEYS:
        val = postdict[key]
        try:
            val = val.lower()
        except AttributeError:
            # Not a string (number, bool, object, ...): keep the value as-is.
            pass
        d[key] = val

    # 'default' and 'self' are placeholder values, not real thumbnails.
    d['has_thumbnail'] = post.thumbnail not in ('default', 'self')
    d['image_url'] = _preview_image_url(post) if d['has_thumbnail'] else None

    # Drop the "load more comments" placeholders so that only real comments
    # remain in the flattened list.
    post.comments.replace_more(limit=0)
    comments = post.comments.list()
    d['n_comments'] = len(comments)
    d['comments'] = [comment.body for comment in comments]

    if SCRAPE_AUTHORS:
        d.update(get_author_info(post.author))
    return d
def write_to_file(csv_writer, post, fileobj=None):
    """Write one row through *csv_writer* and flush the underlying file.

    Args:
        csv_writer: object with a ``writerow`` method (a ``csv.DictWriter``).
        post: the row to write.
        fileobj: file object to flush after writing. When omitted, the
            module-level ``fid`` set by the script entry point is used if it
            exists; if neither is available the row is written but not
            flushed, instead of failing with ``NameError``.
    """
    csv_writer.writerow(post)
    if fileobj is None:
        # Backward compatibility: existing callers rely on the global `fid`.
        fileobj = globals().get('fid')
    if fileobj is not None:
        # Flush per row so partial results survive an interrupted scrape.
        fileobj.flush()
if __name__ == '__main__':
    # Script entry point: scrape the "new", "top" and "controversial"
    # listings of every subreddit in SUBREDDITS into one timestamped CSV.
    #
    # NOTE: this is deliberately kept as module-level code rather than a
    # main() function, because write_to_file() may look up the open file
    # through the module-level name `fid`.
    r = praw.Reddit(**AUTH_PARAMS)

    # Maps post id -> processed row; also used to skip posts that show up in
    # more than one listing.
    posts = {}
    csv_writer = None

    if not SUBREDDITS:
        print('Choose a subreddit ...')
        sys.exit(0)

    filename = 'reddit_' + '+'.join(SUBREDDITS) + '_' + datetime.now().isoformat() + '.csv'
    # newline='' is what the csv module requires of files it writes to; the
    # explicit encoding keeps non-ASCII titles and comments from failing on
    # platforms whose default encoding cannot represent them.
    with open(filename, 'w', newline='', encoding='utf-8') as fid:
        for subreddit in SUBREDDITS:
            print('scraping subreddit:', subreddit)
            sub = r.subreddit(subreddit)
            listings = (
                ('new', sub.new, ()),
                ('top', sub.top, ('all',)),
                ('controversial', sub.controversial, ('all',)),
            )
            for label, fetch, args in listings:
                print('scraping ' + label + ' posts...')
                for post in fetch(*args):
                    if post.id in posts:
                        continue
                    print(post.title)
                    row = process_post(post)
                    posts[post.id] = row
                    if csv_writer is None:
                        # Create the writer lazily from the first row seen in
                        # ANY listing, so an empty "new" listing no longer
                        # leaves csv_writer as None for the later listings.
                        csv_writer = DictWriter(fid, list(row.keys()))
                        csv_writer.writeheader()
                    write_to_file(csv_writer, row)

    print('scraped ', len(posts), ' posts')
@shelvacu
Copy link
Copy Markdown

shelvacu commented Jan 5, 2018

You should probably remove the password and api keys...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment