-
-
Save blob42/2df0354d5faf3725200bf968374c8f80 to your computer and use it in GitHub Desktop.
Python subreddit scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on 01 Mar 2017 | |
| @inspired from: cheesinglee | |
| GistID: 2df0354d5faf3725200bf968374c8f80 | |
| """ | |
from csv import DictWriter
from datetime import datetime
import os
import sys
from time import gmtime

import praw
# Subreddits to scrape; their names are also joined into the output filename.
SUBREDDITS = ['explorables']

# Submission attributes copied into every output row by process_post().
POST_KEYS = [
    'url', 'title', 'created_utc', 'score', 'subreddit', 'domain',
    'is_self', 'over_18', 'selftext', 'downs', 'ups',
]

USER_AGENT = 'linux:explorables:0.1 (by /u/explorables)'

# praw.Reddit keyword argument -> environment variable that supplies it.
CREDENTIAL_ENV_VARS = {
    'client_id': 'REDDIT_CLIENT_ID',
    'client_secret': 'REDDIT_CLIENT_SECRET',
    'password': 'REDDIT_PASSWORD',
    'username': 'REDDIT_USERNAME',
}


def load_auth_params(environ=None):
    """Build the keyword arguments for ``praw.Reddit`` from the environment.

    SECURITY: this file used to embed a client secret and an account password
    as string literals, and it was published that way.  Those values must be
    treated as compromised and rotated; secrets are now read from the
    environment and never stored in the source.

    Args:
        environ: mapping to read the variables from; defaults to
            ``os.environ``.

    Returns:
        A dict that always contains ``user_agent`` plus one entry for each
        credential whose environment variable is set to a non-empty value.
        Unset credentials are omitted rather than passed as empty strings.
    """
    if environ is None:
        environ = os.environ
    params = {'user_agent': USER_AGENT}
    for option, variable in CREDENTIAL_ENV_VARS.items():
        value = environ.get(variable)
        if value:
            params[option] = value
    return params


AUTH_PARAMS = load_auth_params()

# When True, process_post() also adds the author_* columns.
SCRAPE_AUTHORS = False

# Cache of author info dicts keyed by author id (filled by get_author_info).
processed_users = {}
def get_author_info(a):
    """Return the ``author_*`` columns for one post author.

    Args:
        a: the author object of a post, or a falsy value when there is none.
            A truthy ``a`` must expose ``id``, ``name``, ``over_18``,
            ``is_mod``, ``is_gold`` and ``created_utc``.

    Returns:
        A dict with the twelve ``author_*`` keys.  For a falsy ``a`` the name
        is ``''`` and every other value is ``None``.  For a real author the
        dict is cached in ``processed_users`` under ``a.id`` and the same
        object is returned on later calls.
    """
    if not a:
        # Same columns as a real author so every row has an identical shape.
        placeholder = dict.fromkeys((
            'author_name',
            'author_over_18',
            'author_is_mod',
            'author_is_gold',
            'author_created_year_utc',
            'author_created_mon_utc',
            'author_created_day_of_year_utc',
            'author_created_day_of_month_utc',
            'author_created_day_of_week_utc',
            'author_created_hour_utc',
            'author_created_min_utc',
            'author_created_sec_utc',
        ))
        placeholder['author_name'] = ''
        return placeholder

    author_id = a.id
    if author_id in processed_users:
        return processed_users[author_id]

    info = {
        'author_name': a.name,
        'author_over_18': a.over_18,
        'author_is_mod': a.is_mod,
        'author_is_gold': a.is_gold,
    }
    # Break the account creation timestamp into its UTC calendar fields.
    created = gmtime(a.created_utc)
    info.update(
        author_created_year_utc=created.tm_year,
        author_created_mon_utc=created.tm_mon,
        author_created_day_of_year_utc=created.tm_yday,
        author_created_day_of_month_utc=created.tm_mday,
        author_created_day_of_week_utc=created.tm_wday,
        author_created_hour_utc=created.tm_hour,
        author_created_min_utc=created.tm_min,
        author_created_sec_utc=created.tm_sec,
    )
    processed_users[author_id] = info
    return info
def _preview_image_url(post):
    """Return the source URL of the post's first preview image, or None."""
    # Not every post that reports a thumbnail carries a ``preview`` attribute.
    preview = getattr(post, 'preview', None)
    try:
        return preview['images'][0]['source']['url']
    except (TypeError, KeyError, IndexError):
        return None


def process_post(post):
    """Flatten one submission into a dict that becomes a single CSV row.

    Args:
        post: a submission object.  Every name in ``POST_KEYS`` must be
            present in ``vars(post)``, and the object must expose
            ``thumbnail``, ``comments`` and (if ``SCRAPE_AUTHORS``) ``author``.

    Returns:
        A dict holding, in order: the ``POST_KEYS`` values (strings
        lower-cased), ``has_thumbnail``, ``image_url``, ``n_comments``,
        ``comments`` and, when ``SCRAPE_AUTHORS`` is set, the ``author_*``
        columns.  The key set is the same for every post, so a
        ``csv.DictWriter`` whose header came from the first row accepts all
        later rows.

    Raises:
        KeyError: if a name in ``POST_KEYS`` is missing from ``vars(post)``.
    """
    row = {}
    attributes = vars(post)
    for key in POST_KEYS:
        value = attributes[key]
        # Only text is normalised; numbers, booleans and other objects are
        # stored untouched.  NOTE(review): this also lower-cases 'url', which
        # can break case-sensitive links -- confirm that is intended.
        if isinstance(value, str):
            value = value.lower()
        row[key] = value

    row['has_thumbnail'] = post.thumbnail not in ('default', 'self')
    # Always emit the column (None when there is no image) instead of adding
    # it only for some posts.
    row['image_url'] = _preview_image_url(post) if row['has_thumbnail'] else None

    # limit=0: see the PRAW replace_more documentation for the exact effect on
    # "load more comments" placeholders.
    post.comments.replace_more(limit=0)
    comments = post.comments.list()
    row['n_comments'] = len(comments)
    row['comments'] = [comment.body for comment in comments]

    if SCRAPE_AUTHORS:
        row.update(get_author_info(post.author))
    return row
def write_to_file(csv_writer, post, stream=None):
    """Write one processed post as a CSV row and flush it to disk.

    Args:
        csv_writer: a ``csv.DictWriter`` (or any object with ``writerow``).
        post: the row dict to write.
        stream: the file object behind ``csv_writer``.  When omitted, the
            module-level ``fid`` set by the script's ``with open(...)`` block
            is used, which keeps existing two-argument callers working.

    Flushing after every row means rows already scraped are not lost if a
    later request fails.  If no stream can be found the flush is skipped
    rather than raising ``NameError``.
    """
    csv_writer.writerow(post)
    if stream is None:
        stream = globals().get('fid')
    if stream is not None:
        stream.flush()
if __name__ == '__main__':
    # Scrape the new, top and controversial listings of every configured
    # subreddit into one CSV file, writing each post at most once.
    if not SUBREDDITS:
        print('Choose a subreddit ...')
        sys.exit(0)

    r = praw.Reddit(**AUTH_PARAMS)
    # Maps post id -> processed row; also used to skip posts already written.
    posts = {}
    csv_writer = None
    filename = 'reddit_' + '+'.join(SUBREDDITS) + '_' + datetime.now().isoformat() + '.csv'
    # newline='' is what the csv module asks for on files it writes, and an
    # explicit encoding keeps non-ASCII titles from depending on the locale.
    # The handle stays bound to the module-level name ``fid`` because
    # write_to_file() flushes that name when it is not given a stream.
    with open(filename, 'w', newline='', encoding='utf-8') as fid:
        for subreddit in SUBREDDITS:
            print('scraping subreddit:', subreddit)
            sub = r.subreddit(subreddit)
            listings = (
                ('new', sub.new()),
                ('top', sub.top('all')),
                ('controversial', sub.controversial('all')),
            )
            for label, listing in listings:
                print('scraping ' + label + ' posts...')
                for post in listing:
                    if post.id in posts:
                        continue
                    print(post.title)
                    row = process_post(post)
                    posts[post.id] = row
                    # Create the writer from the first row seen in ANY
                    # listing, so an empty "new" listing no longer leaves it
                    # as None for the later listings.
                    if csv_writer is None:
                        csv_writer = DictWriter(fid, list(row.keys()))
                        csv_writer.writeheader()
                    write_to_file(csv_writer, row)
    print('scraped ', len(posts), ' posts')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You should probably remove the password and api keys...