blob42 · August 26, 2018 20:06 · shelvacu · Jan 5, 2018
diff --git a/reddit_scraper.py b/reddit_scraper.py

 # -*- coding: utf-8 -*-
 """
 Created on 01 Mar 2017

 @inspired from: cheesinglee
 GistID: 2df0354d5faf3725200bf968374c8f80
 """

 import praw

 from csv import DictWriter
 from datetime import datetime
 from time import gmtime
 import sys

 import os  # local to this block: used only to read credentials below

 # Subreddits whose listings are scraped by the __main__ block.
 SUBREDDITS = ['explorables']

 # Submission attributes copied into every output row by process_post().
 POST_KEYS = ['url', 'title','created_utc','score','subreddit','domain','is_self','over_18','selftext', 'downs', 'ups']

 # Keyword arguments for praw.Reddit(). Secrets must never live in source
 # control, so they are read from the environment; a variable that is not
 # set yields None. Any credentials that were previously committed in this
 # file should be treated as leaked and rotated.
 #   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_PASSWORD  (required)
 #   REDDIT_USERNAME                                          (optional)
 AUTH_PARAMS = {
     'client_id': os.environ.get('REDDIT_CLIENT_ID'),
     'client_secret': os.environ.get('REDDIT_CLIENT_SECRET'),
     'password': os.environ.get('REDDIT_PASSWORD'),
     'username': os.environ.get('REDDIT_USERNAME', 'explorables'),
     'user_agent': 'linux:explorables:0.1 (by /u/explorables)',
 }

 # When True, process_post() also merges get_author_info() fields into rows.
 SCRAPE_AUTHORS = False

 # Cache of author-info dicts keyed by author id (filled by get_author_info).
 processed_users = {}

 def get_author_info(a):
     """Return a flat dict of author fields for the author object ``a``.

     The result is cached in the module-level ``processed_users`` dict under
     ``a.id``; a repeated call for the same id returns the cached dict itself.
     A falsy ``a`` yields a dict with the same keys, an empty string for
     ``author_name`` and ``None`` for every other field (nothing is cached).
     """
     # (output key, struct_time attribute) pairs, in output order.
     created_columns = (
         ('author_created_year_utc', 'tm_year'),
         ('author_created_mon_utc', 'tm_mon'),
         ('author_created_day_of_year_utc', 'tm_yday'),
         ('author_created_day_of_month_utc', 'tm_mday'),
         ('author_created_day_of_week_utc', 'tm_wday'),
         ('author_created_hour_utc', 'tm_hour'),
         ('author_created_min_utc', 'tm_min'),
         ('author_created_sec_utc', 'tm_sec'),
     )

     if not a:
         placeholder = {
             'author_name': '',
             'author_over_18': None,
             'author_is_mod': None,
             'author_is_gold': None,
         }
         for column, _ in created_columns:
             placeholder[column] = None
         return placeholder

     author_id = a.id
     if author_id in processed_users:
         return processed_users[author_id]

     info = {
         'author_name': a.name,
         'author_over_18': a.over_18,
         'author_is_mod': a.is_mod,
         'author_is_gold': a.is_gold,
     }
     # Break the creation timestamp into its UTC calendar components.
     created = gmtime(a.created_utc)
     for column, attr in created_columns:
         info[column] = getattr(created, attr)

     processed_users[author_id] = info
     return info

 def process_post(post):
     """Flatten one submission object into a dict used as a CSV row.

     Copies every attribute named in ``POST_KEYS`` from ``vars(post)``
     (string values are lower-cased), then adds ``has_thumbnail``,
     ``image_url``, ``n_comments`` and ``comments`` (a list of comment
     bodies). When ``SCRAPE_AUTHORS`` is true, the fields returned by
     ``get_author_info(post.author)`` are merged in as well.

     ``image_url`` is always present ('' when there is no usable preview) so
     that every row has the same keys; csv.DictWriter raises ValueError when
     a row contains a key that is missing from its header.

     Raises:
         KeyError: if ``post`` lacks one of the ``POST_KEYS`` attributes.
     """
     row = {}
     attrs = vars(post)
     for key in POST_KEYS:
         val = attrs[key]
         # Lower-case real strings only. Other values (numbers, booleans,
         # nested objects) are stored untouched instead of calling .lower()
         # on them and hiding whatever goes wrong behind a bare except.
         if isinstance(val, str):
             val = val.lower()
         row[key] = val

     row['has_thumbnail'] = post.thumbnail not in ('default', 'self')

     # A post can report a thumbnail without exposing a preview image, so
     # the lookup is best-effort and falls back to the empty default.
     row['image_url'] = ''
     if row['has_thumbnail']:
         try:
             row['image_url'] = post.preview['images'][0]['source']['url']
         except (AttributeError, KeyError, IndexError, TypeError):
             pass

     # Drop the unexpanded "more comments" placeholders, then flatten the tree.
     post.comments.replace_more(limit=0)
     bodies = [comment.body for comment in post.comments.list()]
     row['n_comments'] = len(bodies)
     row['comments'] = bodies

     if SCRAPE_AUTHORS:
         row.update(get_author_info(post.author))
     return row


 def write_to_file(csv_writer, post, fileobj=None):
     """Write one row and flush it to disk immediately.

     Args:
         csv_writer: object with a ``writerow`` method (a csv.DictWriter).
         post: the row dict to write.
         fileobj: the file object underlying ``csv_writer``. When omitted,
             the module-level ``fid`` opened by the ``__main__`` block is
             used, which keeps existing two-argument calls working.

     Raises:
         NameError: if ``fileobj`` is omitted and no global ``fid`` exists
             (e.g. the module was imported instead of run as a script).
     """
     csv_writer.writerow(post)
     if fileobj is None:
         # Legacy path: rely on the file handle created by the script body.
         fileobj = fid
     # Flush per row so a crash or interrupt mid-scrape loses no written rows.
     fileobj.flush()

 if __name__ == '__main__':
     # Script entry point: for every subreddit in SUBREDDITS, walk its new,
     # top and controversial listings and stream each not-yet-seen post into
     # a single timestamped CSV file.
     r = praw.Reddit(**AUTH_PARAMS)

     # posts = {post_id: post_content}; also used to skip duplicates that
     # show up in more than one listing.
     posts = {}
     csv_writer = None

     if not SUBREDDITS:
         print('Choose a subreddit ...')
         sys.exit(0)

     filename = 'reddit_'+ '+'.join(SUBREDDITS) + '_' + datetime.now().isoformat() + '.csv'

     # newline='' is required by the csv module (otherwise rows get an extra
     # blank line on platforms that translate '\n'); utf-8 keeps non-ASCII
     # titles/comments from failing under a non-UTF-8 default locale.
     # The handle must keep the module-global name `fid`: write_to_file()
     # flushes it after every row.
     with open(filename, 'w', newline='', encoding='utf-8') as fid:
         for subreddit in SUBREDDITS:
             print('scraping subreddit:',subreddit)
             sub = r.subreddit(subreddit)

             # (label, listing method, positional args) for each pass.
             listings = (
                 ('new', sub.new, ()),
                 ('top', sub.top, ('all',)),
                 ('controversial', sub.controversial, ('all',)),
             )
             for label, fetch, args in listings:
                 print('scraping ' + label + ' posts...')
                 for post in fetch(*args):
                     if post.id in posts:
                         continue
                     print(post.title)
                     row = process_post(post)
                     posts[post.id] = row
                     # Create the writer from the first row seen in *any*
                     # listing, so an empty "new" listing no longer leaves
                     # csv_writer as None for the later passes.
                     if csv_writer is None:
                         csv_writer = DictWriter(fid, list(row.keys()))
                         csv_writer.writeheader()
                     write_to_file(csv_writer, row)

     print('scraped ',len(posts),' posts')

	# -*- coding: utf-8 -*-
	"""
	Created on 01 Mar 2017

	@inspired from: cheesinglee
	GistID: 2df0354d5faf3725200bf968374c8f80
	"""

	import praw

	from csv import DictWriter
	from datetime import datetime
	from time import gmtime
	import sys

	import os  # local to this block: used only to read credentials below

	# Subreddits whose listings are scraped by the __main__ block.
	SUBREDDITS = ['explorables']

	# Submission attributes copied into every output row by process_post().
	POST_KEYS = ['url', 'title','created_utc','score','subreddit','domain','is_self','over_18','selftext', 'downs', 'ups']

	# Keyword arguments for praw.Reddit(). Secrets must never live in source
	# control, so they are read from the environment; a variable that is not
	# set yields None. Any credentials that were previously committed in this
	# file should be treated as leaked and rotated.
	#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_PASSWORD  (required)
	#   REDDIT_USERNAME                                          (optional)
	AUTH_PARAMS = {
	    'client_id': os.environ.get('REDDIT_CLIENT_ID'),
	    'client_secret': os.environ.get('REDDIT_CLIENT_SECRET'),
	    'password': os.environ.get('REDDIT_PASSWORD'),
	    'username': os.environ.get('REDDIT_USERNAME', 'explorables'),
	    'user_agent': 'linux:explorables:0.1 (by /u/explorables)',
	}

	# When True, process_post() also merges get_author_info() fields into rows.
	SCRAPE_AUTHORS = False

	# Cache of author-info dicts keyed by author id (filled by get_author_info).
	processed_users = {}

	def get_author_info(a):
	    """Return a flat dict of author fields for the author object ``a``.

	    The result is cached in the module-level ``processed_users`` dict under
	    ``a.id``; a repeated call for the same id returns the cached dict itself.
	    A falsy ``a`` yields a dict with the same keys, an empty string for
	    ``author_name`` and ``None`` for every other field (nothing is cached).
	    """
	    # (output key, struct_time attribute) pairs, in output order.
	    created_columns = (
	        ('author_created_year_utc', 'tm_year'),
	        ('author_created_mon_utc', 'tm_mon'),
	        ('author_created_day_of_year_utc', 'tm_yday'),
	        ('author_created_day_of_month_utc', 'tm_mday'),
	        ('author_created_day_of_week_utc', 'tm_wday'),
	        ('author_created_hour_utc', 'tm_hour'),
	        ('author_created_min_utc', 'tm_min'),
	        ('author_created_sec_utc', 'tm_sec'),
	    )

	    if not a:
	        placeholder = {
	            'author_name': '',
	            'author_over_18': None,
	            'author_is_mod': None,
	            'author_is_gold': None,
	        }
	        for column, _ in created_columns:
	            placeholder[column] = None
	        return placeholder

	    author_id = a.id
	    if author_id in processed_users:
	        return processed_users[author_id]

	    info = {
	        'author_name': a.name,
	        'author_over_18': a.over_18,
	        'author_is_mod': a.is_mod,
	        'author_is_gold': a.is_gold,
	    }
	    # Break the creation timestamp into its UTC calendar components.
	    created = gmtime(a.created_utc)
	    for column, attr in created_columns:
	        info[column] = getattr(created, attr)

	    processed_users[author_id] = info
	    return info

	def process_post(post):
	    """Flatten one submission object into a dict used as a CSV row.

	    Copies every attribute named in ``POST_KEYS`` from ``vars(post)``
	    (string values are lower-cased), then adds ``has_thumbnail``,
	    ``image_url``, ``n_comments`` and ``comments`` (a list of comment
	    bodies). When ``SCRAPE_AUTHORS`` is true, the fields returned by
	    ``get_author_info(post.author)`` are merged in as well.

	    ``image_url`` is always present ('' when there is no usable preview) so
	    that every row has the same keys; csv.DictWriter raises ValueError when
	    a row contains a key that is missing from its header.

	    Raises:
	        KeyError: if ``post`` lacks one of the ``POST_KEYS`` attributes.
	    """
	    row = {}
	    attrs = vars(post)
	    for key in POST_KEYS:
	        val = attrs[key]
	        # Lower-case real strings only. Other values (numbers, booleans,
	        # nested objects) are stored untouched instead of calling .lower()
	        # on them and hiding whatever goes wrong behind a bare except.
	        if isinstance(val, str):
	            val = val.lower()
	        row[key] = val

	    row['has_thumbnail'] = post.thumbnail not in ('default', 'self')

	    # A post can report a thumbnail without exposing a preview image, so
	    # the lookup is best-effort and falls back to the empty default.
	    row['image_url'] = ''
	    if row['has_thumbnail']:
	        try:
	            row['image_url'] = post.preview['images'][0]['source']['url']
	        except (AttributeError, KeyError, IndexError, TypeError):
	            pass

	    # Drop the unexpanded "more comments" placeholders, then flatten the tree.
	    post.comments.replace_more(limit=0)
	    bodies = [comment.body for comment in post.comments.list()]
	    row['n_comments'] = len(bodies)
	    row['comments'] = bodies

	    if SCRAPE_AUTHORS:
	        row.update(get_author_info(post.author))
	    return row


	def write_to_file(csv_writer, post, fileobj=None):
	    """Write one row and flush it to disk immediately.

	    Args:
	        csv_writer: object with a ``writerow`` method (a csv.DictWriter).
	        post: the row dict to write.
	        fileobj: the file object underlying ``csv_writer``. When omitted,
	            the module-level ``fid`` opened by the ``__main__`` block is
	            used, which keeps existing two-argument calls working.

	    Raises:
	        NameError: if ``fileobj`` is omitted and no global ``fid`` exists
	            (e.g. the module was imported instead of run as a script).
	    """
	    csv_writer.writerow(post)
	    if fileobj is None:
	        # Legacy path: rely on the file handle created by the script body.
	        fileobj = fid
	    # Flush per row so a crash or interrupt mid-scrape loses no written rows.
	    fileobj.flush()

	if __name__ == '__main__':
	    # Script entry point: for every subreddit in SUBREDDITS, walk its new,
	    # top and controversial listings and stream each not-yet-seen post into
	    # a single timestamped CSV file.
	    r = praw.Reddit(**AUTH_PARAMS)

	    # posts = {post_id: post_content}; also used to skip duplicates that
	    # show up in more than one listing.
	    posts = {}
	    csv_writer = None

	    if not SUBREDDITS:
	        print('Choose a subreddit ...')
	        sys.exit(0)

	    filename = 'reddit_'+ '+'.join(SUBREDDITS) + '_' + datetime.now().isoformat() + '.csv'

	    # newline='' is required by the csv module (otherwise rows get an extra
	    # blank line on platforms that translate '\n'); utf-8 keeps non-ASCII
	    # titles/comments from failing under a non-UTF-8 default locale.
	    # The handle must keep the module-global name `fid`: write_to_file()
	    # flushes it after every row.
	    with open(filename, 'w', newline='', encoding='utf-8') as fid:
	        for subreddit in SUBREDDITS:
	            print('scraping subreddit:',subreddit)
	            sub = r.subreddit(subreddit)

	            # (label, listing method, positional args) for each pass.
	            listings = (
	                ('new', sub.new, ()),
	                ('top', sub.top, ('all',)),
	                ('controversial', sub.controversial, ('all',)),
	            )
	            for label, fetch, args in listings:
	                print('scraping ' + label + ' posts...')
	                for post in fetch(*args):
	                    if post.id in posts:
	                        continue
	                    print(post.title)
	                    row = process_post(post)
	                    posts[post.id] = row
	                    # Create the writer from the first row seen in *any*
	                    # listing, so an empty "new" listing no longer leaves
	                    # csv_writer as None for the later passes.
	                    if csv_writer is None:
	                        csv_writer = DictWriter(fid, list(row.keys()))
	                        csv_writer.writeheader()
	                    write_to_file(csv_writer, row)

	    print('scraped ',len(posts),' posts')
No results found