emallson · November 20, 2015 01:55
diff --git a/collect-timeline.py b/collect-timeline.py
 from twython import Twython, TwythonRateLimitError, TwythonError
 from glob import glob
 from util import sleep_until
 from csv import DictReader, DictWriter
 import os

 APP_KEY = ''
 ACCESS_TOKEN = ''

 tw = Twython(APP_KEY, access_token=ACCESS_TOKEN)
 fields = ['id', 'user', 'created_at', 'lang', 'text']


 def timeline(user_id, max_id=None):
     """Generate the tweets of one user's timeline, page after page.

     Each request asks for up to 200 tweets with an id no greater than
     `max_id` (None on the first request means "no upper bound is sent").
     After every tweet handed out, `max_id` is moved to just below that
     tweet's id, so the following request continues where this page ended.

     Per the original author's note, up to 15 pages can be retrieved; the
     loop is allowed a 16th request so that the terminating empty page is
     actually observed.

     Errors raised by the Twython client are not caught here and surface
     in the consumer while it iterates.
     """
     query = {
         'user_id': user_id,
         'count': 200,
         'trim_user': True,
         'exclude_replies': True,
         'include_rts': False,
     }
     for _ in range(16):
         page = tw.get_user_timeline(max_id=max_id, **query)
         if not page:
             # An empty page marks the end of the timeline.
             return
         for status in page:
             max_id = status['id'] - 1
             yield status


 # A directory of csv files (csvs/*.csv) contains user profiles; the 'id'
 # column of each row names a user whose timeline is written to
 # csvs/timelines/<id>.csv with the columns listed in `fields`.
 # open(..., "w") does not create missing directories, so make sure the
 # output directory is there before the first user is processed.
 os.makedirs("csvs/timelines", exist_ok=True)
 for profile_csv in glob("csvs/*.csv"):
     # newline="" is what the csv module requires of the files it is given;
     # without it embedded newlines and line endings can be mangled.
     with open(profile_csv, "r", newline="") as csvfile:
         for row in DictReader(csvfile):
             path = "csvs/timelines/{}.csv".format(row['id'])
             if os.path.exists(path):
                 continue  # skip users that have already been read.
             # Tweet text is arbitrary Unicode, so do not depend on the
             # platform's default encoding for the output file.
             with open(path, "w", newline="", encoding="utf-8") as timecsv:
                 print(row['id'])
                 writer = DictWriter(timecsv, fields)
                 writer.writeheader()
                 # max_id is used to continue a user's timeline from
                 # the last tweet read in the event that we get
                 # rate-limited in the middle of a user's timeline
                 max_id = None
                 while True:
                     try:
                         for tweet in timeline(row['id'], max_id):
                             sub = {k: v for k, v in tweet.items()
                                    if k in fields}
                             sub['user'] = sub['user']['id']
                             writer.writerow(sub)
                             # Resume strictly below the tweet just
                             # written, exactly as timeline() pages
                             # (id - 1). Resuming at the id itself would
                             # fetch and write this tweet a second time.
                             max_id = sub['id'] - 1
                         break  # whole timeline written
                     except TwythonRateLimitError as e:
                         # sleep until the given date, then retry from max_id
                         sleep_until(e.retry_after)
                     except TwythonError as e:
                         # I *think* this is caused by protected profiles.
                         # I can read some user profile info, but not the
                         # timeline
                         print(e)
                         print("Skipping...")
                         break
diff --git a/util.py b/util.py
 from datetime import datetime
 import time

 def sleep_until(ts):
     """Sleep until the given UTC UNIX TIMESTAMP.

     Args:
         ts: seconds since the epoch (UTC); anything `int()` accepts.

     Returns immediately (after printing the status lines) when `ts` is
     already in the past.
     """
     next_time = datetime.utcfromtimestamp(int(ts))
     now = datetime.utcnow()
     # timedelta.seconds is only the seconds *field* (0..86399) of the
     # delta: a negative delta normalises to days=-1 plus a large seconds
     # value, and whole days are dropped. total_seconds() is the real
     # distance; clamp it because time.sleep() rejects negative values.
     offset = max(0.0, (next_time - now).total_seconds())
     print("Enhancing calm. Next try: {} (Currently {})".format(next_time, now))
     print("Sleeping for {}...".format(offset))
     time.sleep(offset)
     print("Continuing...")
	from twython import Twython, TwythonRateLimitError, TwythonError
	from glob import glob
	from util import sleep_until
	from csv import DictReader, DictWriter
	import os

	APP_KEY = ''
	ACCESS_TOKEN = ''

	tw = Twython(APP_KEY, access_token=ACCESS_TOKEN)
	fields = ['id', 'user', 'created_at', 'lang', 'text']


	def timeline(user_id, max_id=None):
	    """Walk a user's timeline, beginning at max_id, one tweet at a time.

	    Generator. Every request fetches a page of at most 200 tweets; the
	    id of each tweet handed out, minus one, becomes the `max_id` of the
	    next request.

	    The original author notes that up to 15 pages are available, so the
	    loop makes at most 16 requests: the extra one lets the empty-page
	    stop condition trigger.
	    """
	    fixed_args = dict(user_id=user_id,
	                      count=200,
	                      trim_user=True,
	                      exclude_replies=True,
	                      include_rts=False)
	    for _request in range(16):
	        batch = tw.get_user_timeline(max_id=max_id, **fixed_args)
	        if len(batch) == 0:  # last page should have zero results
	            return
	        for item in batch:
	            max_id = item['id'] - 1
	            yield item


	# I have a directory of csv files containing user profiles.
	# Pull the IDs from these and write each user's timeline to
	# csvs/timelines/<id>.csv.
	# The output directory has to exist before open(..., "w") is called.
	os.makedirs("csvs/timelines", exist_ok=True)
	for profile_csv in glob("csvs/*.csv"):
	    # The csv module expects its files to be opened with newline="".
	    with open(profile_csv, "r", newline="") as csvfile:
	        reader = DictReader(csvfile)
	        for row in reader:
	            path = "csvs/timelines/{}.csv".format(row['id'])
	            if os.path.exists(path):
	                continue  # skip users that have already been read.
	            # Explicit utf-8: tweet text can hold any Unicode character
	            # and the platform default encoding may not cover it.
	            with open(path, "w", newline="", encoding="utf-8") as timecsv:
	                print(row['id'])
	                writer = DictWriter(timecsv, fields)
	                writer.writeheader()
	                # max_id is used to continue a user's timeline from
	                # the last tweet read in the event that we get
	                # rate-limited in the middle of a user's timeline
	                max_id = None
	                while True:
	                    try:
	                        for tweet in timeline(row['id'], max_id):
	                            sub = {k: v for k, v in tweet.items()
	                                   if k in fields}
	                            sub['user'] = sub['user']['id']
	                            writer.writerow(sub)
	                            # Continue below the tweet just written
	                            # (timeline() itself pages with id - 1);
	                            # reusing the id would duplicate this row
	                            # after a rate-limit pause.
	                            max_id = sub['id'] - 1
	                        break
	                    except TwythonRateLimitError as e:
	                        sleep_until(e.retry_after)  # sleep until the given date
	                    except TwythonError as e:
	                        # I think this is caused by protected profiles.
	                        # I can read some user profile info, but not the timeline
	                        print(e)
	                        print("Skipping...")
	                        break
	from datetime import datetime
	import time

	def sleep_until(ts):
	    """Sleep until the given UTC UNIX TIMESTAMP.

	    Args:
	        ts: seconds since the epoch (UTC); passed through `int()`.

	    A timestamp that has already passed results in no wait at all.
	    """
	    next_time = datetime.utcfromtimestamp(int(ts))
	    now = datetime.utcnow()
	    # Use the full signed duration. timedelta.seconds alone ignores the
	    # days field, so a slightly negative delta would read as almost a
	    # full day. Clamp at zero because time.sleep() refuses negatives.
	    offset = max(0.0, (next_time - now).total_seconds())
	    print("Enhancing calm. Next try: {} (Currently {})".format(next_time, now))
	    print("Sleeping for {}...".format(offset))
	    time.sleep(offset)
	    print("Continuing...")