Created
November 20, 2015 01:55
-
-
Save emallson/9e56a99973b3091124cd to your computer and use it in GitHub Desktop.
Iterating over user timelines with Twython
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from twython import Twython, TwythonRateLimitError, TwythonError | |
from glob import glob | |
from util import sleep_until | |
from csv import DictReader, DictWriter | |
import os | |
APP_KEY = '' | |
ACCESS_TOKEN = '' | |
tw = Twython(APP_KEY, access_token=ACCESS_TOKEN) | |
fields = ['id', 'user', 'created_at', 'lang', 'text'] | |
def timeline(user_id, max_id=None):
    """Yield the tweets of one user's timeline, page after page.

    Generator. Each request asks for up to 200 tweets at or below
    ``max_id`` (replies and retweets excluded, user objects trimmed).
    After every yielded tweet ``max_id`` is moved to just below that
    tweet's id, so the following request continues where this page ended.

    We can get up to 15 pages; the loop is allowed a 16th request so that
    the "empty page" stop condition has a chance to trigger.
    """
    request = dict(user_id=user_id,
                   count=200,
                   trim_user=True,
                   exclude_replies=True,
                   include_rts=False)
    for _page in range(16):
        page = tw.get_user_timeline(max_id=max_id, **request)
        if len(page) == 0:  # last page should have zero results
            return
        for tweet in page:
            max_id = tweet['id'] - 1
            yield tweet
# I have a directory of csv files containing user profiles.
# Pull the IDs from these and write each user's timeline to
# csvs/timelines/<id>.csv, one row per tweet restricted to `fields`.
for csv in glob("csvs/*.csv"):
    # newline="" lets the csv module do its own newline handling, which it
    # needs for quoted fields that contain line breaks (tweet text often does).
    with open(csv, "r", newline="") as csvfile:
        reader = DictReader(csvfile)
        for row in reader:
            path = "csvs/timelines/{}.csv".format(row['id'])
            if os.path.exists(path):
                continue  # skip users that have already been read.
            with open(path, "w", newline="") as timecsv:
                print(row['id'])
                writer = DictWriter(timecsv, fields)
                writer.writeheader()
                # max_id is used to continue a user's timeline from
                # the last tweet read in the event that we get
                # rate-limited in the middle of a user's timeline
                max_id = None
                while True:
                    try:
                        for tweet in timeline(row['id'], max_id):
                            sub = {k: v for k, v in tweet.items()
                                   if k in fields}
                            # keep only the numeric id of the user object
                            sub['user'] = sub['user']['id']
                            writer.writerow(sub)
                            # max_id is inclusive (timeline() itself steps
                            # with id - 1), so resume strictly below the
                            # tweet just written to avoid a duplicate row.
                            max_id = sub['id'] - 1
                        break  # timeline exhausted; move on to next user
                    except TwythonRateLimitError as e:
                        # sleep until the given date, then retry from max_id
                        sleep_until(e.retry_after)
                    except TwythonError as e:
                        # I *think* this is caused by protected profiles.
                        # I can read some user profile info, but not the
                        # timeline
                        print(e)
                        print("Skipping...")
                        break
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
import time | |
def sleep_until(ts):
    """Sleep until the given UTC UNIX timestamp.

    Args:
        ts: Seconds since the epoch (UTC), as an int or anything ``int()``
            accepts (e.g. a numeric string).

    Returns immediately (after printing the status lines) when the
    timestamp is already in the past.
    """
    next_time = datetime.utcfromtimestamp(int(ts))
    now = datetime.utcnow()
    # timedelta.seconds is only the seconds *component* (0..86399): it drops
    # whole days and turns a negative delta into almost a full day. Use the
    # signed total instead and never sleep a negative amount.
    offset = max(0.0, (next_time - now).total_seconds())
    print("Enhancing calm. Next try: {} (Currently {})".format(next_time, now))
    print("Sleeping for {}...".format(offset))
    time.sleep(offset)
    print("Continuing...")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment