Last active
March 30, 2019 16:04
-
-
Save m-sean/21ee4e27aa6800002df4e9e1aa0354ef to your computer and use it in GitHub Desktop.
Downloads tweets containing a user-specified query.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Downloads tweets containing a user-specified query.
### The data is collected into a list of (JSON) tweets and written to disk.
import TwitterAPI | |
import yaml | |
import json | |
# User parameters (module-level constants read by main()).
TOTAL_COUNT=15000 # Target number of tweets; main() keeps requesting batches until it holds at least this many (generally works up to ~15000)
COUNT=200 # Value sent as the per-request `count` parameter. Original note: "Twitter API allows 200 per scrape".
# NOTE(review): the standard search/tweets endpoint is documented with a cap of 100 per request -- confirm.
CREDENTIALS = "credentials.yaml" # YAML file containing developer credentials (keys read: app_key, app_secret, oauth_token, oauth_token_secret)
SINK = "data.json" # Output JSON file
QUERY = "search query" # Search term for scrape (note: further cleaning may be necessary
# since the term may not be in the actual text of every tweet)
def main():
    """Download tweets matching QUERY and write them to SINK as JSON.

    Reads developer credentials from the YAML file CREDENTIALS, opens a
    TwitterAPI connection, and repeatedly calls the ``search/tweets``
    endpoint, paging backwards with ``max_id`` until at least TOTAL_COUNT
    tweets have been collected or a request returns no tweets. The
    collected tweets are then dumped to SINK as a single JSON array.

    Raises:
        OSError: if CREDENTIALS cannot be read or SINK cannot be written.
        KeyError: if a required credential key is missing from the YAML
            file, or a returned item has no ``"id"`` field.
    """
    # Creates API connection.
    with open(CREDENTIALS, "r") as source:
        # safe_load only builds plain Python objects; a bare yaml.load()
        # without a Loader is unsafe and is rejected by PyYAML >= 6.
        credentials = yaml.safe_load(source)
    api = TwitterAPI.TwitterAPI(
        consumer_key=credentials["app_key"],
        consumer_secret=credentials["app_secret"],
        access_token_key=credentials["oauth_token"],
        access_token_secret=credentials["oauth_token_secret"],
    )

    # Downloads data.
    kwargs = dict(
        q=QUERY,
        lang="en",
        count=COUNT,
        tweet_mode="extended",
    )
    data = []
    # The first request carries no max_id; each later request asks only for
    # tweets older than everything collected so far.
    while len(data) < TOTAL_COUNT:
        batch = list(api.request("search/tweets", kwargs))
        if not batch:  # no tweets returned (also covers an empty first batch)
            break
        data.extend(batch)
        # The last item has the lowest ID in the batch, since tweets are
        # given in reverse chronological order. Since ID filtering is
        # inclusive, we subtract one.
        kwargs["max_id"] = batch[-1]["id"] - 1

    # Writes all tweets to disk.
    print(f"{len(data)} tweets obtained")
    with open(SINK, "w") as sink:
        json.dump(data, sink)
    print(f"Data written to {SINK}")
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment