Skip to content

Instantly share code, notes, and snippets.

@m-sean
Last active March 30, 2019 16:04
Show Gist options
  • Save m-sean/21ee4e27aa6800002df4e9e1aa0354ef to your computer and use it in GitHub Desktop.
Save m-sean/21ee4e27aa6800002df4e9e1aa0354ef to your computer and use it in GitHub Desktop.
Downloads tweets containing a user-specified query.
### Downloads tweets matching a user-specified search query.
### The tweets are collected into a list of JSON objects and written to disk as one JSON file.
import TwitterAPI
import yaml
import json
# ---- User parameters ----

# Target number of tweets to collect in total (the author notes this
# generally works up to ~15000).
TOTAL_COUNT = 15000

# Number of tweets requested per API call (sent as the `count` parameter).
# NOTE(review): the original note says the Twitter API allows 200 per
# request; confirm the limit for the search endpoint actually used.
COUNT = 200

# YAML file holding the developer credentials.
CREDENTIALS = "credentials.yaml"

# Path of the JSON file the tweets are written to.
SINK = "data.json"

# Search term for the scrape. Further cleaning may be necessary, since the
# term may not appear in the actual text of every returned tweet.
QUERY = "search query"
def main():
    """Download tweets matching QUERY and write them to SINK as JSON.

    Credentials are read from the CREDENTIALS YAML file and used to open a
    TwitterAPI connection. The 'search/tweets' endpoint is then paged
    backwards with ``max_id`` until at least TOTAL_COUNT tweets have been
    collected or a request returns no tweets. The collected list is dumped
    to SINK in a single ``json.dump`` call.

    Because whole batches are appended, the final number of tweets may
    exceed TOTAL_COUNT by up to one batch.
    """
    # Creates API connection.
    with open(CREDENTIALS, "r") as source:
        # safe_load: plain yaml.load() without a Loader can construct
        # arbitrary objects and is rejected by recent PyYAML releases.
        credentials = yaml.safe_load(source)
    api = TwitterAPI.TwitterAPI(
        consumer_key=credentials["app_key"],
        consumer_secret=credentials["app_secret"],
        access_token_key=credentials["oauth_token"],
        access_token_secret=credentials["oauth_token_secret"],
    )

    # Downloads data.
    kwargs = dict(
        q=QUERY,
        lang="en",
        count=COUNT,
        tweet_mode="extended",
    )
    data = []
    while len(data) < TOTAL_COUNT:
        batch = list(api.request("search/tweets", kwargs))
        if not batch:
            # No tweets returned (this also covers an empty first request,
            # which previously crashed on data[-1]).
            break
        data.extend(batch)
        # The last tweet has the lowest ID in the batch, since tweets are
        # given in reverse chronological order. ID filtering is inclusive,
        # so subtract one to avoid fetching that tweet again.
        kwargs["max_id"] = batch[-1]["id"] - 1

    # Writes all tweets to disk.
    print(f"{len(data)} tweets obtained")
    with open(SINK, "w") as sink:
        json.dump(data, sink)
    print(f"Data written to {SINK}")
# Run the download only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment