Last active
January 31, 2021 21:51
-
-
Save iamvee/86f60f60f1a9376175a4aecb7c6b1746 to your computer and use it in GitHub Desktop.
tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "reserved-journalism", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import re\n", | |
"import tweepy\n", | |
"import csv\n", | |
"import pandas as pd\n", | |
"import time\n", | |
"import collections\n", | |
"\n", | |
"consumer_key = ''\n", | |
"consumer_secret = ''\n", | |
"\n", | |
"access_token = ''\n", | |
"access_token_secret = ''\n", | |
"\n", | |
"\n", | |
"auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", | |
"auth.set_access_token(access_token, access_token_secret)\n", | |
"api = tweepy.API(auth,wait_on_rate_limit=True)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "suffering-capitol", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"hashtag = \"#hashtag\"\n", | |
"lim = 100\n", | |
"\n", | |
"steps = lim\n", | |
"\n", | |
"current_time = time.ctime().replace(\" \",\"_\")\n", | |
"dir_name = f\"./{hashtag[1:]}\"\n", | |
"path = f\"./{hashtag[1:]}/{current_time}\"\n", | |
"outfile = f\"{path}/out.csv\"\n", | |
"outusers = f\"{path}/users.csv\"\n", | |
"\n", | |
"\n", | |
"os.makedirs(path, exist_ok=True)\n", | |
" \n", | |
" \n", | |
"csv_file = open(outfile, 'a', newline='', encoding='utf-8')\n", | |
"csv_writer_tweets = csv.writer(csv_file)\n", | |
"csv_writer_tweets.writerow(\n", | |
" ['created_at', 'id', 'id_str', 'user_id', 'screen name', 'account created at', 'text', 'truncated',\n", | |
" 'in_reply_to_status_id', 'in_reply_to_status_id_str', \n", | |
" 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', \n", | |
" 'geo', 'coordinates', 'place', 'contributors', \n", | |
" 'is_quote_status', 'retweet_count', 'favorite_count', \n", | |
" 'favorited', 'retweeted', 'lang'])\n", | |
"\n", | |
"csv_users = open(outusers, 'a', newline='', encoding='utf-8')\n", | |
"csv_writer_users = csv.writer(csv_users)\n", | |
"csv_writer_users.writerow(\n", | |
" ['id', 'id_str', 'name', 'screen_name', 'location', 'description', \n", | |
" 'url','protected', 'followers_count', 'friends_count', \n", | |
" 'listed_count', 'created_at', 'favourites_count', 'utc_offset', \n", | |
" 'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang', \n", | |
" 'contributors_enabled', 'is_translator', 'is_translation_enabled', \n", | |
" 'has_extended_profile', 'default_profile', 'default_profile_image', \n", | |
" 'following', 'follow_request_sent', 'notifications', 'translator_type'])\n", | |
"\n", | |
"\n", | |
"\n", | |
"print(time.ctime())\n", | |
"users = set()\n", | |
"for i, tweet in enumerate(tweepy.Cursor(api.search,q=hashtag,count=lim).items()):\n", | |
"    if tweet.user.id not in users:\n", | |
"        users.add(tweet.user.id)\n", | |
"        csv_writer_users.writerow([\n", | |
"            tweet.user._json['id'], tweet.user._json['id_str'], tweet.user._json['name'],\n", | |
"            tweet.user._json['screen_name'], tweet.user._json['location'],\n", | |
"            tweet.user._json['description'], tweet.user._json['url'],\n", | |
"            tweet.user._json['protected'], tweet.user._json['followers_count'],\n", | |
"            tweet.user._json['friends_count'], tweet.user._json['listed_count'],\n", | |
"            tweet.user._json['created_at'], tweet.user._json['favourites_count'],\n", | |
"            tweet.user._json['utc_offset'], tweet.user._json['time_zone'],\n", | |
"            tweet.user._json['geo_enabled'], tweet.user._json['verified'],\n", | |
"            tweet.user._json['statuses_count'], tweet.user._json['lang'],\n", | |
"            tweet.user._json['contributors_enabled'], tweet.user._json['is_translator'],\n", | |
"            tweet.user._json['is_translation_enabled'], tweet.user._json['has_extended_profile'],\n", | |
"            tweet.user._json['default_profile'], tweet.user._json['default_profile_image'],\n", | |
"            tweet.user._json['following'], tweet.user._json['follow_request_sent'],\n", | |
"            tweet.user._json['notifications'], tweet.user._json['translator_type']\n", | |
"        ])\n", | |
"\n", | |
"    csv_writer_tweets.writerow([\n", | |
"        tweet._json['created_at'], tweet._json['id'], tweet._json['id_str'],\n", | |
"        tweet.user._json['id'], tweet.user._json['screen_name'], str(tweet.user.created_at),\n", | |
"        tweet._json['text'], tweet._json['truncated'], tweet._json['in_reply_to_status_id'],\n", | |
"        tweet._json['in_reply_to_status_id_str'], tweet._json['in_reply_to_user_id'],\n", | |
"        tweet._json['in_reply_to_user_id_str'], tweet._json['in_reply_to_screen_name'],\n", | |
"        tweet._json['geo'], tweet._json['coordinates'], tweet._json['place'],\n", | |
"        tweet._json['contributors'], tweet._json['is_quote_status'],\n", | |
"        tweet._json['retweet_count'], tweet._json['favorite_count'],\n", | |
"        tweet._json['favorited'], tweet._json['retweeted'], tweet._json['lang']])\n", | |
"    if i > lim:\n", | |
"        print(f\"{i:<8}\", flush=True, end=\" \")\n", | |
"        lim += steps\n", | |
"\n", | |
"csv_file.close()\n", | |
"csv_users.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "enabling-manner", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_tweets = pd.read_csv(outfile)\n", | |
"df_users = pd.read_csv(outusers)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "undefined-documentary", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = df_tweets\n", | |
"df[\"RT\"] = df[\"text\"].map(lambda x: x.startswith(\"RT\"))\n", | |
"\n", | |
"x = df[df[\"RT\"]==False][\"text\"].map(\n", | |
"    lambda x: re.sub(r\"#\\S+\", \"\", x).replace('\\n', '')).map(\n", | |
"    lambda x: re.sub(r\"@\\w+\", \"\", x)).map(\n", | |
"    lambda x: re.sub(r\"https://t\\.co/\\S+\", \"\", x)).map(\n", | |
"    lambda x: re.sub(r\"\\s\", \"\", x)).map(\n", | |
"    lambda x: x[:20])\n", | |
"\n", | |
"y = sorted(set(x))\n", | |
"\n", | |
"print(f\"uniq {len(y)}\\noriginal {len(df[df['RT']==False])}\\nall {len(df['RT'])}\")\n", | |
"print(\"accounts \", len(set(df[\"screen name\"])))\n", | |
"\n", | |
"df.tail(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "rubber-operation", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"account_number = collections.Counter([x[:4] for x in set(df[\"account created at\"])])\n", | |
"tweet_number = collections.Counter(df[\"account created at\"].map(lambda x: x[:4]))\n", | |
"\n", | |
"\n", | |
"print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':15}\")\n", | |
"\n", | |
"for k in sorted(account_number.keys()):\n", | |
" print(f\"{k:5}, {account_number[k]:9}, {tweet_number[k]:8}, {tweet_number[k]/ account_number[k]:12.1f}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "frequent-immunology", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "sticky-expression", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "infinite-shoot", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment