iamvee · January 31, 2021 21:51
diff --git a/template.ipynb b/template.ipynb
diff --git a/tweets.ipynb b/tweets.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "reserved-journalism",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import tweepy\n",
    "import csv\n",
    "import pandas as pd\n",
    "import time\n",
    "import collections\n",
    "\n",
    "# Twitter API credentials are taken from the environment so that real keys\n",
    "# are never saved inside the notebook. Each variable falls back to '' when\n",
    "# it is not set.\n",
    "consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')\n",
    "consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')\n",
    "\n",
    "access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')\n",
    "access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')\n",
    "\n",
    "\n",
    "# Build the authenticated client used by the collection cell.\n",
    "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
    "auth.set_access_token(access_token, access_token_secret)\n",
    "api = tweepy.API(auth, wait_on_rate_limit=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "suffering-capitol",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Collects every tweet returned for `hashtag` into two CSV files:\n",
    "#   out.csv   - one row per tweet\n",
    "#   users.csv - one row per distinct author\n",
    "hashtag = \"#hashtag\"\n",
    "lim = 100    # passed to the search as count=; also the first progress threshold\n",
    "\n",
    "steps = lim  # the progress threshold grows by this much after each report\n",
    "\n",
    "current_time = time.ctime().replace(\" \",\"_\")\n",
    "dir_name = f\"./{hashtag[1:]}\"\n",
    "path = f\"{dir_name}/{current_time}\"\n",
    "outfile = f\"{path}/out.csv\"\n",
    "outusers = f\"{path}/users.csv\"\n",
    "\n",
    "\n",
    "try:\n",
    "    os.mkdir(dir_name)\n",
    "except FileExistsError:\n",
    "    print(\"directory exists\")\n",
    "# Created after the try block rather than in a `finally`, so it is not\n",
    "# attempted when creating the parent directory failed for another reason.\n",
    "os.mkdir(path)\n",
    "\n",
    "\n",
    "# Columns of users.csv; each name is also the key read from the user JSON.\n",
    "user_columns = [\n",
    "    'id', 'id_str', 'name', 'screen_name', 'location', 'description',\n",
    "    'url', 'protected', 'followers_count', 'friends_count',\n",
    "    'listed_count', 'created_at', 'favourites_count', 'utc_offset',\n",
    "    'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang',\n",
    "    'contributors_enabled', 'is_translator', 'is_translation_enabled',\n",
    "    'has_extended_profile', 'default_profile', 'default_profile_image',\n",
    "    'following', 'follow_request_sent', 'notifications', 'translator_type']\n",
    "\n",
    "# Columns of out.csv. The three author columns in the middle are filled from\n",
    "# the tweet's user, so the tweet JSON keys are split around them.\n",
    "tweet_keys_head = ['created_at', 'id', 'id_str']\n",
    "author_columns = ['user_id', 'screen name', 'account created at']\n",
    "tweet_keys_tail = [\n",
    "    'text', 'truncated',\n",
    "    'in_reply_to_status_id', 'in_reply_to_status_id_str',\n",
    "    'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name',\n",
    "    'geo', 'coordinates', 'place', 'contributors',\n",
    "    'is_quote_status', 'retweet_count', 'favorite_count',\n",
    "    'favorited', 'retweeted', 'lang']\n",
    "\n",
    "users = set()  # ids of the authors already written to users.csv\n",
    "\n",
    "print(time.ctime())\n",
    "# newline='' is what the csv module asks of its file objects and utf-8 is the\n",
    "# default encoding of pandas.read_csv. Leaving the with-block closes both\n",
    "# files, so everything is on disk before the CSVs are read back.\n",
    "csv_open_args = dict(mode='a', newline='', encoding='utf-8')\n",
    "with open(outfile, **csv_open_args) as csv_file, open(outusers, **csv_open_args) as csv_users:\n",
    "    csv_writer_tweets = csv.writer(csv_file)\n",
    "    csv_writer_tweets.writerow(tweet_keys_head + author_columns + tweet_keys_tail)\n",
    "\n",
    "    csv_writer_users = csv.writer(csv_users)\n",
    "    csv_writer_users.writerow(user_columns)\n",
    "\n",
    "    for i, tweet in enumerate(tweepy.Cursor(api.search,q=hashtag,count=lim).items()):\n",
    "        user_json = tweet.user._json\n",
    "        tweet_json = tweet._json\n",
    "\n",
    "        # Write each author only the first time one of their tweets is seen.\n",
    "        if tweet.user.id not in users:\n",
    "            users.add(tweet.user.id)\n",
    "            csv_writer_users.writerow([user_json[key] for key in user_columns])\n",
    "\n",
    "        csv_writer_tweets.writerow(\n",
    "            [tweet_json[key] for key in tweet_keys_head]\n",
    "            + [user_json['id'], user_json['screen_name'], str(tweet.user.created_at)]\n",
    "            + [tweet_json[key] for key in tweet_keys_tail])\n",
    "\n",
    "        # Progress report: print the index, then raise the threshold.\n",
    "        if i > lim:\n",
    "            print(f\"{i:<8}\", flush=True, end=\" \")\n",
    "            lim += steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "enabling-manner",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the tweet CSV (outfile) and the user CSV (outusers) as DataFrames.\n",
    "df_tweets, df_users = map(pd.read_csv, (outfile, outusers))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "undefined-documentary",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fingerprint(text):\n",
    "    \"\"\"Return the first 20 characters of `text` after removing hashtags,\n",
    "    newlines, @mentions, t.co links and all remaining whitespace.\"\"\"\n",
    "    # Raw strings keep the regex escapes out of Python's own escape handling.\n",
    "    text = re.sub(r\"#\\S+\", \"\", text).replace('\\n', '')\n",
    "    text = re.sub(r\"@\\w+\", \"\", text)\n",
    "    text = re.sub(r\"https://t\\.co/\\S+\", \"\", text)\n",
    "    text = re.sub(r\"\\s\", \"\", text)\n",
    "    return text[:20]\n",
    "\n",
    "\n",
    "df = df_tweets\n",
    "# Flag rows whose text starts with \"RT\"; a missing text counts as not-RT\n",
    "# instead of raising.\n",
    "df[\"RT\"] = df[\"text\"].str.startswith(\"RT\", na=False)\n",
    "\n",
    "original_texts = df.loc[~df[\"RT\"], \"text\"].fillna(\"\")\n",
    "fingerprints = original_texts.map(fingerprint)\n",
    "\n",
    "unique_fingerprints = sorted(set(fingerprints))\n",
    "\n",
    "print(f\"uniq      {len(unique_fingerprints)}\\noriginal  {len(original_texts)}\\nall       {len(df)}\")\n",
    "print(\"accounts \", len(set(df[\"screen name\"])))\n",
    "\n",
    "df.tail(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "rubber-operation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First four characters of the account creation string, one value per tweet\n",
    "# row; this is the key printed under 'year' below.\n",
    "creation_year = df[\"account created at\"].map(lambda created: created[:4])\n",
    "\n",
    "tweet_number = collections.Counter(creation_year)\n",
    "# Count each account once by de-duplicating on user_id. De-duplicating on the\n",
    "# creation string would merge different accounts that share a timestamp.\n",
    "account_number = collections.Counter(creation_year[~df[\"user_id\"].duplicated()])\n",
    "\n",
    "\n",
    "print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':15}\")\n",
    "\n",
    "for year in sorted(account_number.keys()):\n",
    "    accounts = account_number[year]\n",
    "    tweets = tweet_number[year]\n",
    "    print(f\"{year:5}, {accounts:9}, {tweets:8}, {tweets/ accounts:12.1f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "frequent-immunology",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sticky-expression",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "infinite-shoot",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "reserved-journalism",
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import re\n",
	"import tweepy\n",
	"import csv\n",
	"import pandas as pd\n",
	"import time\n",
	"import collections\n",
	"\n",
	"# Twitter API credentials are taken from the environment so that real keys\n",
	"# are never saved inside the notebook. Each variable falls back to '' when\n",
	"# it is not set.\n",
	"consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')\n",
	"consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')\n",
	"\n",
	"access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')\n",
	"access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')\n",
	"\n",
	"\n",
	"# Build the authenticated client used by the collection cell.\n",
	"auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
	"auth.set_access_token(access_token, access_token_secret)\n",
	"api = tweepy.API(auth, wait_on_rate_limit=True)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "suffering-capitol",
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"# Collects every tweet returned for `hashtag` into two CSV files:\n",
	"#   out.csv   - one row per tweet\n",
	"#   users.csv - one row per distinct author\n",
	"hashtag = \"#hashtag\"\n",
	"lim = 100    # passed to the search as count=; also the first progress threshold\n",
	"\n",
	"steps = lim  # the progress threshold grows by this much after each report\n",
	"\n",
	"current_time = time.ctime().replace(\" \",\"_\")\n",
	"dir_name = f\"./{hashtag[1:]}\"\n",
	"path = f\"{dir_name}/{current_time}\"\n",
	"outfile = f\"{path}/out.csv\"\n",
	"outusers = f\"{path}/users.csv\"\n",
	"\n",
	"\n",
	"try:\n",
	"    os.mkdir(dir_name)\n",
	"except FileExistsError:\n",
	"    print(\"directory exists\")\n",
	"# Created after the try block rather than in a `finally`, so it is not\n",
	"# attempted when creating the parent directory failed for another reason.\n",
	"os.mkdir(path)\n",
	"\n",
	"\n",
	"# Columns of users.csv; each name is also the key read from the user JSON.\n",
	"user_columns = [\n",
	"    'id', 'id_str', 'name', 'screen_name', 'location', 'description',\n",
	"    'url', 'protected', 'followers_count', 'friends_count',\n",
	"    'listed_count', 'created_at', 'favourites_count', 'utc_offset',\n",
	"    'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang',\n",
	"    'contributors_enabled', 'is_translator', 'is_translation_enabled',\n",
	"    'has_extended_profile', 'default_profile', 'default_profile_image',\n",
	"    'following', 'follow_request_sent', 'notifications', 'translator_type']\n",
	"\n",
	"# Columns of out.csv. The three author columns in the middle are filled from\n",
	"# the tweet's user, so the tweet JSON keys are split around them.\n",
	"tweet_keys_head = ['created_at', 'id', 'id_str']\n",
	"author_columns = ['user_id', 'screen name', 'account created at']\n",
	"tweet_keys_tail = [\n",
	"    'text', 'truncated',\n",
	"    'in_reply_to_status_id', 'in_reply_to_status_id_str',\n",
	"    'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name',\n",
	"    'geo', 'coordinates', 'place', 'contributors',\n",
	"    'is_quote_status', 'retweet_count', 'favorite_count',\n",
	"    'favorited', 'retweeted', 'lang']\n",
	"\n",
	"users = set()  # ids of the authors already written to users.csv\n",
	"\n",
	"print(time.ctime())\n",
	"# newline='' is what the csv module asks of its file objects and utf-8 is the\n",
	"# default encoding of pandas.read_csv. Leaving the with-block closes both\n",
	"# files, so everything is on disk before the CSVs are read back.\n",
	"csv_open_args = dict(mode='a', newline='', encoding='utf-8')\n",
	"with open(outfile, **csv_open_args) as csv_file, open(outusers, **csv_open_args) as csv_users:\n",
	"    csv_writer_tweets = csv.writer(csv_file)\n",
	"    csv_writer_tweets.writerow(tweet_keys_head + author_columns + tweet_keys_tail)\n",
	"\n",
	"    csv_writer_users = csv.writer(csv_users)\n",
	"    csv_writer_users.writerow(user_columns)\n",
	"\n",
	"    for i, tweet in enumerate(tweepy.Cursor(api.search,q=hashtag,count=lim).items()):\n",
	"        user_json = tweet.user._json\n",
	"        tweet_json = tweet._json\n",
	"\n",
	"        # Write each author only the first time one of their tweets is seen.\n",
	"        if tweet.user.id not in users:\n",
	"            users.add(tweet.user.id)\n",
	"            csv_writer_users.writerow([user_json[key] for key in user_columns])\n",
	"\n",
	"        csv_writer_tweets.writerow(\n",
	"            [tweet_json[key] for key in tweet_keys_head]\n",
	"            + [user_json['id'], user_json['screen_name'], str(tweet.user.created_at)]\n",
	"            + [tweet_json[key] for key in tweet_keys_tail])\n",
	"\n",
	"        # Progress report: print the index, then raise the threshold.\n",
	"        if i > lim:\n",
	"            print(f\"{i:<8}\", flush=True, end=\" \")\n",
	"            lim += steps"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "enabling-manner",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load the tweet CSV (outfile) and the user CSV (outusers) as DataFrames.\n",
	"df_tweets, df_users = map(pd.read_csv, (outfile, outusers))\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "undefined-documentary",
	"metadata": {},
	"outputs": [],
	"source": [
	"def fingerprint(text):\n",
	"    \"\"\"Return the first 20 characters of `text` after removing hashtags,\n",
	"    newlines, @mentions, t.co links and all remaining whitespace.\"\"\"\n",
	"    # Raw strings keep the regex escapes out of Python's own escape handling.\n",
	"    text = re.sub(r\"#\\S+\", \"\", text).replace('\\n', '')\n",
	"    text = re.sub(r\"@\\w+\", \"\", text)\n",
	"    text = re.sub(r\"https://t\\.co/\\S+\", \"\", text)\n",
	"    text = re.sub(r\"\\s\", \"\", text)\n",
	"    return text[:20]\n",
	"\n",
	"\n",
	"df = df_tweets\n",
	"# Flag rows whose text starts with \"RT\"; a missing text counts as not-RT\n",
	"# instead of raising.\n",
	"df[\"RT\"] = df[\"text\"].str.startswith(\"RT\", na=False)\n",
	"\n",
	"original_texts = df.loc[~df[\"RT\"], \"text\"].fillna(\"\")\n",
	"fingerprints = original_texts.map(fingerprint)\n",
	"\n",
	"unique_fingerprints = sorted(set(fingerprints))\n",
	"\n",
	"print(f\"uniq      {len(unique_fingerprints)}\\noriginal  {len(original_texts)}\\nall       {len(df)}\")\n",
	"print(\"accounts \", len(set(df[\"screen name\"])))\n",
	"\n",
	"df.tail(1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "rubber-operation",
	"metadata": {},
	"outputs": [],
	"source": [
	"# First four characters of the account creation string, one value per tweet\n",
	"# row; this is the key printed under 'year' below.\n",
	"creation_year = df[\"account created at\"].map(lambda created: created[:4])\n",
	"\n",
	"tweet_number = collections.Counter(creation_year)\n",
	"# Count each account once by de-duplicating on user_id. De-duplicating on the\n",
	"# creation string would merge different accounts that share a timestamp.\n",
	"account_number = collections.Counter(creation_year[~df[\"user_id\"].duplicated()])\n",
	"\n",
	"\n",
	"print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':15}\")\n",
	"\n",
	"for year in sorted(account_number.keys()):\n",
	"    accounts = account_number[year]\n",
	"    tweets = tweet_number[year]\n",
	"    print(f\"{year:5}, {accounts:9}, {tweets:8}, {tweets/ accounts:12.1f}\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "frequent-immunology",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "sticky-expression",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "infinite-shoot",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}