Last active
January 31, 2021 21:51
-
-
Save iamvee/86f60f60f1a9376175a4aecb7c6b1746 to your computer and use it in GitHub Desktop.
tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "functional-billy", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"import re\n", | |
"import pandas as pd\n", | |
"import collections\n", | |
"import scipy \n", | |
"import matplotlib\n", | |
"import matplotlib.pyplot\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "remarkable-casting", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Load the collected tweets and the per-user table.\n", | |
"df_tweets = pd.read_csv('./out.csv')\n", | |
"df_users = pd.read_csv('./users.csv')\n", | |
"\n", | |
"\n", | |
"def tweet_fingerprint(text):\n", | |
"    \"\"\"Reduce a tweet to a short key so copy-pasted tweets compare equal.\n", | |
"\n", | |
"    Hashtags, @mentions, t.co links and all whitespace are removed and the\n", | |
"    first 30 remaining characters are returned.\n", | |
"    \"\"\"\n", | |
"    # Raw strings: \\S, \\w and \\s must reach the regex engine untouched; in a\n", | |
"    # plain literal they are invalid string escapes and trigger a warning.\n", | |
"    text = re.sub(r\"#\\S+\", \"\", text).replace('\\n', '')\n", | |
"    text = re.sub(r\"@\\w+\", \"\", text)\n", | |
"    text = re.sub(r\"https://t.co/\\S+\", \"\", text)\n", | |
"    text = re.sub(r\"\\s\", \"\", text)\n", | |
"    return text[:30]\n", | |
"\n", | |
"\n", | |
"# `df` is another name for df_tweets (no copy), so the \"RT\" column lands on both.\n", | |
"df = df_tweets\n", | |
"df[\"RT\"] = df[\"text\"].map(lambda text: text.startswith(\"RT\"))\n", | |
"\n", | |
"# Fingerprints of every non-retweet; `y` keeps the distinct ones.\n", | |
"x = df.loc[df[\"RT\"].eq(False), \"text\"].map(tweet_fingerprint)\n", | |
"y = sorted(set(x))\n", | |
"\n", | |
"# Headline counts; later cells read this dict for the pie chart.\n", | |
"data = {}\n", | |
"data[\"original\"] = len(y)\n", | |
"data[\"original+copy\"] = len(x)\n", | |
"data[\"duplicated\"] = data[\"original+copy\"] - data[\"original\"]\n", | |
"data[\"all tweets\"] = len(df['RT'])\n", | |
"data[\"retweets\"] = data[\"all tweets\"] - data[\"original+copy\"]\n", | |
"data[\"accounts\"] = len(set(df[\"screen name\"]))\n", | |
"\n", | |
"print(f\"\"\"\n", | |
"{data[\"original\"]:>10} | original tweets (duplicated tweets excluded)\n", | |
"{data[\"original+copy\"]:>10} | original tweets + (duplicated)\n", | |
"{data[\"duplicated\"]:>10} | duplicated\n", | |
"{data[\"retweets\"]:>10} | retweets\n", | |
"{data[\"all tweets\"]:>10} | all tweets\n", | |
"\n", | |
"{data[\"accounts\"]:>10} | accounts\n", | |
"{data[\"all tweets\"] / data[\"accounts\"]:>10.1f} | average tweets per account\n", | |
"\n", | |
" start : {min(df['created_at'])}\n", | |
" stop : {max(df['created_at'])}\n", | |
"\"\"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "demanding-barrier", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pie chart of the three tweet categories; each label shows its share of all tweets.\n", | |
"total_tweets = data['all tweets']\n", | |
"wedge_sizes = []\n", | |
"wedge_labels = []\n", | |
"for key, caption in ((\"original\", \"original\"), (\"duplicated\", \"duplicated \"), (\"retweets\", \"retweet\")):\n", | |
"    wedge_sizes.append(data[key])\n", | |
"    wedge_labels.append(f\"{caption}\\n {100*data[key]/total_tweets:.2f} %\")\n", | |
"\n", | |
"matplotlib.pyplot.pie(wedge_sizes, labels=wedge_labels);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "seasonal-husband", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The first four characters of \"account created at\" are used as the grouping key\n", | |
"# (presumably the year - confirm against the CSV format).\n", | |
"created_at = df[\"account created at\"]\n", | |
"\n", | |
"# Distinct creation stamps per key vs. tweet rows per key.\n", | |
"account_number = collections.Counter(stamp[:4] for stamp in set(created_at))\n", | |
"tweet_number = collections.Counter(stamp[:4] for stamp in created_at)\n", | |
"\n", | |
"print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':15}\")\n", | |
"\n", | |
"for year in sorted(account_number):\n", | |
"    n_accounts = account_number[year]\n", | |
"    n_tweets = tweet_number[year]\n", | |
"    print(f\"{year:5}, {n_accounts:9}, {n_tweets:8}, {n_tweets/ n_accounts:12.1f}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "christian-tonight", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Density estimate of follower counts on a wide (15 x 5) figure, x axis limited to 0..60000.\n", | |
"# df['friends_count'].plot.density(xlim=[0,60000])\n", | |
"density_axes = matplotlib.pyplot.figure(figsize=(15, 5)).gca()\n", | |
"df_users['followers_count'].plot.density(xlim=[0, 60000], ax=density_axes)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "verified-bleeding", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# For every per-user tweet count t, c is the number of users with exactly t tweets.\n", | |
"tweets_per_user = df_tweets.groupby('user_id').count()[\"id\"]\n", | |
"vals = list(collections.Counter(tweets_per_user).items())\n", | |
"svals = sorted(vals, key=lambda pair: pair[0])\n", | |
"# Third field is the marker size: 5 x (tweets contributed by that group).\n", | |
"wvals = [(t, c, t * c * 5) for t, c in svals]\n", | |
"x, y, z = zip(*wvals)\n", | |
"\n", | |
"matplotlib.pyplot.figure(figsize=(15, 10))\n", | |
"matplotlib.pyplot.scatter(x, y, s=z, alpha=0.5)\n", | |
"\n", | |
"# matplotlib.pyplot.plot(x, y)\n", | |
"matplotlib.pyplot.grid(True)\n", | |
"\n", | |
"# Single red marker at the middle of both ranges, sized by the sum of all marker sizes.\n", | |
"matplotlib.pyplot.scatter([max(x) // 2], [max(y) // 2], s=[sum(z)], c='r', alpha=0.3)\n", | |
"\n", | |
"tick_positions = range(0, max(x) + 10, 50)\n", | |
"matplotlib.pyplot.xticks(tick_positions, [str(tick) for tick in tick_positions])\n", | |
"# matplotlib.pyplot.xlim([-1, max(x)+1])\n", | |
"# x, y, z\n", | |
"# The bare literal makes the cell output `1` instead of the xticks return value.\n", | |
"1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "absent-script", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Same scatter as the previous cell, but without the `skip_first_n` smallest\n", | |
"# distinct per-user tweet counts, plus reference markers for hypothetical workloads.\n", | |
"skip_first_n = 50\n", | |
"\n", | |
"tweets_per_user = df_tweets.groupby('user_id').count()[\"id\"]\n", | |
"vals = list(collections.Counter(tweets_per_user).items())\n", | |
"svals = sorted(vals, key=lambda s: s[0])\n", | |
"wvals = [(t, c, t * c) for t, c in svals]\n", | |
"\n", | |
"tail = wvals[skip_first_n:]\n", | |
"if not tail:\n", | |
"    # Unpacking an empty zip() would fail with an opaque message; say why instead.\n", | |
"    raise ValueError(\n", | |
"        f\"skip_first_n={skip_first_n} leaves nothing to plot \"\n", | |
"        f\"(only {len(wvals)} distinct tweet counts)\")\n", | |
"x, y, z = zip(*tail)\n", | |
"\n", | |
"\n", | |
"matplotlib.pyplot.figure(figsize=(15, 10))\n", | |
"\n", | |
"matplotlib.pyplot.scatter(x, y, s=z, alpha=0.3)\n", | |
"\n", | |
"# sumz is the total number of tweets written by the plotted users.\n", | |
"sumz = sum(z)\n", | |
"for person in [1, 2, 3, 5, 7, 15, 30, 70, 150]:\n", | |
"    # Users needed to produce sumz tweets at `person` tweets each.\n", | |
"    needed = sumz // person\n", | |
"    # NOTE(review): the original indentation was lost; only the large green\n", | |
"    # marker is assumed to be conditional, the dot and label are drawn for all.\n", | |
"    if person < 5:\n", | |
"        matplotlib.pyplot.scatter([person,], [needed,], s=[sumz,], c='g', alpha=0.3)\n", | |
"    matplotlib.pyplot.scatter([person,], [needed,], s=[person,], c='k', alpha=0.5)\n", | |
"    matplotlib.pyplot.text(person, needed, f\"{person:<3} tweets -> {needed}\")\n", | |
"\n", | |
"\n", | |
"# matplotlib.pyplot.plot(x, y)\n", | |
"\n", | |
"# matplotlib.pyplot.grid(True)\n", | |
"matplotlib.pyplot.text(200, 500, f\"current \\n--> {sum(y)} ppl\")\n", | |
"\n", | |
"tick_positions = range(0, max(x) + 1, 50)\n", | |
"matplotlib.pyplot.xticks(tick_positions, [str(tick) for tick in tick_positions])\n", | |
"# matplotlib.pyplot.xlim([-1, max(x)+1])\n", | |
"# x, y, z\n", | |
"# The bare literal makes the cell output `1` instead of the xticks return value.\n", | |
"1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "promising-dover", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Write the `id` column of the user table to ./engh_uid.csv.\n", | |
"user_id_column = df_users['id']\n", | |
"user_id_column.to_csv('./engh_uid.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "native-bearing", | |
"metadata": {}, | |
"source": [ | |
"# dirty code " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "exempt-workplace", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%bash\n", | |
"# cat engh_ids\n", | |
"# cat misazim_ids" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "independent-prison", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def load_id_tokens(path):\n", | |
"    \"\"\"Return the whitespace-separated tokens of `path` as a set.\n", | |
"\n", | |
"    The first and the last token of the file are left out.\n", | |
"    \"\"\"\n", | |
"    with open(path) as handle:\n", | |
"        tokens = handle.read().split()\n", | |
"    return set(tokens[1:-1])\n", | |
"\n", | |
"\n", | |
"engh = load_id_tokens('engh_ids')\n", | |
"misz = load_id_tokens('misazim_ids')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "desirable-jacob", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split the two id sets into: shared ids, ids only in engh, ids only in misz.\n", | |
"intrs = engh & misz\n", | |
"enghu = engh.difference(misz)\n", | |
"miszu = misz.difference(engh)\n", | |
"\n", | |
"# Sizes of the three groups, in that order.\n", | |
"tuple(len(group) for group in (intrs, enghu, miszu))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "animated-owner", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The ids were read from a text file as strings; convert them to integers.\n", | |
"engh_ids = list(map(int, enghu))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "executed-effort", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Creation stamps of the users whose id is in engh_ids.\n", | |
"selected_users = df_users[df_users['id'].isin(engh_ids)]\n", | |
"sx = list(selected_users[\"created_at\"])\n", | |
"\n", | |
"# Key = last four characters + characters 4..6 of the stamp\n", | |
"# (presumably \"<year> <month>\" - confirm against the created_at format).\n", | |
"sy = [f\"{stamp[-4:]} {stamp[4:7]}\" for stamp in sx]\n", | |
"\n", | |
"# (key, count) pairs, most frequent first.\n", | |
"collections.Counter(sy).most_common()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "shared-marking", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Show the created_at column of the user table.\n", | |
"df_users[\"created_at\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "intellectual-sandwich", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment