iamvee · January 31, 2021 21:51
diff --git a/template.ipynb b/template.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "functional-billy",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import re\n",
    "import pandas as pd\n",
    "import collections\n",
    "import scipy \n",
    "import matplotlib\n",
    "import matplotlib.pyplot\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "remarkable-casting",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tweets = pd.read_csv('./out.csv')\n",
    "df_users = pd.read_csv('./users.csv')\n",
    "\n",
    "# Patterns are raw strings so the regex escapes are not parsed as (invalid) string escapes.\n",
    "HASHTAG_RE = re.compile(r\"#\\S+\")\n",
    "MENTION_RE = re.compile(r\"@\\w+\")\n",
    "TCO_LINK_RE = re.compile(r\"https://t\\.co/\\S+\")  # dot escaped: match a literal '.' only\n",
    "WHITESPACE_RE = re.compile(r\"\\s\")\n",
    "\n",
    "\n",
    "def dedup_key(text, length=30):\n",
    "    \"\"\"Return the first `length` characters of `text` after removing hashtags,\n",
    "    newlines, @mentions, t.co links and all remaining whitespace.\"\"\"\n",
    "    text = HASHTAG_RE.sub(\"\", text).replace('\\n', '')\n",
    "    text = MENTION_RE.sub(\"\", text)\n",
    "    text = TCO_LINK_RE.sub(\"\", text)\n",
    "    return WHITESPACE_RE.sub(\"\", text)[:length]\n",
    "\n",
    "\n",
    "df = df_tweets  # alias, not a copy: the \"RT\" column is added to df_tweets as well\n",
    "df[\"RT\"] = df[\"text\"].map(lambda text: text.startswith(\"RT\"))\n",
    "is_original = ~df[\"RT\"]\n",
    "\n",
    "# Tweets sharing the same cleaned 30-character prefix are counted as one original tweet.\n",
    "x = df.loc[is_original, \"text\"].map(dedup_key)\n",
    "y = sorted(set(x))\n",
    "\n",
    "data = {}\n",
    "data[\"original\"] = len(y)\n",
    "data[\"original+copy\"] = int(is_original.sum())\n",
    "data[\"duplicated\"] = data[\"original+copy\"] - data[\"original\"]\n",
    "data[\"all tweets\"] = len(df)\n",
    "data[\"retweets\"] = data[\"all tweets\"] - data[\"original+copy\"]\n",
    "data[\"accounts\"] = len(set(df[\"screen name\"]))\n",
    "\n",
    "print(f\"\"\"\n",
    "{data[\"original\"]:>10} | original tweets (duplicated tweets excluded)\n",
    "{data[\"original+copy\"]:>10} | original tweets + (duplicated)\n",
    "{data[\"duplicated\"]:>10} | duplicated\n",
    "{data[\"retweets\"]:>10} | retweets\n",
    "{data[\"all tweets\"]:>10} | all tweets\n",
    "\n",
    "{data[\"accounts\"]:>10} | accounts\n",
    "{data[\"all tweets\"] / data[\"accounts\"]:>10.1f} | average tweets per account\n",
    "\n",
    "     start : {min(df['created_at'])}\n",
    "      stop : {max(df['created_at'])}\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "demanding-barrier",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pie chart of the three tweet categories; each label shows the share of all tweets.\n",
    "matplotlib.pyplot.pie(\n",
    "    [data[key] for key in (\"original\", \"duplicated\", \"retweets\")],\n",
    "    labels=[\n",
    "        f\"{caption}\\n {100*data[key]/data['all tweets']:.2f} %\"\n",
    "        for caption, key in ((\"original\", \"original\"), (\"duplicated \", \"duplicated\"), (\"retweet\", \"retweets\"))\n",
    "    ],\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "seasonal-husband",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The year is the first four characters of the \"account created at\" value.\n",
    "# NOTE(review): accounts are approximated by distinct creation timestamps, so accounts\n",
    "# created at exactly the same timestamp are counted once - confirm this is acceptable.\n",
    "account_number = collections.Counter(stamp[:4] for stamp in set(df[\"account created at\"]))\n",
    "tweet_number = collections.Counter(df[\"account created at\"].map(lambda stamp: stamp[:4]))\n",
    "\n",
    "# Header and rows share the same right-aligned column widths so the table lines up.\n",
    "print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':>13}\")\n",
    "\n",
    "for k in sorted(account_number):\n",
    "    print(f\"{k:>5}, {account_number[k]:>9}, {tweet_number[k]:>8}, {tweet_number[k] / account_number[k]:>13.1f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "christian-tonight",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Density estimate of follower counts on a 15x5 figure, x-axis limited to 0..60000.\n",
    "matplotlib.pyplot.figure(figsize=(15, 5))\n",
    "# df['friends_count'].plot.density(xlim=[0,60000])\n",
    "df_users['followers_count'].plot(kind='density', xlim=[0, 60000])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "verified-bleeding",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each per-account tweet count t, c is the number of accounts that posted exactly t tweets.\n",
    "tweets_per_account = df_tweets.groupby('user_id').count()[\"id\"]\n",
    "vals = list(collections.Counter(tweets_per_account).items())\n",
    "svals = sorted(vals, key=lambda s: s[0])\n",
    "# Marker size is t*c*5, i.e. five times the number of tweets contributed by that group.\n",
    "wvals = [(t, c, t*c*5) for t, c in svals]\n",
    "x, y, z = list(zip(*wvals))\n",
    "\n",
    "matplotlib.pyplot.figure(figsize=(15, 10))\n",
    "matplotlib.pyplot.scatter(x, y, s=z, alpha=0.5)\n",
    "matplotlib.pyplot.grid(True)\n",
    "\n",
    "# Red reference marker at the middle of the data range, sized by the sum of all marker sizes.\n",
    "matplotlib.pyplot.scatter([max(x)//2,], [max(y)//2,], s=[sum(z),], c='r', alpha=0.3)\n",
    "\n",
    "matplotlib.pyplot.xlabel(\"tweets per account\")\n",
    "matplotlib.pyplot.ylabel(\"number of accounts\")\n",
    "ticks = range(0, max(x)+10, 50)\n",
    "# Trailing ';' suppresses the xticks repr (replaces the former bare `1` expression).\n",
    "matplotlib.pyplot.xticks(ticks, [str(tick) for tick in ticks]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "absent-script",
   "metadata": {},
   "outputs": [],
   "source": [
    "skip_first_n = 50  # how many of the smallest distinct per-account tweet counts to leave out\n",
    "\n",
    "tweets_per_account = df_tweets.groupby('user_id').count()[\"id\"]\n",
    "vals = list(collections.Counter(tweets_per_account).items())\n",
    "svals = sorted(vals, key=lambda s: s[0])\n",
    "wvals = [(t, c, t*c) for t, c in svals]\n",
    "tail = wvals[skip_first_n:]\n",
    "if not tail:\n",
    "    # zip(*[]) yields nothing, so the unpacking below would fail with an opaque message.\n",
    "    raise ValueError(f\"skip_first_n={skip_first_n} leaves nothing to plot: only {len(wvals)} distinct tweet counts\")\n",
    "x, y, z = list(zip(*tail))\n",
    "\n",
    "matplotlib.pyplot.figure(figsize=(15, 10))\n",
    "matplotlib.pyplot.scatter(x, y, s=z, alpha=0.3)\n",
    "\n",
    "# sumz is the total number of tweets posted by the plotted accounts (sum of t*c).\n",
    "sumz = sum(z)\n",
    "# Reference markers: at x = `person` tweets per account, y = sumz // person accounts.\n",
    "for person in [1, 2, 3, 5, 7, 15, 30, 70, 150]:\n",
    "    if person < 5:\n",
    "        matplotlib.pyplot.scatter([person,], [sumz//person,], s=[sumz,], c='g', alpha=0.3)\n",
    "    matplotlib.pyplot.scatter([person,], [sumz//person,], s=[person,], c='k', alpha=0.5)\n",
    "    matplotlib.pyplot.text(person, sumz//person, f\"{person:<3} tweets -> {sumz//person}\")\n",
    "\n",
    "# sum(y) is the number of accounts that are actually plotted.\n",
    "matplotlib.pyplot.text(200, 500, f\"current \\n--> {sum(y)} ppl\")\n",
    "\n",
    "ticks = range(0, max(x)+1, 50)\n",
    "# Trailing ';' suppresses the xticks repr (replaces the former bare `1` expression).\n",
    "matplotlib.pyplot.xticks(ticks, [str(tick) for tick in ticks]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "promising-dover",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the `id` column of df_users to ./engh_uid.csv.\n",
    "df_users.loc[:, 'id'].to_csv('./engh_uid.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "native-bearing",
   "metadata": {},
   "source": [
    "# dirty code "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "exempt-workplace",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "# cat engh_ids\n",
    "# cat  misazim_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "independent-prison",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each file is read as whitespace-separated tokens; the first and last token are dropped.\n",
    "# NOTE(review): presumably those two tokens are not ids - confirm against the files.\n",
    "with open('engh_ids') as f:\n",
    "    engh_tokens = f.read().split()\n",
    "    engh = set(engh_tokens[1:-1])\n",
    "\n",
    "with open('misazim_ids') as f:\n",
    "    misz_tokens = f.read().split()\n",
    "    misz = set(misz_tokens[1:-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "desirable-jacob",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the two id sets into: shared ids, ids only in engh, ids only in misz.\n",
    "intrs = engh & misz\n",
    "enghu = engh.difference(misz)\n",
    "miszu = misz.difference(engh)\n",
    "\n",
    "# Displayed as a tuple of the three sizes.\n",
    "(len(intrs), len(enghu), len(miszu))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "animated-owner",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The ids were read from a text file as strings; convert them to int.\n",
    "engh_ids = list(map(int, enghu))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "executed-effort",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a label from the last four characters and characters 4-6 of each created_at string\n",
    "# for the users whose id is in engh_ids, then list the labels by descending frequency.\n",
    "# NOTE(review): presumably created_at looks like 'Wed Oct 10 20:19:24 +0000 2018' - confirm.\n",
    "sx = df_users.loc[df_users['id'].isin(engh_ids), \"created_at\"].tolist()\n",
    "sy = [f\"{stamp[-4:]} {stamp[4:7]}\" for stamp in sx]\n",
    "collections.Counter(sy).most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "shared-marking",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the created_at column of df_users.\n",
    "df_users['created_at']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "intellectual-sandwich",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
diff --git a/tweets.ipynb b/tweets.ipynb
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "functional-billy",
	"metadata": {},
	"outputs": [],
	"source": [
	"\n",
	"import re\n",
	"import pandas as pd\n",
	"import collections\n",
	"import scipy \n",
	"import matplotlib\n",
	"import matplotlib.pyplot\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "remarkable-casting",
	"metadata": {},
	"outputs": [],
	"source": [
	"df_tweets = pd.read_csv('./out.csv')\n",
	"df_users = pd.read_csv('./users.csv')\n",
	"\n",
	"# Patterns are raw strings so the regex escapes are not parsed as (invalid) string escapes.\n",
	"HASHTAG_RE = re.compile(r\"#\\S+\")\n",
	"MENTION_RE = re.compile(r\"@\\w+\")\n",
	"TCO_LINK_RE = re.compile(r\"https://t\\.co/\\S+\")  # dot escaped: match a literal '.' only\n",
	"WHITESPACE_RE = re.compile(r\"\\s\")\n",
	"\n",
	"\n",
	"def dedup_key(text, length=30):\n",
	"    \"\"\"Return the first `length` characters of `text` after removing hashtags,\n",
	"    newlines, @mentions, t.co links and all remaining whitespace.\"\"\"\n",
	"    text = HASHTAG_RE.sub(\"\", text).replace('\\n', '')\n",
	"    text = MENTION_RE.sub(\"\", text)\n",
	"    text = TCO_LINK_RE.sub(\"\", text)\n",
	"    return WHITESPACE_RE.sub(\"\", text)[:length]\n",
	"\n",
	"\n",
	"df = df_tweets  # alias, not a copy: the \"RT\" column is added to df_tweets as well\n",
	"df[\"RT\"] = df[\"text\"].map(lambda text: text.startswith(\"RT\"))\n",
	"is_original = ~df[\"RT\"]\n",
	"\n",
	"# Tweets sharing the same cleaned 30-character prefix are counted as one original tweet.\n",
	"x = df.loc[is_original, \"text\"].map(dedup_key)\n",
	"y = sorted(set(x))\n",
	"\n",
	"data = {}\n",
	"data[\"original\"] = len(y)\n",
	"data[\"original+copy\"] = int(is_original.sum())\n",
	"data[\"duplicated\"] = data[\"original+copy\"] - data[\"original\"]\n",
	"data[\"all tweets\"] = len(df)\n",
	"data[\"retweets\"] = data[\"all tweets\"] - data[\"original+copy\"]\n",
	"data[\"accounts\"] = len(set(df[\"screen name\"]))\n",
	"\n",
	"print(f\"\"\"\n",
	"{data[\"original\"]:>10} | original tweets (duplicated tweets excluded)\n",
	"{data[\"original+copy\"]:>10} | original tweets + (duplicated)\n",
	"{data[\"duplicated\"]:>10} | duplicated\n",
	"{data[\"retweets\"]:>10} | retweets\n",
	"{data[\"all tweets\"]:>10} | all tweets\n",
	"\n",
	"{data[\"accounts\"]:>10} | accounts\n",
	"{data[\"all tweets\"] / data[\"accounts\"]:>10.1f} | average tweets per account\n",
	"\n",
	"     start : {min(df['created_at'])}\n",
	"      stop : {max(df['created_at'])}\n",
	"\"\"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "demanding-barrier",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Pie chart of the three tweet categories; each label shows the share of all tweets.\n",
	"matplotlib.pyplot.pie(\n",
	"    [data[key] for key in (\"original\", \"duplicated\", \"retweets\")],\n",
	"    labels=[\n",
	"        f\"{caption}\\n {100*data[key]/data['all tweets']:.2f} %\"\n",
	"        for caption, key in ((\"original\", \"original\"), (\"duplicated \", \"duplicated\"), (\"retweet\", \"retweets\"))\n",
	"    ],\n",
	");"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "seasonal-husband",
	"metadata": {},
	"outputs": [],
	"source": [
	"# The year is the first four characters of the \"account created at\" value.\n",
	"# NOTE(review): accounts are approximated by distinct creation timestamps, so accounts\n",
	"# created at exactly the same timestamp are counted once - confirm this is acceptable.\n",
	"account_number = collections.Counter(stamp[:4] for stamp in set(df[\"account created at\"]))\n",
	"tweet_number = collections.Counter(df[\"account created at\"].map(lambda stamp: stamp[:4]))\n",
	"\n",
	"# Header and rows share the same right-aligned column widths so the table lines up.\n",
	"print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':>13}\")\n",
	"\n",
	"for k in sorted(account_number):\n",
	"    print(f\"{k:>5}, {account_number[k]:>9}, {tweet_number[k]:>8}, {tweet_number[k] / account_number[k]:>13.1f}\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "christian-tonight",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Density estimate of follower counts on a 15x5 figure, x-axis limited to 0..60000.\n",
	"matplotlib.pyplot.figure(figsize=(15, 5))\n",
	"# df['friends_count'].plot.density(xlim=[0,60000])\n",
	"df_users['followers_count'].plot(kind='density', xlim=[0, 60000])\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "verified-bleeding",
	"metadata": {},
	"outputs": [],
	"source": [
	"# For each per-account tweet count t, c is the number of accounts that posted exactly t tweets.\n",
	"tweets_per_account = df_tweets.groupby('user_id').count()[\"id\"]\n",
	"vals = list(collections.Counter(tweets_per_account).items())\n",
	"svals = sorted(vals, key=lambda s: s[0])\n",
	"# Marker size is t*c*5, i.e. five times the number of tweets contributed by that group.\n",
	"wvals = [(t, c, t*c*5) for t, c in svals]\n",
	"x, y, z = list(zip(*wvals))\n",
	"\n",
	"matplotlib.pyplot.figure(figsize=(15, 10))\n",
	"matplotlib.pyplot.scatter(x, y, s=z, alpha=0.5)\n",
	"matplotlib.pyplot.grid(True)\n",
	"\n",
	"# Red reference marker at the middle of the data range, sized by the sum of all marker sizes.\n",
	"matplotlib.pyplot.scatter([max(x)//2,], [max(y)//2,], s=[sum(z),], c='r', alpha=0.3)\n",
	"\n",
	"matplotlib.pyplot.xlabel(\"tweets per account\")\n",
	"matplotlib.pyplot.ylabel(\"number of accounts\")\n",
	"ticks = range(0, max(x)+10, 50)\n",
	"# Trailing ';' suppresses the xticks repr (replaces the former bare `1` expression).\n",
	"matplotlib.pyplot.xticks(ticks, [str(tick) for tick in ticks]);"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "absent-script",
	"metadata": {},
	"outputs": [],
	"source": [
	"skip_first_n = 50  # how many of the smallest distinct per-account tweet counts to leave out\n",
	"\n",
	"tweets_per_account = df_tweets.groupby('user_id').count()[\"id\"]\n",
	"vals = list(collections.Counter(tweets_per_account).items())\n",
	"svals = sorted(vals, key=lambda s: s[0])\n",
	"wvals = [(t, c, t*c) for t, c in svals]\n",
	"tail = wvals[skip_first_n:]\n",
	"if not tail:\n",
	"    # zip(*[]) yields nothing, so the unpacking below would fail with an opaque message.\n",
	"    raise ValueError(f\"skip_first_n={skip_first_n} leaves nothing to plot: only {len(wvals)} distinct tweet counts\")\n",
	"x, y, z = list(zip(*tail))\n",
	"\n",
	"matplotlib.pyplot.figure(figsize=(15, 10))\n",
	"matplotlib.pyplot.scatter(x, y, s=z, alpha=0.3)\n",
	"\n",
	"# sumz is the total number of tweets posted by the plotted accounts (sum of t*c).\n",
	"sumz = sum(z)\n",
	"# Reference markers: at x = `person` tweets per account, y = sumz // person accounts.\n",
	"for person in [1, 2, 3, 5, 7, 15, 30, 70, 150]:\n",
	"    if person < 5:\n",
	"        matplotlib.pyplot.scatter([person,], [sumz//person,], s=[sumz,], c='g', alpha=0.3)\n",
	"    matplotlib.pyplot.scatter([person,], [sumz//person,], s=[person,], c='k', alpha=0.5)\n",
	"    matplotlib.pyplot.text(person, sumz//person, f\"{person:<3} tweets -> {sumz//person}\")\n",
	"\n",
	"# sum(y) is the number of accounts that are actually plotted.\n",
	"matplotlib.pyplot.text(200, 500, f\"current \\n--> {sum(y)} ppl\")\n",
	"\n",
	"ticks = range(0, max(x)+1, 50)\n",
	"# Trailing ';' suppresses the xticks repr (replaces the former bare `1` expression).\n",
	"matplotlib.pyplot.xticks(ticks, [str(tick) for tick in ticks]);"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "promising-dover",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Write the `id` column of df_users to ./engh_uid.csv.\n",
	"df_users.loc[:, 'id'].to_csv('./engh_uid.csv')"
	]
	},
	{
	"cell_type": "markdown",
	"id": "native-bearing",
	"metadata": {},
	"source": [
	"# dirty code "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "exempt-workplace",
	"metadata": {},
	"outputs": [],
	"source": [
	"%%bash\n",
	"# cat engh_ids\n",
	"# cat misazim_ids"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "independent-prison",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Each file is read as whitespace-separated tokens; the first and last token are dropped.\n",
	"# NOTE(review): presumably those two tokens are not ids - confirm against the files.\n",
	"with open('engh_ids') as f:\n",
	"    engh_tokens = f.read().split()\n",
	"    engh = set(engh_tokens[1:-1])\n",
	"\n",
	"with open('misazim_ids') as f:\n",
	"    misz_tokens = f.read().split()\n",
	"    misz = set(misz_tokens[1:-1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "desirable-jacob",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Split the two id sets into: shared ids, ids only in engh, ids only in misz.\n",
	"intrs = engh & misz\n",
	"enghu = engh.difference(misz)\n",
	"miszu = misz.difference(engh)\n",
	"\n",
	"# Displayed as a tuple of the three sizes.\n",
	"(len(intrs), len(enghu), len(miszu))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "animated-owner",
	"metadata": {},
	"outputs": [],
	"source": [
	"# The ids were read from a text file as strings; convert them to int.\n",
	"engh_ids = list(map(int, enghu))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "executed-effort",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build a label from the last four characters and characters 4-6 of each created_at string\n",
	"# for the users whose id is in engh_ids, then list the labels by descending frequency.\n",
	"# NOTE(review): presumably created_at looks like 'Wed Oct 10 20:19:24 +0000 2018' - confirm.\n",
	"sx = df_users.loc[df_users['id'].isin(engh_ids), \"created_at\"].tolist()\n",
	"sy = [f\"{stamp[-4:]} {stamp[4:7]}\" for stamp in sx]\n",
	"collections.Counter(sy).most_common()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "shared-marking",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Display the created_at column of df_users.\n",
	"df_users['created_at']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "intellectual-sandwich",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}