Skip to content

Instantly share code, notes, and snippets.

@zxdawn
Created March 15, 2022 14:52
Show Gist options
  • Save zxdawn/a20d049300c382f289a2e2b0f786b124 to your computer and use it in GitHub Desktop.
Save zxdawn/a20d049300c382f289a2e2b0f786b124 to your computer and use it in GitHub Desktop.
A notebook that clusters lightning stroke data into flash data.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook aims to cluster the GLD360 lightning stroke data into flash data."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.cluster import DBSCAN\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read the GLD360 stroke data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the GLD360 stroke records and parse the timestamp column.\n",
"df = pd.read_csv('../data/gld360/Arctic Lightning August 10-13.csv')\n",
"df['time'] = pd.to_datetime(df['time'])\n",
"\n",
"# Seconds elapsed since the earliest stroke in the file.\n",
"first_stroke_time = df['time'].min()\n",
"df['delta'] = df['time'].sub(first_stroke_time).dt.total_seconds()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cluster by time (500 ms)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Strokes whose times chain together with gaps of at most 0.5 s share one time cluster.\n",
"# min_samples=1 makes every stroke a core point, so no stroke is labelled as noise.\n",
"stroke_seconds = df['delta'].to_numpy().reshape(-1, 1)\n",
"db = DBSCAN(eps=0.5, min_samples=1, algorithm='ball_tree')\n",
"cluster_labels = db.fit_predict(stroke_seconds)\n",
"# store the cluster id in a new column named \"time_label\"\n",
"df['time_label'] = cluster_labels"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cluster by location (20 km)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def db_loc(df, radius_km=20):\n",
"    \"\"\"Label strokes that lie within ``radius_km`` of each other on the sphere.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Strokes with 'latitude' and 'longitude' columns in degrees.\n",
"    radius_km : float, default 20\n",
"        Neighbourhood radius in kilometres.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.Series\n",
"        One integer cluster label per row of ``df``, in row order, on a fresh 0..n-1 index.\n",
"    \"\"\"\n",
"    # the haversine metric expects [latitude, longitude] pairs in radians\n",
"    coords = np.radians(df[['latitude', 'longitude']].to_numpy())\n",
"    kms_per_radian = 6371.0088  # mean Earth radius in km\n",
"\n",
"    # express the search radius as an angle on the unit sphere\n",
"    epsilon = radius_km / kms_per_radian\n",
"\n",
"    # metric='haversine' is required: the default Euclidean metric treats a radian of\n",
"    # longitude like a radian of latitude, shrinking the east-west radius by cos(latitude)\n",
"    db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(coords)\n",
"    return pd.Series(db.labels_)\n",
"\n",
"# save to new column named \"loc_label\"\n",
"# Write each time cluster's labels back at that cluster's own row positions, so the result\n",
"# does not depend on the rows already being ordered by time_label or on the group sizes.\n",
"loc_labels = np.empty(len(df), dtype=np.int64)\n",
"for positions in df.groupby('time_label').indices.values():\n",
"    loc_labels[positions] = db_loc(df.iloc[positions]).to_numpy()\n",
"df['loc_label'] = loc_labels"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# group strokes that share both the time cluster and the location cluster\n",
"flash_keys = ['time_label', 'loc_label']\n",
"comb_grp = df.groupby(flash_keys)\n",
"\n",
"# use the first stroke (in row order) of each group as the flash time and position;\n",
"# index it by the group keys so it lines up with per-group aggregates on any pandas version\n",
"# (since pandas 2.0, nth(0) keeps the original row index instead of the group keys)\n",
"is_first_stroke = comb_grp.cumcount() == 0\n",
"first_strokes = (\n",
"    df.loc[is_first_stroke, flash_keys + ['time', 'longitude', 'latitude']]\n",
"    .set_index(flash_keys)\n",
"    .sort_index()\n",
")\n",
"\n",
"# a flash is flagged as cloud only if all of its strokes are cloud strokes;\n",
"# if any stroke is CG, the flash is CG\n",
"cloud_flag = comb_grp['cloud'].all().rename('cloud')\n",
"df_flash = first_strokes.assign(cloud=cloud_flag).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# # save the data\n",
"# df_flash.to_csv('arctic_flash_test.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Case: inspect the strokes of one time cluster (time_label 848)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>time</th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>signalStrengthKA</th>\n",
" <th>cloud</th>\n",
" <th>delta</th>\n",
" <th>time_label</th>\n",
" <th>loc_label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1600</th>\n",
" <td>2019-08-12 13:22:35.924000+00:00</td>\n",
" <td>90.0627</td>\n",
" <td>78.8102</td>\n",
" <td>-36.4</td>\n",
" <td>False</td>\n",
" <td>219810.582</td>\n",
" <td>848</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1601</th>\n",
" <td>2019-08-12 13:22:35.954000+00:00</td>\n",
" <td>89.9164</td>\n",
" <td>78.7724</td>\n",
" <td>-8.1</td>\n",
" <td>False</td>\n",
" <td>219810.612</td>\n",
" <td>848</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1602</th>\n",
" <td>2019-08-12 13:22:35.993000+00:00</td>\n",
" <td>89.7954</td>\n",
" <td>78.8150</td>\n",
" <td>-17.0</td>\n",
" <td>False</td>\n",
" <td>219810.651</td>\n",
" <td>848</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1603</th>\n",
" <td>2019-08-12 13:22:36.020000+00:00</td>\n",
" <td>89.9286</td>\n",
" <td>78.7988</td>\n",
" <td>-27.5</td>\n",
" <td>False</td>\n",
" <td>219810.678</td>\n",
" <td>848</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1604</th>\n",
" <td>2019-08-12 13:22:36.047000+00:00</td>\n",
" <td>89.8578</td>\n",
" <td>78.7879</td>\n",
" <td>-14.4</td>\n",
" <td>False</td>\n",
" <td>219810.705</td>\n",
" <td>848</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1605</th>\n",
" <td>2019-08-12 13:22:36.093000+00:00</td>\n",
" <td>89.1329</td>\n",
" <td>78.7320</td>\n",
" <td>22.5</td>\n",
" <td>False</td>\n",
" <td>219810.751</td>\n",
" <td>848</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1606</th>\n",
" <td>2019-08-12 13:22:36.107000+00:00</td>\n",
" <td>89.8719</td>\n",
" <td>78.7802</td>\n",
" <td>5.5</td>\n",
" <td>True</td>\n",
" <td>219810.765</td>\n",
" <td>848</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" time longitude latitude signalStrengthKA \\\n",
"1600 2019-08-12 13:22:35.924000+00:00 90.0627 78.8102 -36.4 \n",
"1601 2019-08-12 13:22:35.954000+00:00 89.9164 78.7724 -8.1 \n",
"1602 2019-08-12 13:22:35.993000+00:00 89.7954 78.8150 -17.0 \n",
"1603 2019-08-12 13:22:36.020000+00:00 89.9286 78.7988 -27.5 \n",
"1604 2019-08-12 13:22:36.047000+00:00 89.8578 78.7879 -14.4 \n",
"1605 2019-08-12 13:22:36.093000+00:00 89.1329 78.7320 22.5 \n",
"1606 2019-08-12 13:22:36.107000+00:00 89.8719 78.7802 5.5 \n",
"\n",
" cloud delta time_label loc_label \n",
"1600 False 219810.582 848 0 \n",
"1601 False 219810.612 848 0 \n",
"1602 False 219810.651 848 0 \n",
"1603 False 219810.678 848 0 \n",
"1604 False 219810.705 848 0 \n",
"1605 False 219810.751 848 1 \n",
"1606 True 219810.765 848 0 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show every stroke assigned to time cluster 848\n",
"df.loc[df['time_label'].eq(848)]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots()\n",
"\n",
"# strokes of one flash: time cluster 848, location cluster 0\n",
"in_flash = (df['time_label'] == 848) & (df['loc_label'] == 0)\n",
"test = df[in_flash]\n",
"\n",
"# plot the flash location (large red marker), looked up by the time of the first stroke\n",
"flash_time = test['time'].iloc[0]\n",
"flash_row = df_flash.loc[df_flash['time'] == flash_time]\n",
"flash_row.plot.scatter(ax=ax, x='longitude', y='latitude', c='r', s=50)\n",
"\n",
"# plot the stroke locations and annotate each one with its time\n",
"test.plot.scatter(x='longitude', y='latitude', ax=ax)\n",
"for lon, lat, stamp in zip(test['longitude'], test['latitude'], test['time']):\n",
"    ax.text(lon, lat, stamp.strftime('%H:%M:%S:%f'))"
]
}
],
"metadata": {
"interpreter": {
"hash": "5c59143dddf9bb7cb19cf5c4e65abac3edd3adb579b9c354e95a8774ecff661b"
},
"kernelspec": {
"display_name": "Python 3.9.7 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment