Skip to content

Instantly share code, notes, and snippets.

@buswedg
Created April 14, 2020 00:02
Show Gist options
  • Save buswedg/aebd1010edfa809d82e2ea7789eb0ebc to your computer and use it in GitHub Desktop.
Save buswedg/aebd1010edfa809d82e2ea7789eb0ebc to your computer and use it in GitHub Desktop.
predicting_motogp_winners_revisited/supervised_learning_revisited
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predicting MotoGP winners (revisited)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Supervised Learning (revisited)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"#pd.options.mode.chained_assignment = None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reading in the data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the four raw MotoGP extracts (tab-separated, UTF-8): per-session\n",
"# metadata, qualifying/practice results, race results, and rider details.\n",
"df_motogpsession = pd.read_csv('data/motogpsession.tsv', sep='\\t', encoding='utf-8')\n",
"df_motogpqresult = pd.read_csv('data/motogpqresult.tsv', sep='\\t', encoding='utf-8')\n",
"df_motogprresult = pd.read_csv('data/motogprresult.tsv', sep='\\t', encoding='utf-8')\n",
"df_motogprider = pd.read_csv('data/motogprider.tsv', sep='\\t', encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Drop the auto-generated 'Unnamed: N' columns that appear when a frame is\n",
"# written to disk together with its index and then re-read.\n",
"df_motogpsession = df_motogpsession.loc[:, ~df_motogpsession.columns.str.contains('^Unnamed')]\n",
"df_motogpqresult = df_motogpqresult.loc[:, ~df_motogpqresult.columns.str.contains('^Unnamed')]\n",
"df_motogprresult = df_motogprresult.loc[:, ~df_motogprresult.columns.str.contains('^Unnamed')]\n",
"df_motogprider = df_motogprider.loc[:, ~df_motogprider.columns.str.contains('^Unnamed')]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Bundle the four frames into one dict so downstream helpers take a single\n",
"# argument instead of four separate frames.\n",
"dict_motogpdata = {\n",
"    'session': df_motogpsession,\n",
"    'qresult': df_motogpqresult,\n",
"    'rresult': df_motogprresult,\n",
"    'rider': df_motogprider,\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Generate our set of features and label"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def generate_labelfeat(dict_motodata):\n",
"    \"\"\"Assemble per-race-entry label and feature frames.\n",
"\n",
"    Pivots each rider's practice/qualifying results onto one row per race\n",
"    entry, joins the race result, rider and session data, then derives\n",
"    outcome labels from the race finishing place.\n",
"\n",
"    dict_motodata: dict with keys 'session', 'qresult', 'rresult', 'rider'\n",
"    mapping to the raw DataFrames.\n",
"    Returns (df_motogplabel, df_motogpfeatures).\n",
"    \"\"\"\n",
"    df_motosession = dict_motodata['session']\n",
"    df_motoqresult = dict_motodata['qresult']\n",
"    df_motorresult = dict_motodata['rresult']\n",
"    df_motorider = dict_motodata['rider']\n",
"\n",
"    # Session id -> session type (e.g. 'FP1', 'QP', 'RAC')\n",
"    dict_sessionidsession = df_motosession.set_index('sessionId')['sessionSession'].to_dict()\n",
"\n",
"    # (season, country) -> race session id. 'RAC2' entries are written last\n",
"    # so they take precedence over 'RAC', matching the original preference.\n",
"    dict_racid = {}\n",
"    for racetype in ('RAC', 'RAC2'):\n",
"        df_rac = df_motosession[df_motosession['sessionSession'] == racetype]\n",
"        for index, row in df_rac.iterrows():\n",
"            dict_racid[(row['sessionSeason'], row['sessionCountry'])] = row['sessionId']\n",
"\n",
"    # Session id -> race session id of the same event (NaN when the event\n",
"    # has no race session on record). Precomputing dict_racid replaces the\n",
"    # original per-row dataframe scans (quadratic in session count).\n",
"    dict_sessionidracsessionid = {}\n",
"    for index, row in df_motosession.iterrows():\n",
"        key = (row['sessionSeason'], row['sessionCountry'])\n",
"        dict_sessionidracsessionid[row['sessionId']] = dict_racid.get(key, np.nan)\n",
"\n",
"    # Copy qualifying result dataframe\n",
"    df_temp = df_motoqresult.copy()\n",
"\n",
"    # Add session type and race session id to qresults\n",
"    df_temp['sessionSession'] = df_temp['sessionId'].replace(dict_sessionidsession)\n",
"    df_temp['racsessionId'] = df_temp['sessionId'].replace(dict_sessionidracsessionid)\n",
"\n",
"    # Drop records which are missing a race session id\n",
"    df_temp = df_temp.dropna(subset=['racsessionId'])\n",
"\n",
"    # Composite race-session + rider key so all of a rider's qualifying\n",
"    # sessions can be pivoted onto one row per race entry\n",
"    df_temp['racsessionriderId'] = df_temp['racsessionId'].map(str) + '_' + df_temp['riderId'].map(str)\n",
"    df_temp = df_temp.drop(columns=['sessionId', 'riderId', 'racsessionId'])\n",
"\n",
"    # Pivot: one row per race entry, one column per (stat, session type)\n",
"    df_temp = df_temp.pivot(index='racsessionriderId', columns='sessionSession')\n",
"    df_temp.columns = [str(x) + str(y) for x, y in list(df_temp.columns)]\n",
"    df_temp = df_temp.reset_index()\n",
"\n",
"    # Recover the numeric ids from the composite key\n",
"    df_ids = df_temp.racsessionriderId.str.split('_').apply(pd.Series)\n",
"    df_ids.columns = ['racsessionId', 'riderId']\n",
"    df_temp = pd.concat([df_temp, df_ids], axis=1)\n",
"\n",
"    # Drop the composite key; the race session id becomes the session id\n",
"    df_temp = df_temp.drop(columns=['racsessionriderId'])\n",
"    df_temp = df_temp.rename(columns={'racsessionId': 'sessionId'})\n",
"    df_temp[['sessionId', 'riderId']] = df_temp[['sessionId', 'riderId']].astype(float)\n",
"\n",
"    # Merge race result, rider and session data\n",
"    df_temp = pd.merge(df_temp, df_motorresult, on=['riderId', 'sessionId'], how='left')\n",
"    df_temp = pd.merge(df_temp, df_motorider, on='riderId', how='left')\n",
"    df_temp = pd.merge(df_temp, df_motosession, on='sessionId', how='left')\n",
"\n",
"    # Keep sessionId aside so it can lead the feature frame after the drop\n",
"    list_sessionId = df_temp['sessionId']\n",
"\n",
"    # Drop unnecessary and non-feature fields (drop(columns=...) replaces\n",
"    # the positional axis argument removed in pandas 2.0)\n",
"    df_temp = df_temp.drop(columns=['riderId',\n",
"                                    'riderNumber',\n",
"                                    'rresultTotaltime',\n",
"                                    'rresultAvgspeed',\n",
"                                    'sessionId',\n",
"                                    'sessionSeason',\n",
"                                    'sessionClass',\n",
"                                    'sessionCountry',\n",
"                                    'sessionSession',\n",
"                                    'sessionDate'])\n",
"\n",
"    # Insert sessionId back as the first column in the features dataframe\n",
"    df_temp.insert(0, 'sessionId', list_sessionId)\n",
"\n",
"    # Binary win label: place 1 -> 1, everything else (incl. DNF/NaN) -> 0.\n",
"    # .loc assignments replace the chained indexing that produced the\n",
"    # SettingWithCopyWarning floods in the original run.\n",
"    df_temp['rresultWin'] = df_temp['rresultPlace']\n",
"    df_temp.loc[df_temp['rresultWin'] > 1, 'rresultWin'] = 0\n",
"    df_temp['rresultWin'] = df_temp['rresultWin'].fillna(0)\n",
"\n",
"    # Binary podium label: places 1-3 -> 1, rest -> 0\n",
"    df_temp['rresultPodium'] = df_temp['rresultPlace']\n",
"    df_temp.loc[df_temp['rresultPodium'] <= 3, 'rresultPodium'] = 1\n",
"    df_temp.loc[df_temp['rresultPodium'] > 3, 'rresultPodium'] = 0\n",
"    df_temp['rresultPodium'] = df_temp['rresultPodium'].fillna(0)\n",
"\n",
"    # Binary top-six label: places 1-6 -> 1, rest -> 0\n",
"    df_temp['rresultTopsix'] = df_temp['rresultPlace']\n",
"    df_temp.loc[df_temp['rresultTopsix'] <= 6, 'rresultTopsix'] = 1\n",
"    df_temp.loc[df_temp['rresultTopsix'] > 6, 'rresultTopsix'] = 0\n",
"    df_temp['rresultTopsix'] = df_temp['rresultTopsix'].fillna(0)\n",
"\n",
"    # Ordinal performance label: podium=1, places 4-6=2, rest (incl. NaN)=3\n",
"    df_temp['rresultPerformance'] = df_temp['rresultPlace']\n",
"    df_temp.loc[df_temp['rresultPerformance'] <= 3, 'rresultPerformance'] = 1\n",
"    df_temp.loc[(df_temp['rresultPerformance'] > 3) & (df_temp['rresultPerformance'] <= 6), 'rresultPerformance'] = 2\n",
"    df_temp.loc[df_temp['rresultPerformance'] > 6, 'rresultPerformance'] = 3\n",
"    df_temp['rresultPerformance'] = df_temp['rresultPerformance'].fillna(3)\n",
"\n",
"    # Split labels from features\n",
"    df_motogplabel = df_temp[['rresultPlace', 'rresultWin', 'rresultPodium', 'rresultTopsix', 'rresultPerformance']]\n",
"    df_motogpfeatures = df_temp.drop(columns=['rresultPlace', 'rresultWin', 'rresultPodium', 'rresultTopsix', 'rresultPerformance'])\n",
"\n",
"    return df_motogplabel, df_motogpfeatures"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:98: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:103: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:104: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:109: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:110: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:115: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:116: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:117: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"df_motogplabel, df_motogpfeatures = generate_labelfeat(dict_motogpdata)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"71\n"
]
}
],
"source": [
"print(len(df_motogpfeatures.columns))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def convertdatetime(dt):\n",
"    \"\"\"Convert a 'YYYY-MM-DD HH:MM:SS[.ffffff]' time string to seconds.\n",
"\n",
"    Seconds are measured from 1900-01-01 (the base date used by the scraped\n",
"    data -- TODO confirm). 'None' and unparseable values become NaN.\n",
"    \"\"\"\n",
"\n",
"    import re\n",
"    import numpy as np\n",
"\n",
"    from datetime import datetime\n",
"\n",
"    dt = str(dt)\n",
"\n",
"    if dt == 'None':\n",
"        return np.nan\n",
"\n",
"    else:\n",
"        # Append microseconds when missing so one strptime format works.\n",
"        # Raw string + escaped dot fix the loose '.' of the original pattern.\n",
"        f = r'\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}'\n",
"        r = re.compile(f)\n",
"        if r.match(dt) is None:\n",
"            dt = dt + '.000000'\n",
"\n",
"        try:\n",
"            f = '%Y-%m-%d %H:%M:%S.%f'\n",
"            a = datetime.strptime(dt, f)\n",
"            b = datetime(1900, 1, 1)\n",
"        except ValueError:\n",
"            # Narrowed from a bare except: only parse failures become NaN.\n",
"            return np.nan\n",
"\n",
"        return (a - b).total_seconds()\n",
"\n",
"    \n",
"def preprocess_features(df_in):\n",
"    \"\"\"Return a model-ready, all-numeric copy of the feature frame.\n",
"\n",
"    - 'sessionId' is carried through untouched\n",
"    - best-time columns are converted to seconds via convertdatetime()\n",
"    - remaining object columns are yes/no-mapped then one-hot encoded\n",
"    - all missing values become 0\n",
"    \"\"\"\n",
"\n",
"    import pandas as pd\n",
"\n",
"    list_ignorecolumns = ['sessionId']\n",
"\n",
"    list_timecolumns = ['qresultBesttimeFP',\n",
"                        'qresultBesttimeFP1',\n",
"                        'qresultBesttimeFP2',\n",
"                        'qresultBesttimeFP3',\n",
"                        'qresultBesttimeFP4',\n",
"                        'qresultBesttimeQP',\n",
"                        'qresultBesttimeQP1',\n",
"                        'qresultBesttimeQP2',\n",
"                        'qresultBesttimeQ1',\n",
"                        'qresultBesttimeQ2',\n",
"                        'qresultBesttimeWUP',\n",
"                        'qresultBesttimeWUP2']\n",
"\n",
"    df_out = pd.DataFrame(index=df_in.index)\n",
"\n",
"    # .items() replaces .iteritems(), which was removed in pandas 2.0\n",
"    for col_name, col_values in df_in.items():\n",
"        if col_name in list_ignorecolumns:\n",
"            pass  # keep the id column exactly as-is\n",
"\n",
"        elif col_name in list_timecolumns:\n",
"            col_values = col_values.astype(str)\n",
"            col_values = col_values.apply(convertdatetime)\n",
"\n",
"        elif col_values.dtype == object:\n",
"            col_values = col_values.replace(['yes', 'no'], [1, 0])\n",
"            col_values = pd.get_dummies(col_values, prefix=col_name)\n",
"\n",
"        df_out = df_out.join(col_values)\n",
"\n",
"    df_out = df_out.fillna(0)\n",
"\n",
"    return df_out"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Encode every feature numerically, then keep an id-free copy for scaling\n",
"# and model fitting.\n",
"df_motogpallfeatures = preprocess_features(df_motogpfeatures)\n",
"\n",
"df_motogpallnoidfeatures = df_motogpallfeatures.drop('sessionId', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4776\n"
]
}
],
"source": [
"print(len(df_motogpallnoidfeatures))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the final feature dataframe"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"list_motogpkbestfeatures = ['sessionId',\n",
" 'qresultPlaceFP',\n",
" 'qresultPlaceFP1',\n",
" 'qresultPlaceFP2',\n",
" 'qresultPlaceFP3',\n",
" 'qresultPlaceFP4',\n",
" 'qresultPlaceQ1',\n",
" 'qresultPlaceQ2',\n",
" 'qresultPlaceQP',\n",
" 'qresultPlaceQP1',\n",
" 'qresultPlaceQP2',\n",
" 'qresultPlaceWUP',\n",
" 'qresultBestlapFP',\n",
" 'qresultBestlapFP1',\n",
" 'qresultBestlapFP2',\n",
" 'qresultBestlapFP3',\n",
" 'qresultBestlapFP4',\n",
" 'qresultBestlapQ1',\n",
" 'qresultBestlapQ2',\n",
" 'qresultBestlapQP',\n",
" 'qresultBestlapQP1',\n",
" 'qresultBestlapQP2',\n",
" 'qresultBestlapWUP',\n",
" 'qresultTopspeedFP',\n",
" 'qresultTopspeedFP1',\n",
" 'qresultTopspeedFP2',\n",
" 'qresultTopspeedFP3',\n",
" 'qresultTopspeedFP4',\n",
" 'qresultTopspeedQ1',\n",
" 'qresultTopspeedQ2',\n",
" 'qresultTopspeedQP',\n",
" 'qresultTopspeedQP1',\n",
" 'qresultTopspeedQP2',\n",
" 'qresultTopspeedWUP',\n",
" 'qresultTotallapFP',\n",
" 'qresultTotallapFP1',\n",
" 'qresultTotallapFP2',\n",
" 'qresultTotallapFP3',\n",
" 'qresultTotallapFP4',\n",
" 'qresultTotallapQ1',\n",
" 'qresultTotallapQ2',\n",
" 'qresultTotallapQP',\n",
" 'qresultTotallapQP1',\n",
" 'qresultTotallapQP2',\n",
" 'qresultTotallapWUP',\n",
" 'qresultBesttimeFP',\n",
" 'qresultBesttimeFP1',\n",
" 'qresultBesttimeFP2',\n",
" 'qresultBesttimeFP3',\n",
" 'qresultBesttimeFP4',\n",
" 'qresultBesttimeQ1',\n",
" 'qresultBesttimeQ2',\n",
" 'qresultBesttimeQP',\n",
" 'qresultBesttimeQP1',\n",
" 'qresultBesttimeQP2',\n",
" 'qresultBesttimeWUP']\n",
"\n",
"df_motogpkbestfeatures = df_motogpallfeatures[list_motogpkbestfeatures]\n",
"\n",
"df_motogpkbestnoidfeatures = df_motogpkbestfeatures.drop('sessionId', axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Apply standard and minmax scaling to features"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Standard-scaled (zero mean / unit variance) variant of the k-best features.\n",
"std_scaler = StandardScaler()\n",
"df_motogpkbestnoidstdscaledfeatures = pd.DataFrame(\n",
"    std_scaler.fit_transform(df_motogpkbestnoidfeatures),\n",
"    columns=df_motogpkbestnoidfeatures.columns)\n",
"\n",
"# Min-max-scaled variant, mapped onto the range [0, 100].\n",
"mm_scaler = MinMaxScaler(feature_range=[0, 100])\n",
"df_motogpkbestnoidmmscaledfeatures = pd.DataFrame(\n",
"    mm_scaler.fit_transform(df_motogpkbestnoidfeatures),\n",
"    columns=df_motogpkbestnoidfeatures.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Generate dimensionality reduced features"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.decomposition import FastICA\n",
"from sklearn.random_projection import GaussianRandomProjection\n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"np.random.seed(0)\n",
"\n",
"# Component counts and cluster counts to sweep over.\n",
"ls_comp = [2, 3, 4]\n",
"ls_clust = [2, 3, 4]\n",
"\n",
"# NOTE(review): this rebinds df_motogpfeatures (previously the raw feature\n",
"# frame from generate_labelfeat) to the scaled matrix; the original frame is\n",
"# unreachable after this cell. TODO: use a distinct name.\n",
"#df_motogpfeatures = df_motogpkbestnoidstdscaledfeatures\n",
"df_motogpfeatures = df_motogpkbestnoidmmscaledfeatures\n",
"\n",
"df_motogpclustfeatures = df_motogpkbestfeatures[['sessionId']].copy()\n",
"df_motogpkbestswclustfeatures = df_motogpkbestfeatures.copy()\n",
"\n",
"\n",
"# For each (components, clusters) pair: PCA-reduce, k-means cluster, and\n",
"# append the cluster labels as a new candidate feature column.\n",
"for i, j in list(itertools.product(ls_comp, ls_clust)):\n",
"    pca = PCA(n_components=i, whiten=True).fit(df_motogpfeatures)\n",
"\n",
"    df_reduced_data = pd.DataFrame(pca.transform(df_motogpfeatures))\n",
"\n",
"    clusterer = KMeans(n_clusters=j).fit(df_reduced_data)\n",
"    cluster_labels = clusterer.labels_\n",
"\n",
"    df_motogpclustfeatures['PCA_' + str(i) + '_' + str(j)] = cluster_labels\n",
"    df_motogpkbestswclustfeatures['PCA_' + str(i) + '_' + str(j)] = cluster_labels\n",
" \n",
" \n",
"# ICA variant of the reduce-then-cluster feature generation.\n",
"for i, j in list(itertools.product(ls_comp, ls_clust)):\n",
"    ica = FastICA(n_components=i).fit(df_motogpfeatures)\n",
"\n",
"    # BUG FIX: the original called pca.transform here (the PCA left over\n",
"    # from the previous loop), so the fitted ICA was never actually used.\n",
"    df_reduced_data = pd.DataFrame(ica.transform(df_motogpfeatures))\n",
"\n",
"    clusterer = KMeans(n_clusters=j).fit(df_reduced_data)\n",
"    cluster_labels = clusterer.labels_\n",
"\n",
"    df_motogpclustfeatures['ICA_' + str(i) + '_' + str(j)] = cluster_labels\n",
"    df_motogpkbestswclustfeatures['ICA_' + str(i) + '_' + str(j)] = cluster_labels\n",
"\n",
" \n",
"for i, j in list(itertools.product(ls_comp, ls_clust)):\n",
" rca = GaussianRandomProjection(n_components=i, random_state=10).fit(df_motogpfeatures)\n",
"\n",
" df_reduced_data = pd.DataFrame(rca.transform(df_motogpfeatures))\n",
"\n",
" clusterer = KMeans(n_clusters=j).fit(df_reduced_data)\n",
" cluster_labels = clusterer.labels_\n",
"\n",
" df_motogpclustfeatures['RP_' + str(i) + '_' + str(j)] = cluster_labels\n",
" df_motogpkbestswclustfeatures['RP_' + str(i) + '_' + str(j)] = cluster_labels"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create some helpers for our supervised learning routines"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def shuffle_split_data(y_true_all, X_all, test_size, random_state=None):\n",
"    \"\"\"Shuffle and split the data into train and test partitions.\n",
"\n",
"    Note the non-standard return order: X_train, y_true_train, X_test,\n",
"    y_true_test. random_state (new, default None) makes the split\n",
"    reproducible when set; the default preserves the original behaviour.\n",
"    \"\"\"\n",
"\n",
"    from sklearn.model_selection import train_test_split\n",
"\n",
"    X_train, X_test, y_true_train, y_true_test = train_test_split(\n",
"        X_all, y_true_all, test_size=test_size, random_state=random_state)\n",
"\n",
"    return X_train, y_true_train, X_test, y_true_test"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def eval_clf(clf, X, y_true, metric):\n",
"    \"\"\"Score a fitted classifier on X with the given metric.\n",
"\n",
"    Column 0 of X is assumed to be sessionId and the remaining columns the\n",
"    model features -- TODO confirm against the upstream X_all construction.\n",
"    Raw positive-class probabilities are collapsed to one 0/1 pick per\n",
"    session by convert_pred() before scoring.\n",
"    \"\"\"\n",
"\n",
"    # Exclude the id column from the model input; predict_proba column 1 is\n",
"    # the positive-class probability.\n",
"    y_pred = clf.predict_proba(X[:, 1:])\n",
"    y_pred = convert_pred(X[:, 0], y_pred[:, 1])\n",
"\n",
"    score = performance_metric(y_true, y_pred, metric)\n",
"\n",
"    return score"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def convert_pred(sessionId, y_pred_orig):\n",
"    \"\"\"Collapse per-rider probabilities into one 0/1 pick per session.\n",
"\n",
"    Within each session, the row(s) holding the maximum predicted\n",
"    probability are marked 1 -- but only when that maximum reaches 0.5;\n",
"    every other row (and every row of an unconfident session) stays 0.\n",
"    \"\"\"\n",
"\n",
"    import pandas as pd\n",
"\n",
"    df_pred = pd.DataFrame({'sessionId': sessionId, 'y_pred_orig': y_pred_orig})\n",
"\n",
"    # Per-row view of that row's session-wide maximum probability.\n",
"    session_max = df_pred.groupby('sessionId')['y_pred_orig'].transform('max')\n",
"\n",
"    # A row is picked iff it holds the session maximum and that maximum\n",
"    # clears the 0.5 confidence threshold.\n",
"    is_pick = (df_pred['y_pred_orig'] == session_max) & (session_max >= 0.5)\n",
"\n",
"    return is_pick.astype(int).values"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def performance_metric(y_true, y_pred, metric):\n",
"    \"\"\"Compute the named classification score for y_pred against y_true.\n",
"\n",
"    metric is one of 'accuracy', 'f1', 'recall', 'precision'. An unknown\n",
"    metric now raises ValueError (the original if-chain left 'score'\n",
"    unassigned and crashed with UnboundLocalError).\n",
"    \"\"\"\n",
"\n",
"    from sklearn.metrics import accuracy_score\n",
"    from sklearn.metrics import f1_score\n",
"    from sklearn.metrics import recall_score\n",
"    from sklearn.metrics import precision_score\n",
"\n",
"    dict_scorers = {'accuracy': accuracy_score,\n",
"                    'f1': f1_score,\n",
"                    'recall': recall_score,\n",
"                    'precision': precision_score}\n",
"\n",
"    if metric not in dict_scorers:\n",
"        raise ValueError('unknown metric: ' + repr(metric))\n",
"\n",
"    return dict_scorers[metric](y_true, y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Perform gridsearch cross validation optimization"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def build_clf_list(clf_select):\n",
"    \"\"\"Map integer selectors to (ref, classifier-expression, grid) triples.\n",
"\n",
"    Selectors are processed in ascending numeric order regardless of their\n",
"    order in clf_select, matching the original if-chain. The classifier\n",
"    entries are constructor-expression strings that build_pipe() eval()s.\n",
"    Replaces nine copy-pasted if-blocks with one data-driven table.\n",
"    \"\"\"\n",
"\n",
"    # selector -> (pipeline step name, constructor expression, param grid)\n",
"    dict_options = {\n",
"        1: ('mmscale', 'MinMaxScaler()', {}),\n",
"        2: ('stdscale', 'StandardScaler()', {}),\n",
"        3: ('skb', 'SelectKBest()',\n",
"            {'k': [2, 4, 6, 8, 10, 12, 14, 16, 'all']}),\n",
"        4: ('naive', 'GaussianNB()', {}),\n",
"        # http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html\n",
"        5: ('dt', 'DecisionTreeClassifier()',\n",
"            {'criterion': ['gini', 'entropy'],  # default='gini'\n",
"             'splitter': ['random', 'best'],  # default='best'\n",
"             'max_depth': [1, 2, 3, 4, 5, 6, 7],  # default=None\n",
"             'max_features': ['auto', None]}),  # default=None\n",
"        # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html\n",
"        6: ('dtb', 'AdaBoostClassifier(DecisionTreeClassifier())',\n",
"            {'base_estimator__max_depth': [1, 2, 3, 4, 5, 6, 7],  # default=None\n",
"             'n_estimators': [10, 15, 20, 25, 30, 35, 40],  # default=50\n",
"             'learning_rate': [0.001, 0.01, 0.1, 1.0]}),  # default=1.\n",
"        # http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html\n",
"        7: ('linsvc', 'SVC()',\n",
"            {'kernel': ['rbf'],  # default='rbf'; ['rbf', 'linear', 'poly'] possible\n",
"             'C': [0.001, 0.01, 0.1, 1.0],  # default=1.0\n",
"             'gamma': [0.0001, 0.001, 0.01, 0.1, 'auto'],  # default='auto'\n",
"             'tol': [0.00001, 0.0001, 0.001],  # default=1e-3\n",
"             'probability': [True]}),  # needed for predict_proba in eval_clf\n",
"        # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html\n",
"        8: ('knn', 'KNeighborsClassifier()',\n",
"            {'n_neighbors': [2, 3, 4, 5, 6],  # default = 5\n",
"             'leaf_size': [10, 20, 30, 40, 50],  # default = 30\n",
"             'n_jobs': [-1]}),  # default = 1\n",
"        # http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html\n",
"        9: ('mlp', 'MLPClassifier()',\n",
"            {'solver': ['lbfgs', 'sgd', 'adam'],  # default 'adam'\n",
"             'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],  # default 0.0001\n",
"             'tol': [0.000001, 0.00001, 0.0001, 0.001]}),  # default=1e-4\n",
"    }\n",
"\n",
"    list_ref = []\n",
"    list_clf = []\n",
"    list_param = []\n",
"\n",
"    for key in sorted(dict_options):\n",
"        if key in clf_select:\n",
"            ref, clf, dict_param = dict_options[key]\n",
"            list_ref.append(ref)\n",
"            list_clf.append(clf)\n",
"            list_param.append(dict_param)\n",
"\n",
"    return list_ref, list_clf, list_param"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def eval_clf_list(list_clf, X, y_true, metric):\n",
"    \"\"\"Score every fitted classifier in list_clf on (X, y_true).\n",
"\n",
"    BUG FIX: the original ignored the 'metric' argument and always scored\n",
"    with 'f1'; the requested metric is now passed through to eval_clf().\n",
"    \"\"\"\n",
"    list_clfscore = []\n",
"\n",
"    for clf in list_clf:\n",
"        score = eval_clf(clf, X, y_true, metric)\n",
"        list_clfscore.append(score)\n",
"\n",
"    return list_clfscore"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def build_pipe(ref, clf, dict_param):\n",
"    \"\"\"Turn one (ref, classifier-expression, grid) triple into pipeline parts.\n",
"\n",
"    Returns a one-element list holding the (ref, estimator) pipeline step\n",
"    plus the parameter grid rewritten with the 'ref__param' key prefix that\n",
"    GridSearchCV uses to route parameters to pipeline steps.\n",
"    \"\"\"\n",
"    from sklearn.preprocessing import MinMaxScaler\n",
"    from sklearn.preprocessing import StandardScaler\n",
"    from sklearn.feature_selection import SelectKBest\n",
"    from sklearn.naive_bayes import GaussianNB\n",
"    from sklearn.tree import DecisionTreeClassifier\n",
"    from sklearn.ensemble import AdaBoostClassifier\n",
"    from sklearn.svm import SVC, LinearSVC\n",
"    from sklearn.neighbors import KNeighborsClassifier\n",
"    from sklearn.neural_network import MLPClassifier\n",
"\n",
"    list_piperef = []\n",
"    dict_pipeparam = {}\n",
"\n",
"    # NOTE: eval() is only safe because 'clf' comes from the hard-coded\n",
"    # strings in build_clf_list(); never feed it untrusted input.\n",
"    list_piperef.append((ref, eval(clf)))\n",
"\n",
"    # Prefix each grid key with the step name for GridSearchCV routing.\n",
"    for key, value in dict_param.items():\n",
"        dict_pipeparam[ref + \"__\" + key] = value\n",
"\n",
"    return list_piperef, dict_pipeparam"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def build_pipe_list(list_ref, list_clf, list_param):\n",
"    \"\"\"Combine per-classifier steps and grids into one pipeline spec.\n",
"\n",
"    Returns the list of (name, estimator) steps for sklearn.Pipeline and\n",
"    the merged, prefixed parameter grid for GridSearchCV. (The unused\n",
"    'import itertools' of the original has been removed.)\n",
"    \"\"\"\n",
"    list_piperefs = []\n",
"    dict_pipeparams = {}\n",
"\n",
"    for ref, clf, dict_param in zip(list_ref, list_clf, list_param):\n",
"        list_piperef, dict_pipeparam = build_pipe(ref, clf, dict_param)\n",
"\n",
"        # build_pipe returns a single (name, estimator) step\n",
"        list_piperefs.append(list_piperef[0])\n",
"        dict_pipeparams.update(dict_pipeparam)\n",
"\n",
"    return list_piperefs, dict_pipeparams"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def execute_pipe(clf_select, X_all, X_test, y_true_all, y_true_test):\n",
"    \"\"\"Grid-search the selected classifier(s) and score the best estimator.\n",
"\n",
"    clf_select: list of integer codes understood by build_clf_list().\n",
"    Returns [classifier names, best params, f1, recall, precision, minutes].\n",
"\n",
"    NOTE(review): the search is fitted on X_all, which includes the X_test\n",
"    rows, so the reported test-set scores are optimistic; fitting on the\n",
"    training partition would give an unbiased estimate. TODO confirm this\n",
"    is intentional.\n",
"    \"\"\"\n",
"\n",
"    import timeit\n",
"\n",
"    from sklearn.pipeline import Pipeline\n",
"    #from sklearn.model_selection import StratifiedShuffleSplit\n",
"    from sklearn.model_selection import GridSearchCV\n",
"\n",
"    start = timeit.default_timer()\n",
"\n",
"    # Build the pipeline steps and the step-prefixed parameter grid.\n",
"    list_ref, list_clf, list_param = build_clf_list(clf_select)\n",
"    list_piperefs, dict_pipeparams = build_pipe_list(list_ref, list_clf, list_param)\n",
"\n",
"    pipe = Pipeline(list_piperefs)\n",
"    #cv = StratifiedShuffleSplit(y_true_all, test_size=0.3)\n",
"    \n",
"    # Fixed seed so the (default) CV splits are reproducible.\n",
"    np.random.seed(0)\n",
"\n",
"    grid_search = GridSearchCV(pipe, dict_pipeparams, n_jobs=1, scoring='f1')\n",
"    \n",
"    # Column 0 is sessionId and is excluded from the model input.\n",
"    grid_search.fit(X_all[:, 1:], y_true_all)\n",
"\n",
"    stop = timeit.default_timer()\n",
"\n",
"    # Wall-clock duration in minutes.\n",
"    time = (stop - start) / 60\n",
"\n",
"    clf_best = grid_search.best_estimator_\n",
"    # print(clf_best)\n",
"\n",
"    param_best = grid_search.best_params_\n",
"    # print(param_best)\n",
"\n",
"    score_best = grid_search.best_score_\n",
"    # print(score_best)\n",
"\n",
"    # Re-score the winning estimator on the nominal test partition.\n",
"    f1score = eval_clf(clf_best, X_test, y_true_test, 'f1')\n",
"    recall = eval_clf(clf_best, X_test, y_true_test, 'recall')\n",
"    precision = eval_clf(clf_best, X_test, y_true_test, 'precision')\n",
"    # print(\"Recall:\", recall, \"Precision:\", precision, \"F1 Score:\", f1score)\n",
"\n",
"    # list_results = [list_clf[0], param_best, f1score, recall, precision, time]\n",
"    list_results = [list_clf, param_best, f1score, recall, precision, time]\n",
"\n",
"    return list_results"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Feature matrix: k-best qualifying stats plus appended cluster labels;\n",
"# column 0 is sessionId (stripped off inside execute_pipe/eval_clf).\n",
"#X_all = df_motogpclustfeatures.values\n",
"X_all = df_motogpkbestswclustfeatures.values\n",
"\n",
"# Label: outright race win (the podium variant is commented out).\n",
"y_true_all = df_motogplabel['rresultWin'].values\n",
"#y_true_all = df_motogplabel['rresultPodium'].values\n",
"\n",
"X_train, y_true_train, X_test, y_true_test = shuffle_split_data(y_true_all, X_all, 0.25)\n",
"\n",
"# Selector codes map to classifiers in build_clf_list (5=tree, 6=AdaBoost,\n",
"# 7=SVC, 8=kNN, 9=MLP); each runs as its own one-step pipeline.\n",
"clf_select = [[5], [6], [7], [8], [9]]\n",
"\n",
"df_motogpresults = pd.DataFrame(columns=['clf', 'param', 'f1', 'recall', 'precision', 'time'])\n",
"\n",
"# NOTE(review): X_train/y_true_train are never used -- execute_pipe fits on\n",
"# the full X_all, so X_test is not a held-out set. TODO confirm intent.\n",
"for c in clf_select:\n",
"    list_results = execute_pipe(c, X_all, X_test, y_true_all, y_true_test)\n",
"    df_motogpresults.loc[len(df_motogpresults.index)] = list_results"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>clf</th>\n",
" <th>param</th>\n",
" <th>f1</th>\n",
" <th>recall</th>\n",
" <th>precision</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[DecisionTreeClassifier()]</td>\n",
" <td>{'dt__criterion': 'gini', 'dt__max_depth': 6, ...</td>\n",
" <td>0.131148</td>\n",
" <td>0.071429</td>\n",
" <td>0.800000</td>\n",
" <td>0.161972</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[AdaBoostClassifier(DecisionTreeClassifier())]</td>\n",
" <td>{'dtb__base_estimator__max_depth': 2, 'dtb__le...</td>\n",
" <td>0.574468</td>\n",
" <td>0.482143</td>\n",
" <td>0.710526</td>\n",
" <td>26.477024</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[SVC()]</td>\n",
" <td>{'linsvc__C': 0.001, 'linsvc__gamma': 0.0001, ...</td>\n",
" <td>0.035088</td>\n",
" <td>0.017857</td>\n",
" <td>1.000000</td>\n",
" <td>54.810338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[KNeighborsClassifier()]</td>\n",
" <td>{'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn...</td>\n",
" <td>0.131148</td>\n",
" <td>0.071429</td>\n",
" <td>0.800000</td>\n",
" <td>0.558008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[MLPClassifier()]</td>\n",
" <td>{'mlp__alpha': 0.01, 'mlp__solver': 'adam', 'm...</td>\n",
" <td>0.294737</td>\n",
" <td>0.250000</td>\n",
" <td>0.358974</td>\n",
" <td>8.248912</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" clf \\\n",
"0 [DecisionTreeClassifier()] \n",
"1 [AdaBoostClassifier(DecisionTreeClassifier())] \n",
"2 [SVC()] \n",
"3 [KNeighborsClassifier()] \n",
"4 [MLPClassifier()] \n",
"\n",
" param f1 recall \\\n",
"0 {'dt__criterion': 'gini', 'dt__max_depth': 6, ... 0.131148 0.071429 \n",
"1 {'dtb__base_estimator__max_depth': 2, 'dtb__le... 0.574468 0.482143 \n",
"2 {'linsvc__C': 0.001, 'linsvc__gamma': 0.0001, ... 0.035088 0.017857 \n",
"3 {'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn... 0.131148 0.071429 \n",
"4 {'mlp__alpha': 0.01, 'mlp__solver': 'adam', 'm... 0.294737 0.250000 \n",
"\n",
" precision time \n",
"0 0.800000 0.161972 \n",
"1 0.710526 26.477024 \n",
"2 1.000000 54.810338 \n",
"3 0.800000 0.558008 \n",
"4 0.358974 8.248912 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_motogpresults"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Repeat the classifier sweep using the podium (top-3 finish) label.\n",
"#X_all = df_motogpclustfeatures.values\n",
"X_all = df_motogpkbestswclustfeatures.values\n",
"\n",
"#y_true_all = df_motogplabel['rresultWin'].values\n",
"y_true_all = df_motogplabel['rresultPodium'].values\n",
"\n",
"X_train, y_true_train, X_test, y_true_test = shuffle_split_data(y_true_all, X_all, 0.25)\n",
"\n",
"clf_select = [[5], [6], [7], [8], [9]]\n",
"\n",
"df_motogpresults = pd.DataFrame(columns=['clf', 'param', 'f1', 'recall', 'precision', 'time'])\n",
"\n",
"# NOTE(review): X_train/y_true_train are unused -- execute_pipe fits on the\n",
"# full X_all, so X_test is not a true hold-out set. TODO confirm intent.\n",
"for c in clf_select:\n",
"    list_results = execute_pipe(c, X_all, X_test, y_true_all, y_true_test)\n",
"    df_motogpresults.loc[len(df_motogpresults.index)] = list_results"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>clf</th>\n",
" <th>param</th>\n",
" <th>f1</th>\n",
" <th>recall</th>\n",
" <th>precision</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[DecisionTreeClassifier()]</td>\n",
" <td>{'dt__criterion': 'gini', 'dt__max_depth': 7, ...</td>\n",
" <td>0.619217</td>\n",
" <td>0.491525</td>\n",
" <td>0.836538</td>\n",
" <td>0.152150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[AdaBoostClassifier(DecisionTreeClassifier())]</td>\n",
" <td>{'dtb__base_estimator__max_depth': 1, 'dtb__le...</td>\n",
" <td>0.552147</td>\n",
" <td>0.508475</td>\n",
" <td>0.604027</td>\n",
" <td>27.210125</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[SVC()]</td>\n",
" <td>{'linsvc__C': 1.0, 'linsvc__gamma': 0.001, 'li...</td>\n",
" <td>0.508197</td>\n",
" <td>0.350282</td>\n",
" <td>0.925373</td>\n",
" <td>91.842384</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[KNeighborsClassifier()]</td>\n",
" <td>{'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn...</td>\n",
" <td>0.686275</td>\n",
" <td>0.593220</td>\n",
" <td>0.813953</td>\n",
" <td>0.254748</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[MLPClassifier()]</td>\n",
" <td>{'mlp__alpha': 1e-05, 'mlp__solver': 'adam', '...</td>\n",
" <td>0.539792</td>\n",
" <td>0.440678</td>\n",
" <td>0.696429</td>\n",
" <td>3.805431</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" clf \\\n",
"0 [DecisionTreeClassifier()] \n",
"1 [AdaBoostClassifier(DecisionTreeClassifier())] \n",
"2 [SVC()] \n",
"3 [KNeighborsClassifier()] \n",
"4 [MLPClassifier()] \n",
"\n",
" param f1 recall \\\n",
"0 {'dt__criterion': 'gini', 'dt__max_depth': 7, ... 0.619217 0.491525 \n",
"1 {'dtb__base_estimator__max_depth': 1, 'dtb__le... 0.552147 0.508475 \n",
"2 {'linsvc__C': 1.0, 'linsvc__gamma': 0.001, 'li... 0.508197 0.350282 \n",
"3 {'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn... 0.686275 0.593220 \n",
"4 {'mlp__alpha': 1e-05, 'mlp__solver': 'adam', '... 0.539792 0.440678 \n",
"\n",
" precision time \n",
"0 0.836538 0.152150 \n",
"1 0.604027 27.210125 \n",
"2 0.925373 91.842384 \n",
"3 0.813953 0.254748 \n",
"4 0.696429 3.805431 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_motogpresults"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finish off with a naive benchmark for comparison: simply predict that the winner of the previous race wins the current one"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def benchmark(dict_motodata):\n",
"    \"\"\"Naive baseline: predict that each race is won by the previous race's winner.\n",
"\n",
"    Iterates over race sessions in order; a correct prediction counts as one\n",
"    true positive, an incorrect one as one false positive and one false\n",
"    negative (so recall and precision are equal by construction here).\n",
"\n",
"    Parameters:\n",
"        dict_motodata: dict holding a 'rresult' DataFrame with at least the\n",
"            columns 'sessionId', 'rresultPlace' and 'riderId'.\n",
"\n",
"    Returns:\n",
"        (recall, precision, f1score) as floats; all 0.0 when nothing could be\n",
"        predicted or no prediction was correct (avoids division by zero).\n",
"    \"\"\"\n",
"    tp = 0\n",
"    fp = 0\n",
"    fn = 0\n",
"\n",
"    df_motorresult = dict_motodata['rresult']\n",
"\n",
"    list_motorresultsessid = df_motorresult[\"sessionId\"].unique()\n",
"\n",
"    for i in range(1, len(list_motorresultsessid)):\n",
"        currsessionId = list_motorresultsessid[i]\n",
"        prevsessionId = list_motorresultsessid[i-1]\n",
"\n",
"        # Winner (place 1) of the previous session is the naive prediction\n",
"        pred = df_motorresult[(df_motorresult['sessionId'] == prevsessionId) & \\\n",
"                              (df_motorresult['rresultPlace'] == 1)]['riderId'].iloc[0]\n",
"\n",
"        # Actual winner of the current session\n",
"        win = df_motorresult[(df_motorresult['sessionId'] == currsessionId) & \\\n",
"                             (df_motorresult['rresultPlace'] == 1)]['riderId'].iloc[0]\n",
"\n",
"        if win == pred:\n",
"            tp += 1\n",
"        else:\n",
"            fp += 1\n",
"            fn += 1\n",
"\n",
"    # Guard every ratio: the original raised ZeroDivisionError when tp == 0\n",
"    # (precision + recall == 0) or when fewer than two sessions were present.\n",
"    recall = float(tp) / (tp+fn) if (tp+fn) else 0.0\n",
"    precision = float(tp) / (tp+fp) if (tp+fp) else 0.0\n",
"    f1score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0\n",
"\n",
"    return recall, precision, f1score"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"F1 Score: 0.3146551724137931\n"
]
}
],
"source": [
"# Score the naive previous-winner baseline for comparison with the models above\n",
"recall, precision, f1score = benchmark(dict_motogpdata)\n",
"\n",
"print(\"F1 Score:\", f1score)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment