Created
April 14, 2020 00:02
-
-
Save buswedg/aebd1010edfa809d82e2ea7789eb0ebc to your computer and use it in GitHub Desktop.
predicting_motogp_winners_revisited\supervised_learning_revisited
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Predicting MotoGP winners (revisited)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Supervised Learning (revisited)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | |
"\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"#pd.options.mode.chained_assignment = None" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Reading in the data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_motogpsession = pd.read_csv('data/motogpsession.tsv', sep='\\t', encoding='utf-8')\n", | |
"df_motogpqresult = pd.read_csv('data/motogpqresult.tsv', sep='\\t', encoding='utf-8')\n", | |
"df_motogprresult = pd.read_csv('data/motogprresult.tsv', sep='\\t', encoding='utf-8')\n", | |
"df_motogprider = pd.read_csv('data/motogprider.tsv', sep='\\t', encoding='utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_motogpsession = df_motogpsession.loc[:, ~df_motogpsession.columns.str.contains('^Unnamed')]\n", | |
"df_motogpqresult = df_motogpqresult.loc[:, ~df_motogpqresult.columns.str.contains('^Unnamed')]\n", | |
"df_motogprresult = df_motogprresult.loc[:, ~df_motogprresult.columns.str.contains('^Unnamed')]\n", | |
"df_motogprider = df_motogprider.loc[:, ~df_motogprider.columns.str.contains('^Unnamed')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dict_motogpdata = {}\n", | |
"\n", | |
"dict_motogpdata['session'] = df_motogpsession\n", | |
"dict_motogpdata['qresult'] = df_motogpqresult\n", | |
"dict_motogpdata['rresult'] = df_motogprresult\n", | |
"dict_motogpdata['rider'] = df_motogprider" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Generate our set of features and label" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def generate_labelfeat(dict_motodata):\n", | |
" \"\"\" \"\"\"\n", | |
"\n", | |
" df_motosession = dict_motodata['session']\n", | |
" df_motoqresult = dict_motodata['qresult']\n", | |
" df_motorresult = dict_motodata['rresult']\n", | |
" df_motorider = dict_motodata['rider']\n", | |
"\n", | |
" # Create dictonary for session id to session type\n", | |
" dict_sessionidsession = df_motosession.set_index('sessionId')['sessionSession'].to_dict()\n", | |
" # print(dict_sessionidsession)\n", | |
"\n", | |
" # Create dictonary for session id to race session id\n", | |
" dict_sessionidracsessionid = {}\n", | |
" for index, row in df_motosession.iterrows():\n", | |
" sessionid = row['sessionId']\n", | |
" sessionseason = row['sessionSeason']\n", | |
" sessioncountry = row['sessionCountry']\n", | |
"\n", | |
" df_temp1 = df_motosession[(df_motosession['sessionSeason'] == sessionseason) & \\\n", | |
" (df_motosession['sessionCountry'] == sessioncountry) & \\\n", | |
" (df_motosession['sessionSession'] == 'RAC2')]\n", | |
" \n", | |
" df_temp2 = df_motosession[(df_motosession['sessionSeason'] == sessionseason) & \\\n", | |
" (df_motosession['sessionCountry'] == sessioncountry) & \\\n", | |
" (df_motosession['sessionSession'] == 'RAC')]\n", | |
"\n", | |
" if len(df_temp1) > 0:\n", | |
" dict_sessionidracsessionid[sessionid] = df_temp1['sessionId'].values[0]\n", | |
" \n", | |
" elif len(df_temp2) > 0:\n", | |
" dict_sessionidracsessionid[sessionid] = df_temp2['sessionId'].values[0]\n", | |
" \n", | |
" else:\n", | |
" # print(sessionid, sessionseason, sessioncountry)\n", | |
" dict_sessionidracsessionid[sessionid] = np.nan\n", | |
"\n", | |
" # Copy qualifying result dataframe\n", | |
" df_temp = df_motoqresult.copy()\n", | |
"\n", | |
" # Add session type to qresults\n", | |
" df_temp['sessionId2'] = df_temp['sessionId']\n", | |
" df_temp['sessionId2'] = df_temp['sessionId2'].replace(dict_sessionidsession)\n", | |
" df_temp = df_temp.rename(columns={'sessionId2': 'sessionSession'})\n", | |
"\n", | |
" # Add race session id\n", | |
" df_temp['sessionId3'] = df_temp['sessionId']\n", | |
" df_temp['sessionId3'] = df_temp['sessionId3'].replace(dict_sessionidracsessionid)\n", | |
" df_temp = df_temp.rename(columns={'sessionId3': 'racsessionId'})\n", | |
"\n", | |
" # Drop records which are missing race session id\n", | |
" df_temp = df_temp.dropna(subset=['racsessionId'])\n", | |
"\n", | |
" # Crete new race session id + rider id index\n", | |
" df_temp['racsessionriderId'] = df_temp['racsessionId'].map(str) + '_' + df_temp['riderId'].map(str)\n", | |
" df_temp = df_temp.drop(['sessionId', 'riderId', 'racsessionId'], 1)\n", | |
"\n", | |
" # Pivot table on new index and to new fields as needed\n", | |
" df_temp = df_temp.pivot(index='racsessionriderId', columns='sessionSession')\n", | |
" df_temp.columns = [str(x) + str(y) for x, y in list(df_temp.columns)]\n", | |
" df_temp = df_temp.reset_index()\n", | |
" df_ids = df_temp.racsessionriderId.str.split('_').apply(pd.Series)\n", | |
" df_ids.columns = ['racsessionId', 'riderId']\n", | |
" df_temp = pd.concat([df_temp, df_ids], axis=1)\n", | |
"\n", | |
" # Drop race session id + rider id index\n", | |
" df_temp = df_temp.drop('racsessionriderId', 1)\n", | |
" df_temp = df_temp.rename(columns={'racsessionId': 'sessionId'})\n", | |
" df_temp[['sessionId', 'riderId']] = df_temp[['sessionId', 'riderId']].astype(float)\n", | |
"\n", | |
" # Merge race result, rider and session data\n", | |
" df_temp = pd.merge(df_temp, df_motorresult, on=['riderId', 'sessionId'], how='left')\n", | |
" df_temp = pd.merge(df_temp, df_motorider, on='riderId', how='left')\n", | |
" df_temp = pd.merge(df_temp, df_motosession, on='sessionId', how='left')\n", | |
"\n", | |
" df_index = df_temp[['sessionId', 'sessionSeason', 'sessionCountry', 'riderId', 'riderName']].copy()\n", | |
"\n", | |
" # Extract sessionId\n", | |
" list_sessionId = df_temp['sessionId']\n", | |
"\n", | |
" # Drop unnecessary and non-feature fields\n", | |
" df_temp = df_temp.drop(['riderId',\n", | |
" 'riderNumber',\n", | |
" 'rresultTotaltime',\n", | |
" 'rresultAvgspeed',\n", | |
" 'sessionId',\n", | |
" 'sessionSeason',\n", | |
" 'sessionClass',\n", | |
" 'sessionCountry',\n", | |
" 'sessionSession',\n", | |
" 'sessionDate'], 1)\n", | |
"\n", | |
" # Insert sessionId back as first column in features dataframe\n", | |
" df_temp.insert(0, 'sessionId', list_sessionId)\n", | |
"\n", | |
" # Convert race win result to 0/1\n", | |
" df_temp['rresultWin'] = df_temp['rresultPlace']\n", | |
" df_temp['rresultWin'][df_temp['rresultWin'] > 1] = 0\n", | |
" df_temp['rresultWin'] = df_temp['rresultWin'].fillna(0)\n", | |
" \n", | |
" # Convert race podium result to 0/1\n", | |
" df_temp['rresultPodium'] = df_temp['rresultPlace']\n", | |
" df_temp['rresultPodium'][df_temp['rresultPodium'] <= 3] = 1\n", | |
" df_temp['rresultPodium'][df_temp['rresultPodium'] > 3] = 0\n", | |
" df_temp['rresultPodium'] = df_temp['rresultPodium'].fillna(0)\n", | |
"\n", | |
" # Convert top six result to 0/1\n", | |
" df_temp['rresultTopsix'] = df_temp['rresultPlace']\n", | |
" df_temp['rresultTopsix'][df_temp['rresultTopsix'] <= 6] = 1\n", | |
" df_temp['rresultTopsix'][df_temp['rresultTopsix'] > 6] = 0\n", | |
" df_temp['rresultTopsix'] = df_temp['rresultTopsix'].fillna(0)\n", | |
" \n", | |
" # Convert race performance to 1/2/3\n", | |
" df_temp['rresultPerformance'] = df_temp['rresultPlace']\n", | |
" df_temp['rresultPerformance'][df_temp['rresultPerformance'] <= 3] = 1\n", | |
" df_temp['rresultPerformance'][(df_temp['rresultPerformance'] > 3) & (df_temp['rresultPerformance'] <= 6)] = 2\n", | |
" df_temp['rresultPerformance'][df_temp['rresultPerformance'] > 6] = 3\n", | |
" df_temp['rresultPerformance'] = df_temp['rresultPerformance'].fillna(3)\n", | |
" \n", | |
" # Extract label and features dataframe\n", | |
" df_motogplabel = df_temp[['rresultPlace', 'rresultWin', 'rresultPodium', 'rresultTopsix', 'rresultPerformance']]\n", | |
" df_motogpfeatures = df_temp.drop(['rresultPlace', 'rresultWin', 'rresultPodium', 'rresultTopsix', 'rresultPerformance'], 1)\n", | |
"\n", | |
" return df_motogplabel, df_motogpfeatures" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:98: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:103: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:104: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:109: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:110: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:115: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:116: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
"C:\\Users\\buswedg\\Anaconda3\\envs\\Python37\\lib\\site-packages\\ipykernel_launcher.py:117: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" | |
] | |
} | |
], | |
"source": [ | |
"df_motogplabel, df_motogpfeatures = generate_labelfeat(dict_motogpdata)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"71\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(df_motogpfeatures.columns))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def convertdatetime(dt):\n", | |
" \"\"\" \"\"\"\n", | |
"\n", | |
" import re\n", | |
" import numpy as np\n", | |
"\n", | |
" from datetime import datetime\n", | |
"\n", | |
" dt = str(dt)\n", | |
"\n", | |
" if dt == 'None':\n", | |
" return np.NaN\n", | |
"\n", | |
" else:\n", | |
" f = '\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}.\\d{6}'\n", | |
" r = re.compile(f)\n", | |
" if r.match(dt) is None:\n", | |
" dt = dt + '.000000'\n", | |
"\n", | |
" try:\n", | |
" f = '%Y-%m-%d %H:%M:%S.%f'\n", | |
" a = datetime.strptime(dt, f)\n", | |
" b = datetime(1900, 1, 1)\n", | |
" except:\n", | |
" return np.NaN\n", | |
"\n", | |
" return (a - b).total_seconds()\n", | |
"\n", | |
" \n", | |
"def preprocess_features(df_in):\n", | |
" \"\"\" \"\"\"\n", | |
"\n", | |
" import pandas as pd\n", | |
"\n", | |
" list_ignorecolumns = ['sessionId']\n", | |
"\n", | |
" list_timecolumns = ['qresultBesttimeFP',\n", | |
" 'qresultBesttimeFP1',\n", | |
" 'qresultBesttimeFP2',\n", | |
" 'qresultBesttimeFP3',\n", | |
" 'qresultBesttimeFP4',\n", | |
" 'qresultBesttimeQP',\n", | |
" 'qresultBesttimeQP1',\n", | |
" 'qresultBesttimeQP2',\n", | |
" 'qresultBesttimeQ1',\n", | |
" 'qresultBesttimeQ2',\n", | |
" 'qresultBesttimeWUP',\n", | |
" 'qresultBesttimeWUP2']\n", | |
"\n", | |
" df_out = pd.DataFrame(index=df_in.index)\n", | |
" \n", | |
" for col_name, col_values in df_in.iteritems():\n", | |
" if col_name in list_ignorecolumns:\n", | |
" col_values = col_values\n", | |
"\n", | |
" elif col_name in list_timecolumns:\n", | |
" col_values = col_values.astype(str)\n", | |
" col_values = col_values.apply(convertdatetime)\n", | |
"\n", | |
" elif col_values.dtype == object:\n", | |
" col_values = col_values.replace(['yes', 'no'], [1, 0])\n", | |
" col_values = pd.get_dummies(col_values, prefix=col_name)\n", | |
" df_out = df_out.join(col_values)\n", | |
"\n", | |
" df_out = df_out.fillna(0)\n", | |
"\n", | |
" return df_out" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_motogpallfeatures = preprocess_features(df_motogpfeatures)\n", | |
"\n", | |
"df_motogpallnoidfeatures = df_motogpallfeatures.drop('sessionId', axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4776\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(df_motogpallnoidfeatures))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create the final feature dataframe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"list_motogpkbestfeatures = ['sessionId',\n", | |
" 'qresultPlaceFP',\n", | |
" 'qresultPlaceFP1',\n", | |
" 'qresultPlaceFP2',\n", | |
" 'qresultPlaceFP3',\n", | |
" 'qresultPlaceFP4',\n", | |
" 'qresultPlaceQ1',\n", | |
" 'qresultPlaceQ2',\n", | |
" 'qresultPlaceQP',\n", | |
" 'qresultPlaceQP1',\n", | |
" 'qresultPlaceQP2',\n", | |
" 'qresultPlaceWUP',\n", | |
" 'qresultBestlapFP',\n", | |
" 'qresultBestlapFP1',\n", | |
" 'qresultBestlapFP2',\n", | |
" 'qresultBestlapFP3',\n", | |
" 'qresultBestlapFP4',\n", | |
" 'qresultBestlapQ1',\n", | |
" 'qresultBestlapQ2',\n", | |
" 'qresultBestlapQP',\n", | |
" 'qresultBestlapQP1',\n", | |
" 'qresultBestlapQP2',\n", | |
" 'qresultBestlapWUP',\n", | |
" 'qresultTopspeedFP',\n", | |
" 'qresultTopspeedFP1',\n", | |
" 'qresultTopspeedFP2',\n", | |
" 'qresultTopspeedFP3',\n", | |
" 'qresultTopspeedFP4',\n", | |
" 'qresultTopspeedQ1',\n", | |
" 'qresultTopspeedQ2',\n", | |
" 'qresultTopspeedQP',\n", | |
" 'qresultTopspeedQP1',\n", | |
" 'qresultTopspeedQP2',\n", | |
" 'qresultTopspeedWUP',\n", | |
" 'qresultTotallapFP',\n", | |
" 'qresultTotallapFP1',\n", | |
" 'qresultTotallapFP2',\n", | |
" 'qresultTotallapFP3',\n", | |
" 'qresultTotallapFP4',\n", | |
" 'qresultTotallapQ1',\n", | |
" 'qresultTotallapQ2',\n", | |
" 'qresultTotallapQP',\n", | |
" 'qresultTotallapQP1',\n", | |
" 'qresultTotallapQP2',\n", | |
" 'qresultTotallapWUP',\n", | |
" 'qresultBesttimeFP',\n", | |
" 'qresultBesttimeFP1',\n", | |
" 'qresultBesttimeFP2',\n", | |
" 'qresultBesttimeFP3',\n", | |
" 'qresultBesttimeFP4',\n", | |
" 'qresultBesttimeQ1',\n", | |
" 'qresultBesttimeQ2',\n", | |
" 'qresultBesttimeQP',\n", | |
" 'qresultBesttimeQP1',\n", | |
" 'qresultBesttimeQP2',\n", | |
" 'qresultBesttimeWUP']\n", | |
"\n", | |
"df_motogpkbestfeatures = df_motogpallfeatures[list_motogpkbestfeatures]\n", | |
"\n", | |
"df_motogpkbestnoidfeatures = df_motogpkbestfeatures.drop('sessionId', axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Apply standard and minmax scaling to features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.preprocessing import StandardScaler\n", | |
"from sklearn.preprocessing import MinMaxScaler\n", | |
"\n", | |
"scaler = StandardScaler()\n", | |
"\n", | |
"scaler.fit(df_motogpkbestnoidfeatures)\n", | |
"\n", | |
"df_motogpkbestnoidstdscaledfeatures = pd.DataFrame(scaler.transform(df_motogpkbestnoidfeatures), \n", | |
" columns=df_motogpkbestnoidfeatures.columns)\n", | |
"\n", | |
"\n", | |
"scaler = MinMaxScaler(feature_range=[0,100])\n", | |
"\n", | |
"scaler.fit(df_motogpkbestnoidfeatures)\n", | |
"\n", | |
"df_motogpkbestnoidmmscaledfeatures = pd.DataFrame(scaler.transform(df_motogpkbestnoidfeatures), \n", | |
" columns=df_motogpkbestnoidfeatures.columns)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Generate dimensionality reduced features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import itertools\n", | |
"\n", | |
"from sklearn.decomposition import PCA\n", | |
"from sklearn.decomposition import FastICA\n", | |
"from sklearn.random_projection import GaussianRandomProjection\n", | |
"\n", | |
"from sklearn.cluster import KMeans\n", | |
"\n", | |
"np.random.seed(0)\n", | |
"\n", | |
"ls_comp = [2, 3, 4]\n", | |
"ls_clust = [2, 3, 4]\n", | |
"\n", | |
"#df_motogpfeatures = df_motogpkbestnoidstdscaledfeatures\n", | |
"df_motogpfeatures = df_motogpkbestnoidmmscaledfeatures\n", | |
"\n", | |
"df_motogpclustfeatures = df_motogpkbestfeatures[['sessionId']].copy()\n", | |
"df_motogpkbestswclustfeatures = df_motogpkbestfeatures.copy()\n", | |
"\n", | |
"\n", | |
"for i, j in list(itertools.product(ls_comp, ls_clust)):\n", | |
" pca = PCA(n_components=i, whiten=True).fit(df_motogpfeatures)\n", | |
"\n", | |
" df_reduced_data = pd.DataFrame(pca.transform(df_motogpfeatures))\n", | |
"\n", | |
" clusterer = KMeans(n_clusters=j).fit(df_reduced_data)\n", | |
" cluster_labels = clusterer.labels_\n", | |
"\n", | |
" df_motogpclustfeatures['PCA_' + str(i) + '_' + str(j)] = cluster_labels\n", | |
" df_motogpkbestswclustfeatures['PCA_' + str(i) + '_' + str(j)] = cluster_labels\n", | |
" \n", | |
" \n", | |
"for i, j in list(itertools.product(ls_comp, ls_clust)):\n", | |
" ica = FastICA(n_components=i).fit(df_motogpfeatures)\n", | |
"\n", | |
" df_reduced_data = pd.DataFrame(pca.transform(df_motogpfeatures))\n", | |
"\n", | |
" clusterer = KMeans(n_clusters=j).fit(df_reduced_data)\n", | |
" cluster_labels = clusterer.labels_\n", | |
"\n", | |
" df_motogpclustfeatures['ICA_' + str(i) + '_' + str(j)] = cluster_labels\n", | |
" df_motogpkbestswclustfeatures['ICA_' + str(i) + '_' + str(j)] = cluster_labels\n", | |
"\n", | |
" \n", | |
"for i, j in list(itertools.product(ls_comp, ls_clust)):\n", | |
" rca = GaussianRandomProjection(n_components=i, random_state=10).fit(df_motogpfeatures)\n", | |
"\n", | |
" df_reduced_data = pd.DataFrame(rca.transform(df_motogpfeatures))\n", | |
"\n", | |
" clusterer = KMeans(n_clusters=j).fit(df_reduced_data)\n", | |
" cluster_labels = clusterer.labels_\n", | |
"\n", | |
" df_motogpclustfeatures['RP_' + str(i) + '_' + str(j)] = cluster_labels\n", | |
" df_motogpkbestswclustfeatures['RP_' + str(i) + '_' + str(j)] = cluster_labels" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create some helpers for our supervised learning routines" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def shuffle_split_data(y_true_all, X_all, test_size):\n", | |
" \"\"\" \"\"\"\n", | |
"\n", | |
" from sklearn.model_selection import train_test_split\n", | |
"\n", | |
" X_train, X_test, y_true_train, y_true_test = train_test_split(X_all, y_true_all, test_size=test_size)\n", | |
"\n", | |
" return X_train, y_true_train, X_test, y_true_test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def eval_clf(clf, X, y_true, metric):\n", | |
" \"\"\" \"\"\"\n", | |
"\n", | |
" y_pred = clf.predict_proba(X[:, 1:])\n", | |
" y_pred = convert_pred(X[:, 0], y_pred[:, 1])\n", | |
"\n", | |
" score = performance_metric(y_true, y_pred, metric)\n", | |
"\n", | |
" return score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def convert_pred(sessionId, y_pred_orig):\n", | |
" \"\"\" \"\"\"\n", | |
"\n", | |
" import pandas as pd\n", | |
"\n", | |
" df_temp = pd.DataFrame({'sessionId': sessionId, 'y_pred_orig': y_pred_orig})\n", | |
"\n", | |
" df_temp['y_pred_adj'] = 0\n", | |
"\n", | |
" for s in df_temp['sessionId'].unique():\n", | |
" max_prob = df_temp[df_temp['sessionId'] == s]['y_pred_orig'].max()\n", | |
" if max_prob >= 0.5:\n", | |
" df_temp.loc[(df_temp['sessionId'] == s) & (df_temp['y_pred_orig'] == max_prob), 'y_pred_adj'] = 1\n", | |
"\n", | |
" y_pred_adj = df_temp['y_pred_adj'].values\n", | |
"\n", | |
" return y_pred_adj" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def performance_metric(y_true, y_pred, metric):\n", | |
" \"\"\" \"\"\"\n", | |
"\n", | |
" from sklearn.metrics import accuracy_score\n", | |
" from sklearn.metrics import f1_score\n", | |
" from sklearn.metrics import recall_score\n", | |
" from sklearn.metrics import precision_score\n", | |
"\n", | |
" if metric == 'accuracy':\n", | |
" score = accuracy_score(y_true, y_pred)\n", | |
" elif metric == 'f1':\n", | |
" score = f1_score(y_true, y_pred)\n", | |
" elif metric == 'recall':\n", | |
" score = recall_score(y_true, y_pred)\n", | |
" elif metric == 'precision':\n", | |
" score = precision_score(y_true, y_pred)\n", | |
"\n", | |
" return score" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Perform gridsearch cross validation optimization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def build_clf_list(clf_select):\n", | |
" list_ref = []\n", | |
" list_clf = []\n", | |
" list_param = []\n", | |
"\n", | |
" if 1 in clf_select:\n", | |
" ref = 'mmscale'\n", | |
" clf = 'MinMaxScaler()'\n", | |
" dict_param = {}\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 2 in clf_select:\n", | |
" ref = 'stdscale'\n", | |
" clf = 'StandardScaler()'\n", | |
" dict_param = {}\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 3 in clf_select:\n", | |
" ref = 'skb'\n", | |
" clf = 'SelectKBest()'\n", | |
" dict_param = {'k': [2, 4, 6, 8, 10, 12, 14, 16, 'all']}\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 4 in clf_select:\n", | |
" ref = 'naive'\n", | |
" clf = 'GaussianNB()'\n", | |
" dict_param = {}\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 5 in clf_select:\n", | |
" # http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html\n", | |
" ref = 'dt'\n", | |
" clf = 'DecisionTreeClassifier()'\n", | |
" dict_param = {'criterion': ['gini', 'entropy'], # default='gini'\n", | |
" 'splitter': ['random', 'best'], # default='best'\n", | |
" 'max_depth': [1, 2, 3, 4, 5, 6, 7], # default=None\n", | |
" 'max_features': ['auto', None]} # default=None\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 6 in clf_select:\n", | |
" # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html\n", | |
" ref = 'dtb'\n", | |
" clf = 'AdaBoostClassifier(DecisionTreeClassifier())'\n", | |
" dict_param = {'base_estimator__max_depth': [1, 2, 3, 4, 5, 6, 7], # default=None\n", | |
" 'n_estimators': [10, 15, 20, 25, 30, 35, 40], # default=50\n", | |
" 'learning_rate': [0.001, 0.01, 0.1, 1.0]} # default=1.\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 7 in clf_select:\n", | |
" # http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html\n", | |
" ref = 'linsvc'\n", | |
" clf = 'SVC()'\n", | |
" # 'kernel': ['rbf', 'linear', 'poly'], # default='rbf'\n", | |
" dict_param = {'kernel': ['rbf'], # default='rbf'\n", | |
" 'C': [0.001, 0.01, 0.1, 1.0], # default=1.0\n", | |
" 'gamma': [0.0001, 0.001, 0.01, 0.1, 'auto'], # default='auto'\n", | |
" 'tol': [0.00001, 0.0001, 0.001], # default=1e-3\n", | |
" 'probability': [True]} # default=False\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 8 in clf_select:\n", | |
" # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html\n", | |
" ref = 'knn'\n", | |
" clf = 'KNeighborsClassifier()'\n", | |
" dict_param = {'n_neighbors': [2, 3, 4, 5, 6], # default = 5\n", | |
" 'leaf_size': [10, 20, 30, 40, 50], # default = 30\n", | |
" 'n_jobs': [-1]} # default = 1\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" if 9 in clf_select:\n", | |
" # http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html\n", | |
" ref = 'mlp'\n", | |
" clf = 'MLPClassifier()'\n", | |
" dict_param = {'solver': ['lbfgs', 'sgd', 'adam'], # default 'adam'\n", | |
" 'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1], # default 0.0001\n", | |
" 'tol': [0.000001, 0.00001, 0.0001, 0.001]} # default=1e-4\n", | |
" list_ref.append((ref))\n", | |
" list_clf.append((clf))\n", | |
" list_param.append((dict_param))\n", | |
"\n", | |
" return list_ref, list_clf, list_param" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def eval_clf_list(list_clf, X, y_true, metric):\n", | |
" list_clfscore = []\n", | |
"\n", | |
" for clf in list_clf:\n", | |
" score = eval_clf(clf, X, y_true, 'f1')\n", | |
" list_clfscore.append((score))\n", | |
"\n", | |
" return list_clfscore" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def build_pipe(ref, clf, dict_param):\n", | |
" from sklearn.preprocessing import MinMaxScaler\n", | |
" from sklearn.preprocessing import StandardScaler\n", | |
" from sklearn.feature_selection import SelectKBest\n", | |
" from sklearn.naive_bayes import GaussianNB\n", | |
" from sklearn.tree import DecisionTreeClassifier\n", | |
" from sklearn.ensemble import AdaBoostClassifier\n", | |
" from sklearn.svm import SVC, LinearSVC\n", | |
" from sklearn.neighbors import KNeighborsClassifier\n", | |
" from sklearn.neural_network import MLPClassifier\n", | |
"\n", | |
" list_piperef = []\n", | |
" dict_pipeparam = {}\n", | |
"\n", | |
" list_piperef.append((ref, eval(clf)))\n", | |
"\n", | |
" for key, value in dict_param.items():\n", | |
" dict_pipeparam[ref + \"__\" + key] = value\n", | |
"\n", | |
" return list_piperef, dict_pipeparam" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def build_pipe_list(list_ref, list_clf, list_param):\n", | |
" import itertools\n", | |
"\n", | |
" list_piperefs = []\n", | |
" dict_pipeparams = {}\n", | |
"\n", | |
" for ref, clf, dict_param in zip(list_ref, list_clf, list_param):\n", | |
" list_piperef, dict_pipeparam = build_pipe(ref, clf, dict_param)\n", | |
"\n", | |
" list_piperefs.append((list_piperef[0][0], list_piperef[0][1]))\n", | |
" dict_pipeparams.update(dict_pipeparam)\n", | |
"\n", | |
" return list_piperefs, dict_pipeparams" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def execute_pipe(clf_select, X_all, X_test, y_true_all, y_true_test):\n", | |
" \"\" \"\"\n", | |
"\n", | |
" import timeit\n", | |
"\n", | |
" from sklearn.pipeline import Pipeline\n", | |
" #from sklearn.model_selection import StratifiedShuffleSplit\n", | |
" from sklearn.model_selection import GridSearchCV\n", | |
"\n", | |
" start = timeit.default_timer()\n", | |
"\n", | |
" list_ref, list_clf, list_param = build_clf_list(clf_select)\n", | |
" list_piperefs, dict_pipeparams = build_pipe_list(list_ref, list_clf, list_param)\n", | |
"\n", | |
" pipe = Pipeline(list_piperefs)\n", | |
" #cv = StratifiedShuffleSplit(y_true_all, test_size=0.3)\n", | |
" \n", | |
" np.random.seed(0)\n", | |
"\n", | |
" grid_search = GridSearchCV(pipe, dict_pipeparams, n_jobs=1, scoring='f1')\n", | |
" \n", | |
" grid_search.fit(X_all[:, 1:], y_true_all)\n", | |
"\n", | |
" stop = timeit.default_timer()\n", | |
"\n", | |
" time = (stop - start) / 60\n", | |
"\n", | |
" clf_best = grid_search.best_estimator_\n", | |
" # print(clf_best)\n", | |
"\n", | |
" param_best = grid_search.best_params_\n", | |
" # print(param_best)\n", | |
"\n", | |
" score_best = grid_search.best_score_\n", | |
" # print(score_best)\n", | |
"\n", | |
" f1score = eval_clf(clf_best, X_test, y_true_test, 'f1')\n", | |
" recall = eval_clf(clf_best, X_test, y_true_test, 'recall')\n", | |
" precision = eval_clf(clf_best, X_test, y_true_test, 'precision')\n", | |
" # print(\"Recall:\", recall, \"Precision:\", precision, \"F1 Score:\", f1score)\n", | |
"\n", | |
" # list_results = [list_clf[0], param_best, f1score, recall, precision, time]\n", | |
" list_results = [list_clf, param_best, f1score, recall, precision, time]\n", | |
"\n", | |
" return list_results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Feature matrix; the commented-out line switches to the other feature frame\n", | |
"#X_all = df_motogpclustfeatures.values\n", | |
"X_all = df_motogpkbestswclustfeatures.values\n", | |
"\n", | |
"# Target labels; the commented-out line switches from race wins to podiums\n", | |
"y_true_all = df_motogplabel['rresultWin'].values\n", | |
"#y_true_all = df_motogplabel['rresultPodium'].values\n", | |
"\n", | |
"# NOTE(review): 0.25 is presumably the held-out test fraction - confirm against shuffle_split_data\n", | |
"X_train, y_true_train, X_test, y_true_test = shuffle_split_data(y_true_all, X_all, 0.25)\n", | |
"\n", | |
"clf_select = [[5], [6], [7], [8], [9]]\n", | |
"\n", | |
"# Tune on the training split only. Passing X_all / y_true_all here would fit the\n", | |
"# grid search on the very rows in X_test that the scores are then computed on.\n", | |
"list_motogpresults = [\n", | |
"    execute_pipe(c, X_train, X_test, y_true_train, y_true_test)\n", | |
"    for c in clf_select\n", | |
"]\n", | |
"\n", | |
"# Build the results table in one go instead of growing it row by row\n", | |
"df_motogpresults = pd.DataFrame(\n", | |
"    list_motogpresults,\n", | |
"    columns=['clf', 'param', 'f1', 'recall', 'precision', 'time']\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>clf</th>\n", | |
" <th>param</th>\n", | |
" <th>f1</th>\n", | |
" <th>recall</th>\n", | |
" <th>precision</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>[DecisionTreeClassifier()]</td>\n", | |
" <td>{'dt__criterion': 'gini', 'dt__max_depth': 6, ...</td>\n", | |
" <td>0.131148</td>\n", | |
" <td>0.071429</td>\n", | |
" <td>0.800000</td>\n", | |
" <td>0.161972</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>[AdaBoostClassifier(DecisionTreeClassifier())]</td>\n", | |
" <td>{'dtb__base_estimator__max_depth': 2, 'dtb__le...</td>\n", | |
" <td>0.574468</td>\n", | |
" <td>0.482143</td>\n", | |
" <td>0.710526</td>\n", | |
" <td>26.477024</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>[SVC()]</td>\n", | |
" <td>{'linsvc__C': 0.001, 'linsvc__gamma': 0.0001, ...</td>\n", | |
" <td>0.035088</td>\n", | |
" <td>0.017857</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>54.810338</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>[KNeighborsClassifier()]</td>\n", | |
" <td>{'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn...</td>\n", | |
" <td>0.131148</td>\n", | |
" <td>0.071429</td>\n", | |
" <td>0.800000</td>\n", | |
" <td>0.558008</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>[MLPClassifier()]</td>\n", | |
" <td>{'mlp__alpha': 0.01, 'mlp__solver': 'adam', 'm...</td>\n", | |
" <td>0.294737</td>\n", | |
" <td>0.250000</td>\n", | |
" <td>0.358974</td>\n", | |
" <td>8.248912</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" clf \\\n", | |
"0 [DecisionTreeClassifier()] \n", | |
"1 [AdaBoostClassifier(DecisionTreeClassifier())] \n", | |
"2 [SVC()] \n", | |
"3 [KNeighborsClassifier()] \n", | |
"4 [MLPClassifier()] \n", | |
"\n", | |
" param f1 recall \\\n", | |
"0 {'dt__criterion': 'gini', 'dt__max_depth': 6, ... 0.131148 0.071429 \n", | |
"1 {'dtb__base_estimator__max_depth': 2, 'dtb__le... 0.574468 0.482143 \n", | |
"2 {'linsvc__C': 0.001, 'linsvc__gamma': 0.0001, ... 0.035088 0.017857 \n", | |
"3 {'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn... 0.131148 0.071429 \n", | |
"4 {'mlp__alpha': 0.01, 'mlp__solver': 'adam', 'm... 0.294737 0.250000 \n", | |
"\n", | |
" precision time \n", | |
"0 0.800000 0.161972 \n", | |
"1 0.710526 26.477024 \n", | |
"2 1.000000 54.810338 \n", | |
"3 0.800000 0.558008 \n", | |
"4 0.358974 8.248912 " | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_motogpresults" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Feature matrix; the commented-out line switches to the other feature frame\n", | |
"#X_all = df_motogpclustfeatures.values\n", | |
"X_all = df_motogpkbestswclustfeatures.values\n", | |
"\n", | |
"# Target labels; the commented-out line switches from podiums to race wins\n", | |
"#y_true_all = df_motogplabel['rresultWin'].values\n", | |
"y_true_all = df_motogplabel['rresultPodium'].values\n", | |
"\n", | |
"# NOTE(review): 0.25 is presumably the held-out test fraction - confirm against shuffle_split_data\n", | |
"X_train, y_true_train, X_test, y_true_test = shuffle_split_data(y_true_all, X_all, 0.25)\n", | |
"\n", | |
"clf_select = [[5], [6], [7], [8], [9]]\n", | |
"\n", | |
"# Tune on the training split only. Passing X_all / y_true_all here would fit the\n", | |
"# grid search on the very rows in X_test that the scores are then computed on.\n", | |
"list_motogpresults = [\n", | |
"    execute_pipe(c, X_train, X_test, y_true_train, y_true_test)\n", | |
"    for c in clf_select\n", | |
"]\n", | |
"\n", | |
"# Build the results table in one go instead of growing it row by row\n", | |
"df_motogpresults = pd.DataFrame(\n", | |
"    list_motogpresults,\n", | |
"    columns=['clf', 'param', 'f1', 'recall', 'precision', 'time']\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>clf</th>\n", | |
" <th>param</th>\n", | |
" <th>f1</th>\n", | |
" <th>recall</th>\n", | |
" <th>precision</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>[DecisionTreeClassifier()]</td>\n", | |
" <td>{'dt__criterion': 'gini', 'dt__max_depth': 7, ...</td>\n", | |
" <td>0.619217</td>\n", | |
" <td>0.491525</td>\n", | |
" <td>0.836538</td>\n", | |
" <td>0.152150</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>[AdaBoostClassifier(DecisionTreeClassifier())]</td>\n", | |
" <td>{'dtb__base_estimator__max_depth': 1, 'dtb__le...</td>\n", | |
" <td>0.552147</td>\n", | |
" <td>0.508475</td>\n", | |
" <td>0.604027</td>\n", | |
" <td>27.210125</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>[SVC()]</td>\n", | |
" <td>{'linsvc__C': 1.0, 'linsvc__gamma': 0.001, 'li...</td>\n", | |
" <td>0.508197</td>\n", | |
" <td>0.350282</td>\n", | |
" <td>0.925373</td>\n", | |
" <td>91.842384</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>[KNeighborsClassifier()]</td>\n", | |
" <td>{'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn...</td>\n", | |
" <td>0.686275</td>\n", | |
" <td>0.593220</td>\n", | |
" <td>0.813953</td>\n", | |
" <td>0.254748</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>[MLPClassifier()]</td>\n", | |
" <td>{'mlp__alpha': 1e-05, 'mlp__solver': 'adam', '...</td>\n", | |
" <td>0.539792</td>\n", | |
" <td>0.440678</td>\n", | |
" <td>0.696429</td>\n", | |
" <td>3.805431</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" clf \\\n", | |
"0 [DecisionTreeClassifier()] \n", | |
"1 [AdaBoostClassifier(DecisionTreeClassifier())] \n", | |
"2 [SVC()] \n", | |
"3 [KNeighborsClassifier()] \n", | |
"4 [MLPClassifier()] \n", | |
"\n", | |
" param f1 recall \\\n", | |
"0 {'dt__criterion': 'gini', 'dt__max_depth': 7, ... 0.619217 0.491525 \n", | |
"1 {'dtb__base_estimator__max_depth': 1, 'dtb__le... 0.552147 0.508475 \n", | |
"2 {'linsvc__C': 1.0, 'linsvc__gamma': 0.001, 'li... 0.508197 0.350282 \n", | |
"3 {'knn__leaf_size': 10, 'knn__n_jobs': -1, 'knn... 0.686275 0.593220 \n", | |
"4 {'mlp__alpha': 1e-05, 'mlp__solver': 'adam', '... 0.539792 0.440678 \n", | |
"\n", | |
" precision time \n", | |
"0 0.836538 0.152150 \n", | |
"1 0.604027 27.210125 \n", | |
"2 0.925373 91.842384 \n", | |
"3 0.813953 0.254748 \n", | |
"4 0.696429 3.805431 " | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_motogpresults" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
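"Finish off by benchmarking against a naive baseline: predict that the winner of the previous race also wins the next one, and score that rule." | |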
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def benchmark(dict_motodata):\n", | |
"    \"\"\"Score the naive baseline 'the previous race winner wins again'.\n", | |
"\n", | |
"    Sessions are taken in the order their ids first appear in the race\n", | |
"    results. For every session after the first, the rider placed first in\n", | |
"    the preceding session is the prediction. A correct prediction is a true\n", | |
"    positive; a wrong one counts as both a false positive (the predicted\n", | |
"    rider did not win) and a false negative (the actual winner was missed).\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    dict_motodata : dict\n", | |
"        Must hold, under the key 'rresult', a DataFrame with the columns\n", | |
"        'sessionId', 'riderId' and 'rresultPlace'.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    tuple of float\n", | |
"        (recall, precision, f1score). All three are 0.0 when there are fewer\n", | |
"        than two sessions, and f1score is 0.0 when no prediction was correct,\n", | |
"        rather than raising ZeroDivisionError.\n", | |
"\n", | |
"    Raises\n", | |
"    ------\n", | |
"    IndexError\n", | |
"        If a session has no row with rresultPlace == 1.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    df_motorresult = dict_motodata['rresult']\n", | |
"\n", | |
"    list_motorresultsessid = df_motorresult['sessionId'].unique()\n", | |
"\n", | |
"    if len(list_motorresultsessid) < 2:\n", | |
"        # No 'previous' session exists, so there is nothing to predict from\n", | |
"        return 0.0, 0.0, 0.0\n", | |
"\n", | |
"    # Filter the winning rows once instead of re-scanning the whole frame\n", | |
"    # twice per session inside the loop\n", | |
"    df_motowinner = df_motorresult[df_motorresult['rresultPlace'] == 1]\n", | |
"\n", | |
"    # First-listed winner of every session, in session order\n", | |
"    list_winner = [\n", | |
"        df_motowinner[df_motowinner['sessionId'] == sessid]['riderId'].iloc[0]\n", | |
"        for sessid in list_motorresultsessid\n", | |
"    ]\n", | |
"\n", | |
"    # Pair each session's winner (the prediction) with the next session's winner\n", | |
"    tp = sum(1 for pred, win in zip(list_winner, list_winner[1:]) if win == pred)\n", | |
"\n", | |
"    # Every miss is one wrong prediction and one missed winner\n", | |
"    fp = fn = (len(list_winner) - 1) - tp\n", | |
"\n", | |
"    recall = float(tp) / (tp + fn)\n", | |
"    precision = float(tp) / (tp + fp)\n", | |
"\n", | |
"    if tp == 0:\n", | |
"        # precision + recall == 0 would divide by zero in the F1 formula\n", | |
"        return recall, precision, 0.0\n", | |
"\n", | |
"    f1score = 2 * (precision * recall) / (precision + recall)\n", | |
"\n", | |
"    return recall, precision, f1score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"F1 Score: 0.3146551724137931\n" | |
] | |
} | |
], | |
"source": [ | |
"# Score the previous-winner baseline and report its F1 for comparison\n", | |
"benchmark_scores = benchmark(dict_motogpdata)\n", | |
"recall, precision, f1score = benchmark_scores\n", | |
"\n", | |
"print(\"F1 Score:\", f1score)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment