Created
May 3, 2018 01:49
-
-
Save tuf22191/1df97a9ed707065591d881c8665e259b to your computer and use it in GitHub Desktop.
K Nearest Neighbors Using Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Hello World! Hello AI!\n" | |
] | |
} | |
], | |
"source": [ | |
"# we import the libraries we need \n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt \n", | |
"\n", | |
"\n", | |
"print(\"Hello World! Hello AI!\")\n", | |
"#a helper function to stack a datapoint (1 row of a pandas dataframe) several times and add 2 Pandas dataframes\n", | |
"def stackDataPoint(numberOfStacks, datapoint_df):\n", | |
" stacks = [datapoint_df for x in range(0, numberOfStacks)]\n", | |
" result = pd.concat(stacks, ignore_index=True)\n", | |
" return result \n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 134, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJztnX+MZtV53z/PzO6AB1zBvmxcCuws\nRMiNU0UYphTq1FLtuDarqjiqmxJN8Aql2nogUmylVeyO1OJKK0VRfshWYty1TbzxTG0Tx5VRs62L\nqKskkoM11OsFQhCLzS4LG2BNAWMsG3ZP/7j3Zd959/445/6+9/1+pKv3nfPeH+fcu/s95z7Pc55j\nzjmEEEIMl7m2KyCEEKJeJPRCCDFwJPRCCDFwJPRCCDFwJPRCCDFwJPRCCDFwJPRCCDFwJPRCCDFw\nJPRCCDFwtrVdAYBLLrnE7d69u+1qCCFEr3jwwQdPOed25u3XCaHfvXs3m5ubbVdDCCF6hZkd89lP\nphshhBg4EnohhBg4EnohhBg4EnohhBg4EnohhBg4EnohhPBgYwN274a5uehzY6PtGvkjoRdigPRZ\nlLrIxgbs2wfHjoFz0ee+ff25rxJ6IQZG30Wpi6ytwauvbi179dWovA9I6IUYGH0XpS5y/HhYedeQ\n0AsxMPouSl1k166w8q4hoRdiYPRdlLrI/v2wuLi1bHExKu8DEnohBkbfRSmNNh3MKytw4AAsLYFZ\n9HngQFTeByT0QgyMLopSWZHugoN5ZQWefBLOnIk++yLyAOaca7sOLC8vO2WvFGKYjEV60kG8uBjW\n+ezeHYn7NEtLkejOKmb2oHNuOW8/jeiFELVSRRSQHMzlkNALIWqlCpFu0sE8xMlmEnohGmCI4uFL\nFSJdxMFc5J53wRdQC8651rfrrrvOCTFU1tedW1x0LpKOaFtcjMq7wvq6c0tLzplFn1XWrar2h9Sx\n6DWXlrYeM96WlsLq2hTApvPQWDljhaiZrjsSq3CW+lxjbS0y1+zaFY3E64xaKXrP5+YiaZ/GLIq2\n6RqVOWPN7Hwz+5aZfcfMHjGzj8flnzez75nZ4Xi7Ji43M/ukmR01syNmdm355gjRX5pwJJYxDdWV\nMmGyTmtrkbg3FZpY9J4PdrJZ3pAfMODC+Pt24AHgBuDzwAcS9t8D/I/4uBuAB/KuIdONGDJ1mwPK\nmkbMkutn1l6dylL0nrdd71DwNN3kjujj870S/7k93rLsPTcDfxwf91fARWZ2aXAPJMRAqHumatkR\neR2j2LYTqxW9512cbFYFXlE3ZjZvZoeB54D7nHMPxD/tj80zv29m58VllwFPTRx+Ii4TYiapWzzK\nmobq6Ijajnsvc8/7PAM2DS+hd86dds5dA1wOXG9m/wD4GPD3gX8I7AB+M97dkk4xXWBm+8xs08w2\nn3/++UKVF6Iv1CkeZUfkdXREXbB113XP+xgqGxRH75x7Efg/wPuccydj88yPgT8Cro93OwFcMXHY\n5cAzCec64Jxbds4t79y5s1DlhRDVjMirFsUhJ1brY5y9T9TNTjO7KP7+JuAXgL8Z293NzID3Aw/H\nh9wLfDCOvrkBeMk5d7KW2gshOmlX7mKdqqBt30NRcuPozezngIPAPFHHcI9z7j+b2f8GdhKZag4D\nH3LOvRIL/x8A7wNeBW5zzmUGySuOXgjRNj6x/l2Ls/eNo9+Wt4Nz7gjw9oTyd6Xs74A7fCophBBF\nqHoC1vSksbFJBraed9eu5IlYXY+zV64bIUSvqMNO7muS6avvQUIvhOgVddjJfcNB03wP0O1IHAm9\nEANnOhzw9tvbF6UyIYp1xOiHhINORyhBDyJxfKbP1r0pBYIQ9ZA0pT8p1cE4PUATU/3LphmoI6VE\nmTq1mfGSqlIgCCH6S5KZY5pxFMmxY3DbbXDJJfWO9suaXuqwk5cJB217FrAPSlMsxIBJCwf0pep0\nxVl1CglRbDrtcRZtpqHWmrFCzBhJdu+yYX91TAYqmh6hqrTHVacw6EUkjo99p+5NNnohwplccWk0\ncm779nNtzKur+Tb6vK1MuuK0eofaw6tcpaqONMR1rtCVBZ42+tZF3knohQjGx8k66WCdFKHV1bMO\nxLRc9HU7FUOFsSqHZ9+WCszDV+hloxeih6TZhafJs3tP2rp37IAf/AB+8pOzv9dhoy9CGbv+ZBvT\n5K6rSwXmIRu9EAPGN6Ijz+49GRN+6hTcfXdzichCbOVl7PqTMe6h5x8KEnoheoiPMBVxCCZNBsoT\n4yLOzY2NKJRzcpLRbbelH1vE4bmxAXv35oeXds5xWgc+9p26N9nohQhjfd25hYWtduZt2yKnbJ7d\n29c+7uO4LOrcHI2SbeWjUXabfe36vhPFijhO23K8JoGcsUIMl/X1c6Nstm/PF50QYfZxXObtkyaK\nWQJcBWn1Kut87dri4RJ6IQZM0eiRkOPSInImwy2z9skSRR+hLzNyzosmWl31P1fR+9cEvkIvG70Q\nPSTNGXvsWLadPC1SJyTH+mR51j5pqQ727oULLkg+bjSKPsumIs7zYRw65HeeafqQ7iAJCb0QPSRL\nyLJEcX4++Zikch8HaNY+aZ3K6dPw2mvnXnNhAT7xieh7HflwJhkLc6gjuQuLnhfCZ9hf9ybTjRBh\n+Dgbk8wJobZxH/NJ2j7z89nXG43Sz+1jNvK5R2l1GF+vrRm6VYFs9EJ0k6qiNsbnyYoqmb5elvBV\nTZbI54l2VbbwLGEueg1F3UjohcikjhFhlmD5jPx9rl9E3MpEvlR5n9LqnuWwreL8TSChF6KDVBm1\nMTkqnRatcUKztBH8/Ly/MIWIbl6itbo7lxDSnsU4YsinTklJ45o05UjoheggoTbyNJLEd3KlqLys\nlSG2bt/OKalOCwtnJ0eNO522zR1j1tfTR/VJHW/WPW/CFJaEr9ArqZkQDbJtWxR1Ms38PLz+uv95\n8ha7yEt6FrIohm9CsTYX4CiKWXr5dJIz30RyacfXgZKaCdFBkkQ+qzyNvHjurLju0NwuviGFfYwx\nX1pKLk9qc0g7uhZuKaEXokHShCWtPI088U37fX4+PCOlb0KxPsaYhyRLS2vH9FtBJ5Ok+dh36t5k\noxezQlMrJYXkwikTK19H24pQxnFbJsnb2Ebflv+BqpyxwPnAt4DvAI8AH4/LrwQeAB4HvgwsxOXn\nxX8fjX/fnXcNCb2YJaqOo086T1J2y4WFc68VKs55da+ibaHnaLKDyYt0atrJXKXQG3Bh/H17LN43\nAPcAt8TlnwZW4++3A5+Ov98CfDnvGhJ6IarFN1ImJNyzCUEtco02Eo11JbmZr9AHRd2Y2SLwl8Aq\n8GfA33XOvW5mNwJ3Oufea2Zfj79/08y2AX8L7HQZF1LUjRDVkhcpM15eLy2KJCTqpMqomiLXKLPM\nYFHauGYSlUbdmNm8mR0GngPuA54AXnTOjQPCTgCXxd8vA54CiH9/CRiFVV+I2SArqVaRlZvGZDlG\nJzNDpjE3d+71moiqKXKNNpzAvXM8+wz7xxtwEfAN4J8ARyfKrwAeir8/Alw+8dsTwCjhXPuATWBz\n165ddb7dCNFJsswUZc0kRXK8TG/bt29dsSptVShfc4WP7T3UlFTUXl7Wl9CV5GbUNTMW+E/AvwdO\nAdvishuBr8ffvw7cGH/fFu9nWeeUjV4MCV8RyRK1kNmoWQ7Z0BwvecI/7eD1FTdfYSyz3+TM4CIZ\nKFdXw53AbSc3q0zogZ3ARfH3NwF/Afxz4E/Y6oy9Pf5+B1udsffkXUNCL4ZCyEgvKxWvT5reoqNK\n3xF90paVWjhL+IqM1Ksa+U+fN61tWW8FXRD1JKoU+p8Dvg0cAR4G/mNcfhVR2OXRWPTPi8vPj/8+\nGv9+Vd41JPRiKIQIUNkRfZk0u745W9LOPy14eZ1OFfnlJwk9n08Wz7S2dsVMk0Rtpps6Ngm9GAoh\nAlTWRl9GPH2yMIaMevM6nbxUylWlQE7r5Iq+xYzrVKRDbQIJvRAtECoKRWzsRa+Vx/p69qpQRWz7\nk4ufpNnFi4yWfWYGT9670A5s8l5W/TZSJRJ6IVqgitf8MlPyy5gU1tfDhTxvmxyxw7mpAsp0Vmn3\nKdQ0lZdXXiN6Cb0Q51A270qVKQnKXHda1ELNHz4j9jpGy2n1zAvBDOk4qrDRV/HsJPRC9JC2Ro9Z\nIp7lN0gTVN8Rex3tzRu915n4LOR8VXQeEnohekhb9uAscUwa9WYJqW971teTJ2HVuYbudDvaCpes\nqoOT0AtRgraEoI0RfZYTNsuJ7DMiTWvPaJT8djA3F5l7yrYnz1Eb4sit49lX1aFL6IUoSJtx001f\nO8scU0UagbT2pKVTqKq9RR3ATd1/jeiFaJm2oyyafJtIa+v8fHXXTWpPXqhmFfc6TbTTrtlkzHzT\nNnotDi7EFF1JQdsEaYtjQ/I9qIq8hbaruNchi3lDlAr5+PHmnv04VfTx41HWy/37w5Z4jOqlxcGF\nKETvUtCWYH4+rLwqktZqnaSKex2SPnm8zuuOHfXVZ5qVlSjH/pkz0WeoyIcgoRdiipAFo/vO6dNh\n5VWxshItUj5KWKmiqnudJ87z89FIfWkpqgvAyy+fu9/CQv+fvYReiCnGIrS0tFUI6hxxtcXSUlh5\nlayswKlTsL5ez73Oe2s4c2braHptDV577dz93vzm/j972eiFmGHGq029+urZssXF4XRsGxuwd2/y\nG8r08oR99M3IRi+EyGXoby8rK3DwoJ8pbsi+GQm9EDNOk07BqvFZVzerM5s8/pVXYPv2rccOxTcj\n040QopeUNTslHb+wENnkX3iheMhjk/iabiT0QohekhYnP217r+v4LiAbvRBi0KTFyfvGz5c9vk9I\n6IUQleBjL6+Sss7TITtfp5HQCyG2UESwx/buY8eiEMVjx6K/6xT7PXvCyqeZpYlxEnohxBsUFey1\nta1OTYj+Xlurr66HDoWVTzP00NJJ5IwVQrxBUQdlG5ON+jjBqWrkjBVCBFPUQdmGvXuWbOxlkdAL\nId4gTzzT7Pdt2LtnycZeFgm9EOINssQzy37fhr17lmzsZZGNXgixhekFMfbsiRycaYt4lJlgVMXi\nG7NMZTZ6M7vCzL5hZo+a2SNm9utx+Z1m9rSZHY63PRPHfMzMjprZY2b23nJNEUI0yWTum/37o6Rg\nWSs1FZ1g1EZI5qziY7p5HfgN59zPADcAd5jZ2+Lfft85d028HQKIf7sF+FngfcCnzKzm9WqEaJ6m\nJwi1QVLY5DRFnZ9thGQWpe/PelveDs65k8DJ+PsPzOxR4LKMQ24GvuSc+zHwPTM7ClwPfLOC+grR\nCaYTYo1HozAs00PeaL2M8zPtLaFrKQiG8KyDnLFmtht4O/BAXPRrZnbEzO42s4vjssuApyYOO0F2\nxyBE7+jTaLQMWaP1Ms7PjY30hcm7Fh45hGftLfRmdiHwp8CHnXMvA3cBPw1cQzTi/93xrgmHn+Px\nNbN9ZrZpZpvPP/98cMVFQfr+DtoRZiUhVloUzvp6udz1a2vpk526Fh45hGftJfRmtp1I5Decc18F\ncM4965w77Zw7A3yGyDwD0Qj+ionDLweemT6nc+6Ac27ZObe8c+fOMm0Qvsj7VRmzMlmnrhDGNJF0\nrnvmkCE8a5+oGwM+BzzqnPu9ifJLJ3b7ReDh+Pu9wC1mdp6ZXQlcDXyruiqLwgzhHbQjzNJknTpW\noEoTySYWJQ9lCM/aZ0T/DuBW4F1ToZS/bWYPmdkR4J8CHwFwzj0C3AP8NfA/gTuccwlL84rGGcI7\naEfQZJ1y9Ek8h/CsNWFqlhjCkjpiMGiyVHmU1EycS5+GUWLw9HlR8r4hoe8rRaJnhvAOKoQIJnfC\nlOggZWZwrKxI2IWYMTSi7yOKnhFCBCCh7yOKnhFCBCCh7yNDmMEhhGgMCX0fKRs9ozQIQswUEvo+\nUiZ6RmkQhJg5NGFq1tCkKSEGgyZMiWTkyBVi5pDQ951Qe7scuULMHBL6PlPE3q40CELMHBL6PlNk\n4pTSIAgxc8gZ22fm5tKX6Tlzpvn6CCEaRc7YWUD2diGEBxL6PiN7uxDCAwl9n2nb3q4ZtkL0AqUp\n7jttpR0ukypZCNEoGtGLYihVshC9QUIviqEZtkL0Bgm9KIYifoToDRJ6UQxF/AjRGyT0ohhVRPwo\nakeIRlDUjShOmYgfRe0I0Rga0Yt2UNSOEI0hoRftoKgdIRojV+jN7Aoz+4aZPWpmj5jZr8flO8zs\nPjN7PP68OC43M/ukmR01syNmdm3djRA9RFE7QjSGz4j+deA3nHM/A9wA3GFmbwM+CtzvnLsauD/+\nG+Am4Op42wfcVXmtZ4UhOyuLRu0M+Z4IURfOuaAN+BrwHuAx4NK47FLgsfj7fwF+eWL/N/ZL2667\n7jonplhfd25x0bkoEXG0LS5G5UNhfd25pSXnzKLPvLbNwj0RIgBg03nodpCN3sx2A28HHgDe4pw7\nGXcWJ4Gfine7DHhq4rATcdlsUNWIcxaclSsr0YLkZ85En3nRNrNwT4SoAe/wSjO7EPhT4MPOuZfN\nLHXXhLJzVscws31Eph12DcUuW2XIoJyV56J7IkQhvEb0ZradSOQ3nHNfjYufNbNL498vBZ6Ly08A\nV0wcfjnwzPQ5nXMHnHPLzrnlnTt3Fq1/t6hyxDk0Z2UVbzpDuydCNIRP1I0BnwMedc793sRP9wJ7\n4+97iWz34/IPxtE3NwAvjU08g6fKEWedKQaadmgWWcQ8CaVdEKIYeUZ84OeJTC9HgMPxtgcYEUXb\nPB5/7oj3N+APgSeAh4DlvGsMxhm7tLTVUTjelpaKnS/UWel7zqYdmmn3ZX4+/Lp13JM2riFEBeDp\njA2OuqljG4zQNymiRcWo6s7IB7PkazbRyayuRh3KuGNZXc3eX5E9okdI6NuiqRFnUTFKE12z6us5\nJq1zyepkqriPq6vJ18sS+zY6QiEKIqEfMmXEqAohqyL+PauTqWpUPR7JJ5mM0mijIxSiIL5Cr1w3\nfaSM07esQ7OIY3Wc0nh+Pvn36aiZqqKXTp8OK0+qS165ED1AQt9HyohR2TzyRUR4YyP6/fTp6JqT\nTHYy42igY8eSzxMavZTWsaSVgyJ7xDDxGfbXvcl0E0ibDsNQ00ZSXcfnmDT7rK87t317unlnfFxI\nG4vY6Md1yTJNKSpHdARkox84bYlNqI3fd//RKFvkJzu01VX/todG3eShqBzRIST0s0STou87Qh/j\n+wbgI/LT12tDaBWVIzqEr9DLRt93qpp1mnbu6Rm0kzZ+iGzuzkXfk65dh3NzfL0xTSY2U74d0UMk\n9H2nroyOWR3IOOvk0lK+6Po6N0ejcvVtSmgVlSN6iIS+79Q1wvTpQHyu7Rvl84lPwMJCfr3SsqY2\nJbSKyhE9RELfd+oaYfqIuO+1ffLOr6zA3Xdv7RDW16NtsuxDH6pXaPMSvpUNTxWiDXwM+XVvcsYG\nMO14XV2tJwrEx+nYVgRKXc5nRdSInoGibgZImhCFhBumnXf6eJ9rjUbOXXjh2d9Ho36LoiJqRM/w\nFXqZbvpEmt380KF000ieKSLN6Qrnmij27oWDB8/u+/3vwyuvnD3Xj35UfZuboOoZuUJ0DZ/eoO5N\nI3pPqpiVOm2KCBnF5mWh9Bn9dm1WaV7CNY3oRYdBI/qOUmZ1p1DHa1WRM3kj3rTjpqkz5r8oSfdo\nEkXUiCHg0xvUvc3MiL6ssy/0eJ83gLwRvc+I13f06/v2UHbUH3J81qIoXXjjECID5IztIE3ngq8i\ncsbHXOPbafl0PE13hnLAih4joe8iTS9q4St6WZ1H1oh3NIo235G3j6iGCG9SvUOFO+kebd8e1i4h\nWkJC30XaGD2WNYNUWWefjse3Mww1KWV1ppP3aDRybmEhv3MUogNI6LtIHyfkVF3nvI7Ht2MJMSmF\ndEwy5YgeIaHvKk2HF1aRj71InYu207djyTIphfoOfM6rNWNFB5HQi+IrLJWlCodqXifhM6Iv0plq\nRC96hK/QW7RvuywvL7vNzc22qzE8tm1LXgh7fh5ef72+66bF3C8tRTN3q2BjA269NZLhJIpeaxzr\nPx1bPxrBL/1SNAv5+PFo7sL+/UpmJlrFzB50zi3n7acJU12jzISqaZJEPqu8KppYnGNlJcpkmcTC\nQvFJTuPslNP58b//fbjrrm5N9hLCEwl9l6h65uj8vH95Vgfjky9n8vcdO5KvW3XO+E99KkpjPCnK\no1GU7rjISHvcjltvhRdfzN+/yZWthChDnm0HuBt4Dnh4ouxO4GngcLztmfjtY8BR4DHgvT72I9no\nY6q2D/va6LNs6nn29rQ49L6FKIaGa8pJKzoAVTljgXcC1yYI/b9L2PdtwHeA84ArgSeA+bxrlBL6\nriXJKkNexEdoW9fXnbvggrPnmZtLdsRmOTbHETtpnU/asaNRv55LaLimnLSiA/gKfa7pxjn358AL\nni8INwNfcs792Dn3vXhkf73nseF0MUlWGbKSloW2dbz/D394tuz88+Ed7zh33yzbeZo9f3xM2rEv\nvBA5Q7/whejvW28t73OokyL+AyU8E33BpzcAdnPuiP5J4AiRaefiuPwPgF+Z2O9zwAdSzrkP2AQ2\nd+3aVaw7G1ooXJaZJK2tZsmj5qrTD4eO6Mf1qWKyVRNvbWntmJ/fuppXn95SxOChyjj6BKF/CzBP\n5MzdD9wdl/9hgtD/y7zzFzbdDHFyS5qo+UwQmhTRkHsTap+etN+PRum/V5XELa+zqKIj6OOsZTHz\n1Cr0ab8ROWI/NvHb14Eb885fWOiHNqKfZFpE5+b8RNhnpJ12Pd/zpzlpYetyglV0xEXSKBcV6CH5\ne8RMUPeI/tKJ7x8hsssD/CxbnbHfpU5n7FBHYevrUeRKqDllUkRD7k3W6HvajJGXvrhoJso08jqL\nIXf2QuRQmdADXwROAq8BJ4BfBb4APBTb6O+dEv41omibx4CbfCqhqJspikaATAucz70parZpIrd8\n1r0Yt7PI8opD+/ciZpZKR/R1b4qjnyIkYVfZt5mijtg8J+xk2l+f3O5pAlx0YZS0/PVDfAMUM4uE\nvs9kie9kfHrowh9JZI2I8xYdSTrm3e8OF1OfSVlpo/AQ8ZaZRwwMCX2fSbPRLyxUP/rMEr+skM60\nDiDttywxLSvAvuaYIs5hmXpEh5HQ953pqJvJaJaqrxOS/qCoWSlLTJsKk61imcGmTD3qYIQHEnrh\nT55pZPK3IiJf94g+pJ19WDhcvgThiYS+LYY+EvNd8KMpG30oIedqa0KefAnCEwl9G8zCSMwnHLNI\nqoCQqJs6TVmTtCW4Q5zxLWpBQt8GszISW19Pz2o5GhU7X1rHkPUGUXcn2lbHPSv/jkRpJPRtMEsj\nsVARLBonn+f8bcJe3rQpbhbeDEUlSOjbYBZGYnmToZKEsUhWzrycPWmd6FB8JENph6gVCX0bDH0k\n5uM0Tfo9aXLVWMx9FlvJ8glMp3wY8v0XYgoJfVsMeSRWdvSdJOY+b0F56ZB96yfEwPAVei0OXjUr\nK9HKSmfORJ9FFqnuKmmrMOWtNpXGrl3RCk2Li1vLp1duWlmBU6eihcCXlsAs+jxwYOv9zaufEDOK\nhF74k7XUYdbvF16YXL5nTyTUBw5kC/iYvE40r35CzCgSeuFP3ug77ffzzks+36FD0WdVb0E+bwdC\nzCASeuFP3ug77fcXUtaWr9qkEvJ2IMQMIaEfOhsbsHs3zM1Fnxsb5fbNG30n/d6kSWXIPhIhCiKh\nb5sQIS5y7n374NixKP7k2LHo76RrhOwbikwqQrSKRRE67bK8vOw2NzfbrkbzjMX11VfPli0uVmdu\n2L07Euxplpai0W7RfYuwsQFra5G5Zhxto9G2EKUwswedc8u5+0noW6RucZ2bi0bn05hFpo2i+woh\nOoGv0Mt00yZ1x32H2MYVmijEYJHQt0nd4hpiG5cdPZw6/StCVIiEvk3qFtfQyUhNhCYORRzrdF4L\nUTGy0bfNLDkp63Y+N0nd/hUhPJCNvi+0Effd1qh6bW2ryEP099paM9evEuXVET1CQj9rtGlySBoB\nZ5V3mTr9K0Mxb4nOIKGfNdocVc/Ph5V3mbr8K7L9ixrIFXozu9vMnjOzhyfKdpjZfWb2ePx5cVxu\nZvZJMztqZkfM7No6Ky8K0KbJ4fTpsPIuU5fzekjmLdEZfEb0nwfeN1X2UeB+59zVwP3x3wA3AVfH\n2z7grmqqKSqjzXj5paWw8mm6ZtKow78i27+ogVyhd879OTCdfvBm4GD8/SDw/onyP44XP/kr4CIz\nu7SqyooKaDNevsy1fU0aXesMQtHENVEHPstQAbuBhyf+fnHq9/8Xf/534Ocnyu8HllPOuQ/YBDZ3\n7dpVzzpbIpk2lzssem3fJQebWDO2zvundW9FAFS5ZmyA0P9ZgtBfl3f+Qa0ZK+ohbxFx55pZM7YJ\nIR7yusOiUnyFvmjUzbNjk0z8+VxcfgK4YmK/y4FnCl5DiLP4mDSasG834SxVTn1RMUWF/l5gb/x9\nL/C1ifIPxtE3NwAvOedOlqyjEH72/Sbs23KWih7iE175ReCbwFvN7ISZ/SrwW8B7zOxx4D3x3wCH\ngO8CR4HPALfXUmsxe/iEM5Z1NPs4cuUsFX3Ex75T9yYbfQ/pqh25aL18be9ylooOgaeNXknNRDhD\nSk42JiRJ2SwlohOdRitMifoYYuZGrbAleoiyV4r6GKJDUrZ3MWAk9CKcIYqiVtgSA0ZCL8IZoig2\ntcKWEC2wre0KiB4yFr+hOSRXVvrfBiESkNCLYkgUhegNMt0IIcTAkdALIcTAkdALIcTAkdALIcTA\nkdALIcTAkdALIcTA6USuGzN7HkhInsIlwKmGq9MEald/GGKbQO3qG2ntWnLO7cw7uBNCn4aZbfok\n7Okbald/GGKbQO3qG2XbJdONEEIMHAm9EEIMnK4L/YG2K1ATald/GGKbQO3qG6Xa1WkbvRBCiPJ0\nfUQvhBCiJJ0SejN70sweMrPDZrYZl+0ws/vM7PH48+K26xlCSpvuNLOn47LDZran7XqGYmYXmdlX\nzOxvzOxRM7ux788KUtvV6+dlZm+dqPthM3vZzD7c5+eV0aZePysAM/uImT1iZg+b2RfN7Hwzu9LM\nHoif1ZfNbCHonF0y3ZjZk8BbhU0ZAAADBklEQVSyc+7URNlvAy84537LzD4KXOyc+8226hhKSpvu\nBF5xzv1OW/Uqi5kdBP7COffZ+B/dIvAf6PGzgtR2fZieP68xZjYPPA38I+AOev684Jw23UaPn5WZ\nXQb8JfA259yPzOwe4BCwB/iqc+5LZvZp4DvOubt8z9upEX0KNwMH4+8Hgfe3WBcBmNnfAd4JfA7A\nOfcT59yL9PxZZbRrSLwbeMI5d4yeP68JJts0BLYBbzKzbUQDjZPAu4CvxL8HP6uuCb0D/peZPWhm\n++KytzjnTgLEnz/VWu2KkdQmgF8zsyNmdnefXpljrgKeB/7IzL5tZp81swvo/7NKaxf0+3lNcgvw\nxfh735/XmMk2QY+flXPuaeB3gONEAv8S8CDwonPu9Xi3E8BlIeftmtC/wzl3LXATcIeZvbPtClVA\nUpvuAn4auIboYf5ui/UrwjbgWuAu59zbgR8CH223SpWQ1q6+Py8AYlPUvwD+pO26VEVCm3r9rOKO\n6WbgSuDvARcQacc0QTb3Tgm9c+6Z+PM54L8B1wPPmtmlAPHnc+3VMJykNjnnnnXOnXbOnQE+Q9TO\nPnECOOGceyD++ytEAtnrZ0VKuwbwvMbcBPxf59yz8d99f14w1aYBPKtfAL7nnHveOfca8FXgHwMX\nxaYcgMuBZ0JO2hmhN7MLzOzN4+/APwMeBu4F9sa77QW+1k4Nw0lr0/g/V8wvErWzNzjn/hZ4ysze\nGhe9G/hrevysIL1dfX9eE/wyW00cvX5eMVvaNIBndRy4wcwWzcw4+3/rG8AH4n2Cn1Vnom7M7Cqi\nES9Er9D/1Tm338xGwD3ALqKb8K+ccy+0VM0gMtr0BaJXSwc8Cfzbsa20L5jZNcBngQXgu0TRDnP0\n9FmNSWnXJ+n/81oEngKucs69FJf19v8WpLZpCP+3Pg78a+B14NvAvyGyyX8J2BGX/Ypz7sfe5+yK\n0AshhKiHzphuhBBC1IOEXgghBo6EXgghBo6EXgghBo6EXgghBo6EXgghBo6EXgghBo6EXgghBs7/\nBx/lbDO3NLCfAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0xb8015f8>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"hello\n" | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
"#import the data : please go to \n", | |
"\n", | |
"#let's first create some fake data first, and then we will use real data\n", | |
"#let's make a 400 by 80 grid (y-axis goes from 0 to 80 and x-axis goes from 0 to 400)\n", | |
"#the y-axis will be height inches, x-axis weight in lbs\n", | |
"#points with high y and x values will be colored blue for male\n", | |
"#points with low y and x values will be colored red for female\n", | |
"# (this is just an example and not intended to make fun or imply anything)\n", | |
"\n", | |
"# rigging the data: women's heights are between 56 to 65 inches , mens are between 66 to 79\n", | |
"# women's weight is between 100 to 220 and men's weight is between 225 -390 lbs (again just an example)\n", | |
"\n", | |
"#to generate random samples, loop through and generate points\n", | |
"male_heights = []\n", | |
"male_weights = []\n", | |
"female_heights = []\n", | |
"female_weights = []\n", | |
"for x in range(0, 100): #we can use some other numpy functions, but let's excercise our for loop programming skills\n", | |
" height_value_man= np.random.normal( loc=(66.0+79.0)/2, scale= 3.0)\n", | |
" weight_value_man= np.random.normal( loc=(225.0+390.0)/2, scale= 30.0)\n", | |
" male_heights.append(height_value_man)\n", | |
" male_weights.append(weight_value_man)\n", | |
" \n", | |
" height_value_woman= np.random.normal(loc=(56.0+65.0)/2, scale= 3.0)\n", | |
" weight_value_woman= np.random.normal(loc=(100.0+220.0)/2, scale= 30.0)\n", | |
" female_heights.append(height_value_woman)\n", | |
" female_weights.append(weight_value_woman)\n", | |
"\n", | |
"#let's plot these two\n", | |
"\n", | |
"plt.scatter(male_heights, male_weights, color='blue')\n", | |
"plt.scatter(female_heights, female_weights, color='red')\n", | |
"plt.show()\n", | |
"\n", | |
"\n", | |
"#create the dataframe for training\n", | |
"male_data = pd.DataFrame({'height': male_heights, 'weight':male_weights, 'class': ['male']*len(male_heights) })#.reset_index()\n", | |
"female_data = pd.DataFrame({'height': female_heights, 'weight':female_weights, 'class': ['female']*len(female_heights) })#.reset_index()\n", | |
"#vertically stack and add the data\n", | |
"training_data = pd.concat([male_data, female_data], ignore_index=True) #https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html\n", | |
"\n", | |
"print(\"hello\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 136, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"\n", | |
"#implementation of KNN , we will do a practical approach by seeing if we can \n", | |
"#vectorize operations <- it turns out that looping through data can cause delays \n", | |
"#as opposed to vectorizing and working on vectors (an article on this later)\n", | |
"\n", | |
"#to implement simple KNN when we get a new data ponit \n", | |
"\n", | |
"#recall that given a new data point in KNN, we need to calculate the distance between the datapoint and all the other data points\n", | |
"# we recall that performing operations on Pandas columns or operations can be very fast because of vectorization. \n", | |
"\n", | |
"#given a new data point which is dataframe format \n", | |
"def train_knn(datapoint_df, training_data):\n", | |
" \n", | |
" # make copies of datapoint so we can subtract from every row in the training_data\n", | |
" df_for_calculations = stackDataPoint(len(training_data), datapoint_df)\n", | |
" #get the training data\n", | |
" training_data_cols= training_data.columns.tolist()\n", | |
" training_data_cols.remove('class')\n", | |
" training_data_wOut_class = training_data[training_data_cols]\n", | |
" \n", | |
" #we will use the simple distance format where dist = sqrt[(x2-x1)^2 + (y2-y1)^2 + (z2-z1)^2 ... ]\n", | |
" #the dataframe is really a numpy matrix, so we can do all sorts of vectorized operations \n", | |
" #first we take difference of the featuers\n", | |
" diff = training_data_wOut_class - df_for_calculations\n", | |
" #second we square the differences of the features\n", | |
" squared = np.power(diff, 2)\n", | |
" #third we add every row's numbers together \n", | |
" added = squared.sum(axis = 1) #https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sum.html\n", | |
" #finally, take sqrt\n", | |
" sqrted = np.sqrt(added)\n", | |
" #add the \"distance\" column to the dataframe\n", | |
" trained_data = training_data.copy()\n", | |
" trained_data['distance'] = sqrted \n", | |
" \n", | |
" return trained_data\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 137, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" class height weight distance\n", | |
"0 male 77.255374 249.634200 56.625164\n", | |
"1 male 73.283820 248.539915 53.835486\n", | |
"2 male 73.216207 324.160223 126.312126\n", | |
"3 male 74.756932 295.854500 98.999954\n", | |
"4 male 71.514246 291.309446 93.809796\n" | |
] | |
} | |
], | |
"source": [ | |
"#running the algorithm\n", | |
"datapoint_df = pd.DataFrame({'height': 50.0, 'weight':200.0}, index=[0])\n", | |
"trained_data = train_knn(data_point, training_data)\n", | |
"\n", | |
"#let's look at the trained data\n", | |
"print(trained_data.head())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 138, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>class</th>\n", | |
" <th>height</th>\n", | |
" <th>weight</th>\n", | |
" <th>distance</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>119</th>\n", | |
" <td>female</td>\n", | |
" <td>54.348085</td>\n", | |
" <td>196.194932</td>\n", | |
" <td>5.777923</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>142</th>\n", | |
" <td>female</td>\n", | |
" <td>56.087892</td>\n", | |
" <td>199.635750</td>\n", | |
" <td>6.098779</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>106</th>\n", | |
" <td>female</td>\n", | |
" <td>58.196020</td>\n", | |
" <td>191.492102</td>\n", | |
" <td>11.813512</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>145</th>\n", | |
" <td>female</td>\n", | |
" <td>60.845477</td>\n", | |
" <td>193.795467</td>\n", | |
" <td>12.494824</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>182</th>\n", | |
" <td>female</td>\n", | |
" <td>62.568468</td>\n", | |
" <td>198.674215</td>\n", | |
" <td>12.638200</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" class height weight distance\n", | |
"119 female 54.348085 196.194932 5.777923\n", | |
"142 female 56.087892 199.635750 6.098779\n", | |
"106 female 58.196020 191.492102 11.813512\n", | |
"145 female 60.845477 193.795467 12.494824\n", | |
"182 female 62.568468 198.674215 12.638200" | |
] | |
}, | |
"execution_count": 138, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#looking above, given the point we calculated all the distances \n", | |
"#now if we want to take the k nearest neighbor, we use pandas to sort on the distance value\n", | |
"#trained_data.sort(columns=['distance'], ascending=[0] )\n", | |
"trained_data = trained_data.sort_values(by=['distance'], ascending=[1]) #ascending = [0] sorts on descending order\n", | |
"trained_data.head() # the values appear to be sorted " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 139, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>class</th>\n", | |
" <th>height</th>\n", | |
" <th>weight</th>\n", | |
" <th>distance</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>119</th>\n", | |
" <td>female</td>\n", | |
" <td>54.348085</td>\n", | |
" <td>196.194932</td>\n", | |
" <td>5.777923</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>142</th>\n", | |
" <td>female</td>\n", | |
" <td>56.087892</td>\n", | |
" <td>199.635750</td>\n", | |
" <td>6.098779</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>106</th>\n", | |
" <td>female</td>\n", | |
" <td>58.196020</td>\n", | |
" <td>191.492102</td>\n", | |
" <td>11.813512</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>145</th>\n", | |
" <td>female</td>\n", | |
" <td>60.845477</td>\n", | |
" <td>193.795467</td>\n", | |
" <td>12.494824</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>182</th>\n", | |
" <td>female</td>\n", | |
" <td>62.568468</td>\n", | |
" <td>198.674215</td>\n", | |
" <td>12.638200</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>173</th>\n", | |
" <td>female</td>\n", | |
" <td>56.726977</td>\n", | |
" <td>188.705401</td>\n", | |
" <td>13.146109</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>114</th>\n", | |
" <td>female</td>\n", | |
" <td>61.346559</td>\n", | |
" <td>206.961975</td>\n", | |
" <td>13.312156</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>117</th>\n", | |
" <td>female</td>\n", | |
" <td>61.085608</td>\n", | |
" <td>190.020106</td>\n", | |
" <td>14.916064</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>107</th>\n", | |
" <td>female</td>\n", | |
" <td>61.954408</td>\n", | |
" <td>187.830090</td>\n", | |
" <td>17.059149</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>199</th>\n", | |
" <td>female</td>\n", | |
" <td>63.641643</td>\n", | |
" <td>189.556645</td>\n", | |
" <td>17.180165</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" class height weight distance\n", | |
"119 female 54.348085 196.194932 5.777923\n", | |
"142 female 56.087892 199.635750 6.098779\n", | |
"106 female 58.196020 191.492102 11.813512\n", | |
"145 female 60.845477 193.795467 12.494824\n", | |
"182 female 62.568468 198.674215 12.638200\n", | |
"173 female 56.726977 188.705401 13.146109\n", | |
"114 female 61.346559 206.961975 13.312156\n", | |
"117 female 61.085608 190.020106 14.916064\n", | |
"107 female 61.954408 187.830090 17.059149\n", | |
"199 female 63.641643 189.556645 17.180165" | |
] | |
}, | |
"execution_count": 139, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# to make a guess we decide on a K value -> such as 10, and then we use the .head() function to return the top 10 rows \n", | |
"#with the shortest distances\n", | |
"k_val = 10\n", | |
"voters = trained_data.head(k_val)\n", | |
"voters # voters are the value" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 140, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter({'female': 10})\n", | |
"female\n" | |
] | |
} | |
], | |
"source": [ | |
"#now we just count the number of occurances of the classes, let's use a class called Counter\n", | |
"from collections import Counter\n", | |
"\n", | |
"votes = Counter(voters['class'])\n", | |
"print(votes) # to print the counts of the data\n", | |
"print(votes.most_common(1)[0][0]) # to get the most common count value \n", | |
"#and viola we have classified that our data point was a female! \n", | |
"\n", | |
"#let's write a quick function that encapsulates all of this: \n", | |
"def classifyGivenTrainedData(trained_data, k_val):\n", | |
" trained_data = trained_data.sort_values(by=['distance'], ascending=[1]) #ascending = [0] sorts on descending order\n", | |
" voters = trained_data.head(k_val)\n", | |
" votes = Counter(voters['class'])\n", | |
" return votes.most_common(1)[0][0], votes # to get the most common count value \n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 141, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"female\n", | |
"Counter({'female': 19, 'male': 11})\n" | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
"\n", | |
"\n", | |
"#running the algorithm\n", | |
"datapoint_df = pd.DataFrame({'height': 60.0, 'weight':230.0}, index=[0])\n", | |
"trained_data = train_knn(datapoint_df, training_data)\n", | |
"classification, votes = classifyGivenTrainedData(trained_data, k_val = 30)\n", | |
"print(classification)\n", | |
"print(votes)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.14" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment