Created
December 28, 2020 22:25
-
-
Save yifeihuang/adc7be2931f35b14570627e938ace523 to your computer and use it in GitHub Desktop.
Explain GBT churn model with SHAP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import matplotlib.pyplot as plt\n", | |
"import numpy as np\n", | |
"import shap\n", | |
"\n", | |
"from sklearn.metrics import balanced_accuracy_score, accuracy_score\n", | |
"from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\n", | |
"from sklearn.model_selection import GridSearchCV, train_test_split\n", | |
"from sklearn.preprocessing import StandardScaler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>state</th>\n", | |
" <th>account_length</th>\n", | |
" <th>area_code</th>\n", | |
" <th>international_plan</th>\n", | |
" <th>voice_mail_plan</th>\n", | |
" <th>number_vmail_messages</th>\n", | |
" <th>total_day_minutes</th>\n", | |
" <th>total_day_calls</th>\n", | |
" <th>total_day_charge</th>\n", | |
" <th>total_eve_minutes</th>\n", | |
" <th>total_eve_calls</th>\n", | |
" <th>total_eve_charge</th>\n", | |
" <th>total_night_minutes</th>\n", | |
" <th>total_night_calls</th>\n", | |
" <th>total_night_charge</th>\n", | |
" <th>total_intl_minutes</th>\n", | |
" <th>total_intl_calls</th>\n", | |
" <th>total_intl_charge</th>\n", | |
" <th>number_customer_service_calls</th>\n", | |
" <th>churn</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>OH</td>\n", | |
" <td>107</td>\n", | |
" <td>area_code_415</td>\n", | |
" <td>no</td>\n", | |
" <td>yes</td>\n", | |
" <td>26</td>\n", | |
" <td>161.6</td>\n", | |
" <td>123</td>\n", | |
" <td>27.47</td>\n", | |
" <td>195.5</td>\n", | |
" <td>103</td>\n", | |
" <td>16.62</td>\n", | |
" <td>254.4</td>\n", | |
" <td>103</td>\n", | |
" <td>11.45</td>\n", | |
" <td>13.7</td>\n", | |
" <td>3</td>\n", | |
" <td>3.70</td>\n", | |
" <td>1</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>NJ</td>\n", | |
" <td>137</td>\n", | |
" <td>area_code_415</td>\n", | |
" <td>no</td>\n", | |
" <td>no</td>\n", | |
" <td>0</td>\n", | |
" <td>243.4</td>\n", | |
" <td>114</td>\n", | |
" <td>41.38</td>\n", | |
" <td>121.2</td>\n", | |
" <td>110</td>\n", | |
" <td>10.30</td>\n", | |
" <td>162.6</td>\n", | |
" <td>104</td>\n", | |
" <td>7.32</td>\n", | |
" <td>12.2</td>\n", | |
" <td>5</td>\n", | |
" <td>3.29</td>\n", | |
" <td>0</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>OH</td>\n", | |
" <td>84</td>\n", | |
" <td>area_code_408</td>\n", | |
" <td>yes</td>\n", | |
" <td>no</td>\n", | |
" <td>0</td>\n", | |
" <td>299.4</td>\n", | |
" <td>71</td>\n", | |
" <td>50.90</td>\n", | |
" <td>61.9</td>\n", | |
" <td>88</td>\n", | |
" <td>5.26</td>\n", | |
" <td>196.9</td>\n", | |
" <td>89</td>\n", | |
" <td>8.86</td>\n", | |
" <td>6.6</td>\n", | |
" <td>7</td>\n", | |
" <td>1.78</td>\n", | |
" <td>2</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>OK</td>\n", | |
" <td>75</td>\n", | |
" <td>area_code_415</td>\n", | |
" <td>yes</td>\n", | |
" <td>no</td>\n", | |
" <td>0</td>\n", | |
" <td>166.7</td>\n", | |
" <td>113</td>\n", | |
" <td>28.34</td>\n", | |
" <td>148.3</td>\n", | |
" <td>122</td>\n", | |
" <td>12.61</td>\n", | |
" <td>186.9</td>\n", | |
" <td>121</td>\n", | |
" <td>8.41</td>\n", | |
" <td>10.1</td>\n", | |
" <td>3</td>\n", | |
" <td>2.73</td>\n", | |
" <td>3</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>MA</td>\n", | |
" <td>121</td>\n", | |
" <td>area_code_510</td>\n", | |
" <td>no</td>\n", | |
" <td>yes</td>\n", | |
" <td>24</td>\n", | |
" <td>218.2</td>\n", | |
" <td>88</td>\n", | |
" <td>37.09</td>\n", | |
" <td>348.5</td>\n", | |
" <td>108</td>\n", | |
" <td>29.62</td>\n", | |
" <td>212.6</td>\n", | |
" <td>118</td>\n", | |
" <td>9.57</td>\n", | |
" <td>7.5</td>\n", | |
" <td>7</td>\n", | |
" <td>2.03</td>\n", | |
" <td>3</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4245</th>\n", | |
" <td>MT</td>\n", | |
" <td>83</td>\n", | |
" <td>area_code_415</td>\n", | |
" <td>no</td>\n", | |
" <td>no</td>\n", | |
" <td>0</td>\n", | |
" <td>188.3</td>\n", | |
" <td>70</td>\n", | |
" <td>32.01</td>\n", | |
" <td>243.8</td>\n", | |
" <td>88</td>\n", | |
" <td>20.72</td>\n", | |
" <td>213.7</td>\n", | |
" <td>79</td>\n", | |
" <td>9.62</td>\n", | |
" <td>10.3</td>\n", | |
" <td>6</td>\n", | |
" <td>2.78</td>\n", | |
" <td>0</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4246</th>\n", | |
" <td>WV</td>\n", | |
" <td>73</td>\n", | |
" <td>area_code_408</td>\n", | |
" <td>no</td>\n", | |
" <td>no</td>\n", | |
" <td>0</td>\n", | |
" <td>177.9</td>\n", | |
" <td>89</td>\n", | |
" <td>30.24</td>\n", | |
" <td>131.2</td>\n", | |
" <td>82</td>\n", | |
" <td>11.15</td>\n", | |
" <td>186.2</td>\n", | |
" <td>89</td>\n", | |
" <td>8.38</td>\n", | |
" <td>11.5</td>\n", | |
" <td>6</td>\n", | |
" <td>3.11</td>\n", | |
" <td>3</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4247</th>\n", | |
" <td>NC</td>\n", | |
" <td>75</td>\n", | |
" <td>area_code_408</td>\n", | |
" <td>no</td>\n", | |
" <td>no</td>\n", | |
" <td>0</td>\n", | |
" <td>170.7</td>\n", | |
" <td>101</td>\n", | |
" <td>29.02</td>\n", | |
" <td>193.1</td>\n", | |
" <td>126</td>\n", | |
" <td>16.41</td>\n", | |
" <td>129.1</td>\n", | |
" <td>104</td>\n", | |
" <td>5.81</td>\n", | |
" <td>6.9</td>\n", | |
" <td>7</td>\n", | |
" <td>1.86</td>\n", | |
" <td>1</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4248</th>\n", | |
" <td>HI</td>\n", | |
" <td>50</td>\n", | |
" <td>area_code_408</td>\n", | |
" <td>no</td>\n", | |
" <td>yes</td>\n", | |
" <td>40</td>\n", | |
" <td>235.7</td>\n", | |
" <td>127</td>\n", | |
" <td>40.07</td>\n", | |
" <td>223.0</td>\n", | |
" <td>126</td>\n", | |
" <td>18.96</td>\n", | |
" <td>297.5</td>\n", | |
" <td>116</td>\n", | |
" <td>13.39</td>\n", | |
" <td>9.9</td>\n", | |
" <td>5</td>\n", | |
" <td>2.67</td>\n", | |
" <td>2</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4249</th>\n", | |
" <td>VT</td>\n", | |
" <td>86</td>\n", | |
" <td>area_code_415</td>\n", | |
" <td>no</td>\n", | |
" <td>yes</td>\n", | |
" <td>34</td>\n", | |
" <td>129.4</td>\n", | |
" <td>102</td>\n", | |
" <td>22.00</td>\n", | |
" <td>267.1</td>\n", | |
" <td>104</td>\n", | |
" <td>22.70</td>\n", | |
" <td>154.8</td>\n", | |
" <td>100</td>\n", | |
" <td>6.97</td>\n", | |
" <td>9.3</td>\n", | |
" <td>16</td>\n", | |
" <td>2.51</td>\n", | |
" <td>0</td>\n", | |
" <td>no</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>4250 rows × 20 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" state account_length area_code international_plan voice_mail_plan \\\n", | |
"0 OH 107 area_code_415 no yes \n", | |
"1 NJ 137 area_code_415 no no \n", | |
"2 OH 84 area_code_408 yes no \n", | |
"3 OK 75 area_code_415 yes no \n", | |
"4 MA 121 area_code_510 no yes \n", | |
"... ... ... ... ... ... \n", | |
"4245 MT 83 area_code_415 no no \n", | |
"4246 WV 73 area_code_408 no no \n", | |
"4247 NC 75 area_code_408 no no \n", | |
"4248 HI 50 area_code_408 no yes \n", | |
"4249 VT 86 area_code_415 no yes \n", | |
"\n", | |
" number_vmail_messages total_day_minutes total_day_calls \\\n", | |
"0 26 161.6 123 \n", | |
"1 0 243.4 114 \n", | |
"2 0 299.4 71 \n", | |
"3 0 166.7 113 \n", | |
"4 24 218.2 88 \n", | |
"... ... ... ... \n", | |
"4245 0 188.3 70 \n", | |
"4246 0 177.9 89 \n", | |
"4247 0 170.7 101 \n", | |
"4248 40 235.7 127 \n", | |
"4249 34 129.4 102 \n", | |
"\n", | |
" total_day_charge total_eve_minutes total_eve_calls total_eve_charge \\\n", | |
"0 27.47 195.5 103 16.62 \n", | |
"1 41.38 121.2 110 10.30 \n", | |
"2 50.90 61.9 88 5.26 \n", | |
"3 28.34 148.3 122 12.61 \n", | |
"4 37.09 348.5 108 29.62 \n", | |
"... ... ... ... ... \n", | |
"4245 32.01 243.8 88 20.72 \n", | |
"4246 30.24 131.2 82 11.15 \n", | |
"4247 29.02 193.1 126 16.41 \n", | |
"4248 40.07 223.0 126 18.96 \n", | |
"4249 22.00 267.1 104 22.70 \n", | |
"\n", | |
" total_night_minutes total_night_calls total_night_charge \\\n", | |
"0 254.4 103 11.45 \n", | |
"1 162.6 104 7.32 \n", | |
"2 196.9 89 8.86 \n", | |
"3 186.9 121 8.41 \n", | |
"4 212.6 118 9.57 \n", | |
"... ... ... ... \n", | |
"4245 213.7 79 9.62 \n", | |
"4246 186.2 89 8.38 \n", | |
"4247 129.1 104 5.81 \n", | |
"4248 297.5 116 13.39 \n", | |
"4249 154.8 100 6.97 \n", | |
"\n", | |
" total_intl_minutes total_intl_calls total_intl_charge \\\n", | |
"0 13.7 3 3.70 \n", | |
"1 12.2 5 3.29 \n", | |
"2 6.6 7 1.78 \n", | |
"3 10.1 3 2.73 \n", | |
"4 7.5 7 2.03 \n", | |
"... ... ... ... \n", | |
"4245 10.3 6 2.78 \n", | |
"4246 11.5 6 3.11 \n", | |
"4247 6.9 7 1.86 \n", | |
"4248 9.9 5 2.67 \n", | |
"4249 9.3 16 2.51 \n", | |
"\n", | |
" number_customer_service_calls churn \n", | |
"0 1 no \n", | |
"1 0 no \n", | |
"2 2 no \n", | |
"3 3 no \n", | |
"4 3 no \n", | |
"... ... ... \n", | |
"4245 0 no \n", | |
"4246 3 no \n", | |
"4247 1 no \n", | |
"4248 2 no \n", | |
"4249 0 no \n", | |
"\n", | |
"[4250 rows x 20 columns]" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_csv('./data/train.csv')\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>account_length</th>\n", | |
" <th>international_plan</th>\n", | |
" <th>voice_mail_plan</th>\n", | |
" <th>number_vmail_messages</th>\n", | |
" <th>total_day_minutes</th>\n", | |
" <th>total_day_calls</th>\n", | |
" <th>total_day_charge</th>\n", | |
" <th>total_eve_minutes</th>\n", | |
" <th>total_eve_calls</th>\n", | |
" <th>total_eve_charge</th>\n", | |
" <th>...</th>\n", | |
" <th>UT</th>\n", | |
" <th>VA</th>\n", | |
" <th>VT</th>\n", | |
" <th>WA</th>\n", | |
" <th>WI</th>\n", | |
" <th>WV</th>\n", | |
" <th>WY</th>\n", | |
" <th>area_code_408</th>\n", | |
" <th>area_code_415</th>\n", | |
" <th>area_code_510</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.170399</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1.366857</td>\n", | |
" <td>-0.345510</td>\n", | |
" <td>1.163449</td>\n", | |
" <td>-0.345788</td>\n", | |
" <td>-0.093025</td>\n", | |
" <td>0.141841</td>\n", | |
" <td>-0.092493</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.926186</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>-0.567911</td>\n", | |
" <td>1.169136</td>\n", | |
" <td>0.710014</td>\n", | |
" <td>1.169295</td>\n", | |
" <td>-1.571820</td>\n", | |
" <td>0.493490</td>\n", | |
" <td>-1.572341</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>-0.409038</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>-0.567911</td>\n", | |
" <td>2.206058</td>\n", | |
" <td>-1.456398</td>\n", | |
" <td>2.206218</td>\n", | |
" <td>-2.752070</td>\n", | |
" <td>-0.611691</td>\n", | |
" <td>-2.752473</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>-0.635774</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>-0.567911</td>\n", | |
" <td>-0.251076</td>\n", | |
" <td>0.659633</td>\n", | |
" <td>-0.251027</td>\n", | |
" <td>-1.032448</td>\n", | |
" <td>1.096316</td>\n", | |
" <td>-1.031447</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0.523099</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1.218029</td>\n", | |
" <td>0.702522</td>\n", | |
" <td>-0.599910</td>\n", | |
" <td>0.702027</td>\n", | |
" <td>2.952139</td>\n", | |
" <td>0.393019</td>\n", | |
" <td>2.951497</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4245</th>\n", | |
" <td>-0.434231</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>-0.567911</td>\n", | |
" <td>0.148880</td>\n", | |
" <td>-1.506780</td>\n", | |
" <td>0.148711</td>\n", | |
" <td>0.868291</td>\n", | |
" <td>-0.611691</td>\n", | |
" <td>0.867535</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4246</th>\n", | |
" <td>-0.686160</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>-0.567911</td>\n", | |
" <td>-0.043691</td>\n", | |
" <td>-0.549528</td>\n", | |
" <td>-0.044078</td>\n", | |
" <td>-1.372790</td>\n", | |
" <td>-0.913104</td>\n", | |
" <td>-1.373311</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4247</th>\n", | |
" <td>-0.635774</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>-0.567911</td>\n", | |
" <td>-0.177010</td>\n", | |
" <td>0.055052</td>\n", | |
" <td>-0.176961</td>\n", | |
" <td>-0.140792</td>\n", | |
" <td>1.297257</td>\n", | |
" <td>-0.141665</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4248</th>\n", | |
" <td>-1.265596</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2.408655</td>\n", | |
" <td>1.026560</td>\n", | |
" <td>1.364976</td>\n", | |
" <td>1.026609</td>\n", | |
" <td>0.454308</td>\n", | |
" <td>1.297257</td>\n", | |
" <td>0.455425</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4249</th>\n", | |
" <td>-0.358652</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1.962170</td>\n", | |
" <td>-0.941739</td>\n", | |
" <td>0.105434</td>\n", | |
" <td>-0.941582</td>\n", | |
" <td>1.332032</td>\n", | |
" <td>0.192077</td>\n", | |
" <td>1.331158</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>4250 rows × 72 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" account_length international_plan voice_mail_plan \\\n", | |
"0 0.170399 0 1 \n", | |
"1 0.926186 0 0 \n", | |
"2 -0.409038 1 0 \n", | |
"3 -0.635774 1 0 \n", | |
"4 0.523099 0 1 \n", | |
"... ... ... ... \n", | |
"4245 -0.434231 0 0 \n", | |
"4246 -0.686160 0 0 \n", | |
"4247 -0.635774 0 0 \n", | |
"4248 -1.265596 0 1 \n", | |
"4249 -0.358652 0 1 \n", | |
"\n", | |
" number_vmail_messages total_day_minutes total_day_calls \\\n", | |
"0 1.366857 -0.345510 1.163449 \n", | |
"1 -0.567911 1.169136 0.710014 \n", | |
"2 -0.567911 2.206058 -1.456398 \n", | |
"3 -0.567911 -0.251076 0.659633 \n", | |
"4 1.218029 0.702522 -0.599910 \n", | |
"... ... ... ... \n", | |
"4245 -0.567911 0.148880 -1.506780 \n", | |
"4246 -0.567911 -0.043691 -0.549528 \n", | |
"4247 -0.567911 -0.177010 0.055052 \n", | |
"4248 2.408655 1.026560 1.364976 \n", | |
"4249 1.962170 -0.941739 0.105434 \n", | |
"\n", | |
" total_day_charge total_eve_minutes total_eve_calls total_eve_charge \\\n", | |
"0 -0.345788 -0.093025 0.141841 -0.092493 \n", | |
"1 1.169295 -1.571820 0.493490 -1.572341 \n", | |
"2 2.206218 -2.752070 -0.611691 -2.752473 \n", | |
"3 -0.251027 -1.032448 1.096316 -1.031447 \n", | |
"4 0.702027 2.952139 0.393019 2.951497 \n", | |
"... ... ... ... ... \n", | |
"4245 0.148711 0.868291 -0.611691 0.867535 \n", | |
"4246 -0.044078 -1.372790 -0.913104 -1.373311 \n", | |
"4247 -0.176961 -0.140792 1.297257 -0.141665 \n", | |
"4248 1.026609 0.454308 1.297257 0.455425 \n", | |
"4249 -0.941582 1.332032 0.192077 1.331158 \n", | |
"\n", | |
" ... UT VA VT WA WI WV WY area_code_408 area_code_415 \\\n", | |
"0 ... 0 0 0 0 0 0 0 0 1 \n", | |
"1 ... 0 0 0 0 0 0 0 0 1 \n", | |
"2 ... 0 0 0 0 0 0 0 1 0 \n", | |
"3 ... 0 0 0 0 0 0 0 0 1 \n", | |
"4 ... 0 0 0 0 0 0 0 0 0 \n", | |
"... ... .. .. .. .. .. .. .. ... ... \n", | |
"4245 ... 0 0 0 0 0 0 0 0 1 \n", | |
"4246 ... 0 0 0 0 0 1 0 1 0 \n", | |
"4247 ... 0 0 0 0 0 0 0 1 0 \n", | |
"4248 ... 0 0 0 0 0 0 0 1 0 \n", | |
"4249 ... 0 0 1 0 0 0 0 0 1 \n", | |
"\n", | |
" area_code_510 \n", | |
"0 0 \n", | |
"1 0 \n", | |
"2 0 \n", | |
"3 0 \n", | |
"4 1 \n", | |
"... ... \n", | |
"4245 0 \n", | |
"4246 0 \n", | |
"4247 0 \n", | |
"4248 0 \n", | |
"4249 0 \n", | |
"\n", | |
"[4250 rows x 72 columns]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"processed_df = pd.get_dummies(df, prefix=['', ''], prefix_sep='', columns=['state', 'area_code'])\n", | |
"\n", | |
"def convert_yesno(s):\n", | |
" if s == 'yes':\n", | |
" return 1\n", | |
" elif s == 'no':\n", | |
" return 0\n", | |
" else:\n", | |
" return None\n", | |
"\n", | |
"processed_df[['international_plan', 'voice_mail_plan', 'churn']] = [\n", | |
" [convert_yesno(c[0]), convert_yesno(c[1]), convert_yesno(c[2]),] for c in processed_df[['international_plan', 'voice_mail_plan', 'churn']].itertuples(index=False)\n", | |
"]\n", | |
"\n", | |
"FEATURES_TO_SCALE = list(df.columns)\n", | |
"for c in ['state', 'area_code', 'international_plan', 'voice_mail_plan', 'churn']:\n", | |
" FEATURES_TO_SCALE.remove(c)\n", | |
"\n", | |
"processed_df[FEATURES_TO_SCALE] = StandardScaler().fit_transform(processed_df[FEATURES_TO_SCALE])\n", | |
"\n", | |
"processed_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"The `size` parameter has been renamed to `height`; please update your code.\n" | |
] | |
}, | |
{ | |
"data": { |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment