Skip to content

Instantly share code, notes, and snippets.

@yifeihuang
Created December 28, 2020 22:25
Show Gist options
  • Save yifeihuang/adc7be2931f35b14570627e938ace523 to your computer and use it in GitHub Desktop.
Save yifeihuang/adc7be2931f35b14570627e938ace523 to your computer and use it in GitHub Desktop.
Explain GBT churn model with SHAP
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import shap\n",
"\n",
"from sklearn.metrics import balanced_accuracy_score, accuracy_score\n",
"from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>account_length</th>\n",
" <th>area_code</th>\n",
" <th>international_plan</th>\n",
" <th>voice_mail_plan</th>\n",
" <th>number_vmail_messages</th>\n",
" <th>total_day_minutes</th>\n",
" <th>total_day_calls</th>\n",
" <th>total_day_charge</th>\n",
" <th>total_eve_minutes</th>\n",
" <th>total_eve_calls</th>\n",
" <th>total_eve_charge</th>\n",
" <th>total_night_minutes</th>\n",
" <th>total_night_calls</th>\n",
" <th>total_night_charge</th>\n",
" <th>total_intl_minutes</th>\n",
" <th>total_intl_calls</th>\n",
" <th>total_intl_charge</th>\n",
" <th>number_customer_service_calls</th>\n",
" <th>churn</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>OH</td>\n",
" <td>107</td>\n",
" <td>area_code_415</td>\n",
" <td>no</td>\n",
" <td>yes</td>\n",
" <td>26</td>\n",
" <td>161.6</td>\n",
" <td>123</td>\n",
" <td>27.47</td>\n",
" <td>195.5</td>\n",
" <td>103</td>\n",
" <td>16.62</td>\n",
" <td>254.4</td>\n",
" <td>103</td>\n",
" <td>11.45</td>\n",
" <td>13.7</td>\n",
" <td>3</td>\n",
" <td>3.70</td>\n",
" <td>1</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NJ</td>\n",
" <td>137</td>\n",
" <td>area_code_415</td>\n",
" <td>no</td>\n",
" <td>no</td>\n",
" <td>0</td>\n",
" <td>243.4</td>\n",
" <td>114</td>\n",
" <td>41.38</td>\n",
" <td>121.2</td>\n",
" <td>110</td>\n",
" <td>10.30</td>\n",
" <td>162.6</td>\n",
" <td>104</td>\n",
" <td>7.32</td>\n",
" <td>12.2</td>\n",
" <td>5</td>\n",
" <td>3.29</td>\n",
" <td>0</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>OH</td>\n",
" <td>84</td>\n",
" <td>area_code_408</td>\n",
" <td>yes</td>\n",
" <td>no</td>\n",
" <td>0</td>\n",
" <td>299.4</td>\n",
" <td>71</td>\n",
" <td>50.90</td>\n",
" <td>61.9</td>\n",
" <td>88</td>\n",
" <td>5.26</td>\n",
" <td>196.9</td>\n",
" <td>89</td>\n",
" <td>8.86</td>\n",
" <td>6.6</td>\n",
" <td>7</td>\n",
" <td>1.78</td>\n",
" <td>2</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>OK</td>\n",
" <td>75</td>\n",
" <td>area_code_415</td>\n",
" <td>yes</td>\n",
" <td>no</td>\n",
" <td>0</td>\n",
" <td>166.7</td>\n",
" <td>113</td>\n",
" <td>28.34</td>\n",
" <td>148.3</td>\n",
" <td>122</td>\n",
" <td>12.61</td>\n",
" <td>186.9</td>\n",
" <td>121</td>\n",
" <td>8.41</td>\n",
" <td>10.1</td>\n",
" <td>3</td>\n",
" <td>2.73</td>\n",
" <td>3</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MA</td>\n",
" <td>121</td>\n",
" <td>area_code_510</td>\n",
" <td>no</td>\n",
" <td>yes</td>\n",
" <td>24</td>\n",
" <td>218.2</td>\n",
" <td>88</td>\n",
" <td>37.09</td>\n",
" <td>348.5</td>\n",
" <td>108</td>\n",
" <td>29.62</td>\n",
" <td>212.6</td>\n",
" <td>118</td>\n",
" <td>9.57</td>\n",
" <td>7.5</td>\n",
" <td>7</td>\n",
" <td>2.03</td>\n",
" <td>3</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4245</th>\n",
" <td>MT</td>\n",
" <td>83</td>\n",
" <td>area_code_415</td>\n",
" <td>no</td>\n",
" <td>no</td>\n",
" <td>0</td>\n",
" <td>188.3</td>\n",
" <td>70</td>\n",
" <td>32.01</td>\n",
" <td>243.8</td>\n",
" <td>88</td>\n",
" <td>20.72</td>\n",
" <td>213.7</td>\n",
" <td>79</td>\n",
" <td>9.62</td>\n",
" <td>10.3</td>\n",
" <td>6</td>\n",
" <td>2.78</td>\n",
" <td>0</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4246</th>\n",
" <td>WV</td>\n",
" <td>73</td>\n",
" <td>area_code_408</td>\n",
" <td>no</td>\n",
" <td>no</td>\n",
" <td>0</td>\n",
" <td>177.9</td>\n",
" <td>89</td>\n",
" <td>30.24</td>\n",
" <td>131.2</td>\n",
" <td>82</td>\n",
" <td>11.15</td>\n",
" <td>186.2</td>\n",
" <td>89</td>\n",
" <td>8.38</td>\n",
" <td>11.5</td>\n",
" <td>6</td>\n",
" <td>3.11</td>\n",
" <td>3</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4247</th>\n",
" <td>NC</td>\n",
" <td>75</td>\n",
" <td>area_code_408</td>\n",
" <td>no</td>\n",
" <td>no</td>\n",
" <td>0</td>\n",
" <td>170.7</td>\n",
" <td>101</td>\n",
" <td>29.02</td>\n",
" <td>193.1</td>\n",
" <td>126</td>\n",
" <td>16.41</td>\n",
" <td>129.1</td>\n",
" <td>104</td>\n",
" <td>5.81</td>\n",
" <td>6.9</td>\n",
" <td>7</td>\n",
" <td>1.86</td>\n",
" <td>1</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4248</th>\n",
" <td>HI</td>\n",
" <td>50</td>\n",
" <td>area_code_408</td>\n",
" <td>no</td>\n",
" <td>yes</td>\n",
" <td>40</td>\n",
" <td>235.7</td>\n",
" <td>127</td>\n",
" <td>40.07</td>\n",
" <td>223.0</td>\n",
" <td>126</td>\n",
" <td>18.96</td>\n",
" <td>297.5</td>\n",
" <td>116</td>\n",
" <td>13.39</td>\n",
" <td>9.9</td>\n",
" <td>5</td>\n",
" <td>2.67</td>\n",
" <td>2</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4249</th>\n",
" <td>VT</td>\n",
" <td>86</td>\n",
" <td>area_code_415</td>\n",
" <td>no</td>\n",
" <td>yes</td>\n",
" <td>34</td>\n",
" <td>129.4</td>\n",
" <td>102</td>\n",
" <td>22.00</td>\n",
" <td>267.1</td>\n",
" <td>104</td>\n",
" <td>22.70</td>\n",
" <td>154.8</td>\n",
" <td>100</td>\n",
" <td>6.97</td>\n",
" <td>9.3</td>\n",
" <td>16</td>\n",
" <td>2.51</td>\n",
" <td>0</td>\n",
" <td>no</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4250 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" state account_length area_code international_plan voice_mail_plan \\\n",
"0 OH 107 area_code_415 no yes \n",
"1 NJ 137 area_code_415 no no \n",
"2 OH 84 area_code_408 yes no \n",
"3 OK 75 area_code_415 yes no \n",
"4 MA 121 area_code_510 no yes \n",
"... ... ... ... ... ... \n",
"4245 MT 83 area_code_415 no no \n",
"4246 WV 73 area_code_408 no no \n",
"4247 NC 75 area_code_408 no no \n",
"4248 HI 50 area_code_408 no yes \n",
"4249 VT 86 area_code_415 no yes \n",
"\n",
" number_vmail_messages total_day_minutes total_day_calls \\\n",
"0 26 161.6 123 \n",
"1 0 243.4 114 \n",
"2 0 299.4 71 \n",
"3 0 166.7 113 \n",
"4 24 218.2 88 \n",
"... ... ... ... \n",
"4245 0 188.3 70 \n",
"4246 0 177.9 89 \n",
"4247 0 170.7 101 \n",
"4248 40 235.7 127 \n",
"4249 34 129.4 102 \n",
"\n",
" total_day_charge total_eve_minutes total_eve_calls total_eve_charge \\\n",
"0 27.47 195.5 103 16.62 \n",
"1 41.38 121.2 110 10.30 \n",
"2 50.90 61.9 88 5.26 \n",
"3 28.34 148.3 122 12.61 \n",
"4 37.09 348.5 108 29.62 \n",
"... ... ... ... ... \n",
"4245 32.01 243.8 88 20.72 \n",
"4246 30.24 131.2 82 11.15 \n",
"4247 29.02 193.1 126 16.41 \n",
"4248 40.07 223.0 126 18.96 \n",
"4249 22.00 267.1 104 22.70 \n",
"\n",
" total_night_minutes total_night_calls total_night_charge \\\n",
"0 254.4 103 11.45 \n",
"1 162.6 104 7.32 \n",
"2 196.9 89 8.86 \n",
"3 186.9 121 8.41 \n",
"4 212.6 118 9.57 \n",
"... ... ... ... \n",
"4245 213.7 79 9.62 \n",
"4246 186.2 89 8.38 \n",
"4247 129.1 104 5.81 \n",
"4248 297.5 116 13.39 \n",
"4249 154.8 100 6.97 \n",
"\n",
" total_intl_minutes total_intl_calls total_intl_charge \\\n",
"0 13.7 3 3.70 \n",
"1 12.2 5 3.29 \n",
"2 6.6 7 1.78 \n",
"3 10.1 3 2.73 \n",
"4 7.5 7 2.03 \n",
"... ... ... ... \n",
"4245 10.3 6 2.78 \n",
"4246 11.5 6 3.11 \n",
"4247 6.9 7 1.86 \n",
"4248 9.9 5 2.67 \n",
"4249 9.3 16 2.51 \n",
"\n",
" number_customer_service_calls churn \n",
"0 1 no \n",
"1 0 no \n",
"2 2 no \n",
"3 3 no \n",
"4 3 no \n",
"... ... ... \n",
"4245 0 no \n",
"4246 3 no \n",
"4247 1 no \n",
"4248 2 no \n",
"4249 0 no \n",
"\n",
"[4250 rows x 20 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Location of the training CSV, relative to the current working directory.\n",
"TRAIN_CSV_PATH = './data/train.csv'\n",
"\n",
"df = pd.read_csv(TRAIN_CSV_PATH)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>account_length</th>\n",
" <th>international_plan</th>\n",
" <th>voice_mail_plan</th>\n",
" <th>number_vmail_messages</th>\n",
" <th>total_day_minutes</th>\n",
" <th>total_day_calls</th>\n",
" <th>total_day_charge</th>\n",
" <th>total_eve_minutes</th>\n",
" <th>total_eve_calls</th>\n",
" <th>total_eve_charge</th>\n",
" <th>...</th>\n",
" <th>UT</th>\n",
" <th>VA</th>\n",
" <th>VT</th>\n",
" <th>WA</th>\n",
" <th>WI</th>\n",
" <th>WV</th>\n",
" <th>WY</th>\n",
" <th>area_code_408</th>\n",
" <th>area_code_415</th>\n",
" <th>area_code_510</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.170399</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.366857</td>\n",
" <td>-0.345510</td>\n",
" <td>1.163449</td>\n",
" <td>-0.345788</td>\n",
" <td>-0.093025</td>\n",
" <td>0.141841</td>\n",
" <td>-0.092493</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.926186</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.567911</td>\n",
" <td>1.169136</td>\n",
" <td>0.710014</td>\n",
" <td>1.169295</td>\n",
" <td>-1.571820</td>\n",
" <td>0.493490</td>\n",
" <td>-1.572341</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.409038</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>-0.567911</td>\n",
" <td>2.206058</td>\n",
" <td>-1.456398</td>\n",
" <td>2.206218</td>\n",
" <td>-2.752070</td>\n",
" <td>-0.611691</td>\n",
" <td>-2.752473</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.635774</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>-0.567911</td>\n",
" <td>-0.251076</td>\n",
" <td>0.659633</td>\n",
" <td>-0.251027</td>\n",
" <td>-1.032448</td>\n",
" <td>1.096316</td>\n",
" <td>-1.031447</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.523099</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.218029</td>\n",
" <td>0.702522</td>\n",
" <td>-0.599910</td>\n",
" <td>0.702027</td>\n",
" <td>2.952139</td>\n",
" <td>0.393019</td>\n",
" <td>2.951497</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4245</th>\n",
" <td>-0.434231</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.567911</td>\n",
" <td>0.148880</td>\n",
" <td>-1.506780</td>\n",
" <td>0.148711</td>\n",
" <td>0.868291</td>\n",
" <td>-0.611691</td>\n",
" <td>0.867535</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4246</th>\n",
" <td>-0.686160</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.567911</td>\n",
" <td>-0.043691</td>\n",
" <td>-0.549528</td>\n",
" <td>-0.044078</td>\n",
" <td>-1.372790</td>\n",
" <td>-0.913104</td>\n",
" <td>-1.373311</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4247</th>\n",
" <td>-0.635774</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>-0.567911</td>\n",
" <td>-0.177010</td>\n",
" <td>0.055052</td>\n",
" <td>-0.176961</td>\n",
" <td>-0.140792</td>\n",
" <td>1.297257</td>\n",
" <td>-0.141665</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4248</th>\n",
" <td>-1.265596</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2.408655</td>\n",
" <td>1.026560</td>\n",
" <td>1.364976</td>\n",
" <td>1.026609</td>\n",
" <td>0.454308</td>\n",
" <td>1.297257</td>\n",
" <td>0.455425</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4249</th>\n",
" <td>-0.358652</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.962170</td>\n",
" <td>-0.941739</td>\n",
" <td>0.105434</td>\n",
" <td>-0.941582</td>\n",
" <td>1.332032</td>\n",
" <td>0.192077</td>\n",
" <td>1.331158</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4250 rows × 72 columns</p>\n",
"</div>"
],
"text/plain": [
" account_length international_plan voice_mail_plan \\\n",
"0 0.170399 0 1 \n",
"1 0.926186 0 0 \n",
"2 -0.409038 1 0 \n",
"3 -0.635774 1 0 \n",
"4 0.523099 0 1 \n",
"... ... ... ... \n",
"4245 -0.434231 0 0 \n",
"4246 -0.686160 0 0 \n",
"4247 -0.635774 0 0 \n",
"4248 -1.265596 0 1 \n",
"4249 -0.358652 0 1 \n",
"\n",
" number_vmail_messages total_day_minutes total_day_calls \\\n",
"0 1.366857 -0.345510 1.163449 \n",
"1 -0.567911 1.169136 0.710014 \n",
"2 -0.567911 2.206058 -1.456398 \n",
"3 -0.567911 -0.251076 0.659633 \n",
"4 1.218029 0.702522 -0.599910 \n",
"... ... ... ... \n",
"4245 -0.567911 0.148880 -1.506780 \n",
"4246 -0.567911 -0.043691 -0.549528 \n",
"4247 -0.567911 -0.177010 0.055052 \n",
"4248 2.408655 1.026560 1.364976 \n",
"4249 1.962170 -0.941739 0.105434 \n",
"\n",
" total_day_charge total_eve_minutes total_eve_calls total_eve_charge \\\n",
"0 -0.345788 -0.093025 0.141841 -0.092493 \n",
"1 1.169295 -1.571820 0.493490 -1.572341 \n",
"2 2.206218 -2.752070 -0.611691 -2.752473 \n",
"3 -0.251027 -1.032448 1.096316 -1.031447 \n",
"4 0.702027 2.952139 0.393019 2.951497 \n",
"... ... ... ... ... \n",
"4245 0.148711 0.868291 -0.611691 0.867535 \n",
"4246 -0.044078 -1.372790 -0.913104 -1.373311 \n",
"4247 -0.176961 -0.140792 1.297257 -0.141665 \n",
"4248 1.026609 0.454308 1.297257 0.455425 \n",
"4249 -0.941582 1.332032 0.192077 1.331158 \n",
"\n",
" ... UT VA VT WA WI WV WY area_code_408 area_code_415 \\\n",
"0 ... 0 0 0 0 0 0 0 0 1 \n",
"1 ... 0 0 0 0 0 0 0 0 1 \n",
"2 ... 0 0 0 0 0 0 0 1 0 \n",
"3 ... 0 0 0 0 0 0 0 0 1 \n",
"4 ... 0 0 0 0 0 0 0 0 0 \n",
"... ... .. .. .. .. .. .. .. ... ... \n",
"4245 ... 0 0 0 0 0 0 0 0 1 \n",
"4246 ... 0 0 0 0 0 1 0 1 0 \n",
"4247 ... 0 0 0 0 0 0 0 1 0 \n",
"4248 ... 0 0 0 0 0 0 0 1 0 \n",
"4249 ... 0 0 1 0 0 0 0 0 1 \n",
"\n",
" area_code_510 \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 1 \n",
"... ... \n",
"4245 0 \n",
"4246 0 \n",
"4247 0 \n",
"4248 0 \n",
"4249 0 \n",
"\n",
"[4250 rows x 72 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Columns holding 'yes'/'no' flags that are recoded to 1/0 below.\n",
"YES_NO_COLUMNS = ['international_plan', 'voice_mail_plan', 'churn']\n",
"# Categorical columns that are one-hot encoded.\n",
"CATEGORICAL_COLUMNS = ['state', 'area_code']\n",
"\n",
"\n",
"def convert_yesno(s):\n",
"    \"\"\"Return 1 for 'yes', 0 for 'no', and None for any other value.\"\"\"\n",
"    if s == 'yes':\n",
"        return 1\n",
"    elif s == 'no':\n",
"        return 0\n",
"    else:\n",
"        return None\n",
"\n",
"\n",
"# One-hot encode the categorical columns. The empty prefixes keep the raw\n",
"# category values as column names (e.g. 'OH', 'area_code_415'). The explicit\n",
"# dtype keeps the dummies numeric 0/1 on pandas >= 2.0, where the default is bool.\n",
"processed_df = pd.get_dummies(\n",
"    df, prefix=['', ''], prefix_sep='', columns=CATEGORICAL_COLUMNS, dtype='uint8'\n",
")\n",
"\n",
"# Recode the yes/no flags column by column instead of looping over every row.\n",
"processed_df[YES_NO_COLUMNS] = processed_df[YES_NO_COLUMNS].apply(\n",
"    lambda col: col.map(convert_yesno)\n",
")\n",
"\n",
"# Every original column that is neither categorical nor a yes/no flag is standardized.\n",
"FEATURES_TO_SCALE = [\n",
"    c for c in df.columns if c not in CATEGORICAL_COLUMNS + YES_NO_COLUMNS\n",
"]\n",
"\n",
"# NOTE(review): the scaler is fitted on every row of df; if a train/test split\n",
"# happens later, the held-out rows leak into these statistics -- confirm, and\n",
"# consider fitting on the training portion only.\n",
"processed_df[FEATURES_TO_SCALE] = StandardScaler().fit_transform(processed_df[FEATURES_TO_SCALE])\n",
"\n",
"processed_df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The `size` parameter has been renamed to `height`; please update your code.\n"
]
},
{
"data": {
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment