Created
October 18, 2015 10:26
-
-
Save pronojitsaha/fbb73f94ca7a4223c9e0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# suppress pandas warnings\n", | |
"import warnings\n", | |
"warnings.simplefilter(action = \"ignore\", category = RuntimeWarning)\n", | |
"warnings.simplefilter(action = \"ignore\", category = FutureWarning)\n", | |
"\n", | |
"# imports\n", | |
"import sys\n", | |
"import xgboost as xgb\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from sklearn import preprocessing\n", | |
"import pandas as pd\n", | |
"from numpy.random import seed\n", | |
"from sklearn.cross_validation import StratifiedShuffleSplit\n", | |
"\n", | |
"# reproduce results\n", | |
"seed(786)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"((8123, 32), (2032, 31))" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Read the files\n", | |
"train = pd.read_csv('data/train_K9K1f9B.csv')\n", | |
"test = pd.read_csv('data/test_yIjzS7t.csv')\n", | |
"train.shape, test.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Columns in which the literal string 'Unknown' is used as a placeholder value.\n", | |
"unknown_cols = ['status', 'occupation', 'occupation_partner', 'region']\n", | |
"for col in unknown_cols:\n", | |
"    # .loc replaces .ix (deprecated, removed in pandas 1.0); boolean row mask + column label.\n", | |
"    train.loc[train[col] == 'Unknown', col] = float('nan')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"I also treated the value 'Unknown' in these features as a missing value (NaN). " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#For test set: apply the same 'Unknown' -> NaN replacement as on the training set\n", | |
"for col in unknown_cols:\n", | |
"    # .loc replaces .ix (deprecated, removed in pandas 1.0); boolean row mask + column label.\n", | |
"    test.loc[test[col] == 'Unknown', col] = float('nan')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Missing values were not given any special treatment, as XGBoost has its own way of handling them; we leave it to the algorithm to decide what is best. " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The feature 'post_area' has a lot of levels, so the infrequent areas (those occurring fewer than 5 times in the training set) are grouped into a single level, 'Others'. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Frequency of each post area in the training set (missing values are not counted).\n", | |
"pa_counts = train.post_area.value_counts(dropna=True)\n", | |
"# Areas seen fewer than 5 times in train are collapsed into one level, in both train and test.\n", | |
"pa_counts_rare = list(pa_counts[pa_counts<5].index)\n", | |
"# .loc replaces .ix (deprecated, removed in pandas 1.0).\n", | |
"train.loc[train['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\"\n", | |
"test.loc[test['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For the test set, any 'post_area' value that does not appear in the training set is mapped to 'Others' as well. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Post areas present in test but never seen in train.\n", | |
"# NaN is dropped first: NaN does not compare equal to itself, so it could survive the set\n", | |
"# difference, and isin() would then relabel missing post areas as 'Others'.\n", | |
"newpostareas = list(set(test['post_area'].dropna()) - set(train['post_area'].dropna()))\n", | |
"# .loc replaces .ix (deprecated, removed in pandas 1.0).\n", | |
"test.loc[test['post_area'].isin(newpostareas), \"post_area\"] = \"Others\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Encode the target: Silver -> 0, Gold -> 1.\n", | |
"label = train['Account.Status'].map({'Silver': 0, 'Gold': 1})\n", | |
"y = label\n", | |
"\n", | |
"# Columns kept out of the feature matrix. 'post_code' is dropped because its highest frequency\n", | |
"# was 2, so it would not add any value to the model.\n", | |
"dropCols = ['REF_NO', 'Account.Status', 'post_code']\n", | |
"data = train.drop(dropCols, axis=1)  # returns a new frame; train itself is left untouched\n", | |
"\n", | |
"# One-hot encode the categorical features (one indicator column per level).\n", | |
"X = pd.get_dummies(data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#For test\n", | |
"# Drop the same non-feature columns as for train, except the target 'Account.Status'.\n", | |
"testdropcols = [c for c in dropCols if c != 'Account.Status']\n", | |
"test2 = test.drop(testdropcols, axis=1)\n", | |
"\n", | |
"# Align the dummy columns with the training matrix X: levels absent from test become all-zero\n", | |
"# columns, columns that X does not have are discarded, and the column order follows X.\n", | |
"Final_test = pd.get_dummies(test2).reindex(columns=X.columns, fill_value=0)\n", | |
"assert X.columns.equals(Final_test.columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 7255\n", | |
"1 868\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"label.value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"StratifiedShuffleSplit(labels=[0 0 0 ..., 0 0 0], n_iter=1, test_size=0.25, random_state=0)" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Creating a hold out set using stratified sampling as our target variable is skewed\n", | |
"holdout_fold = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)\n", | |
"holdout_fold" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# The splitter yields positional row indices, so rows are selected with .iloc.\n", | |
"# (.ix mixed label and positional lookup and was removed in pandas 1.0.)\n", | |
"for train_index, holdout_index in holdout_fold:\n", | |
"    X_train = X.iloc[train_index]\n", | |
"    X_test = X.iloc[holdout_index]\n", | |
"    y_train = y.iloc[train_index]\n", | |
"    y_test = y.iloc[holdout_index]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Define the evaluation function\n", | |
"def evaluation(y_predicted, y_true):\n", | |
"    \"\"\"Return the cost metric M = (8*Ns + 2*Ng) / T (lower is better).\n", | |
"\n", | |
"    Ns counts rows predicted 0 whose true label is 1; Ng counts rows predicted 1 whose\n", | |
"    true label is 0; T is the number of rows.\n", | |
"\n", | |
"    y_predicted, y_true: equal-length iterables of 0/1 labels.\n", | |
"    Raises ValueError if the lengths differ or the inputs are empty.\n", | |
"    \"\"\"\n", | |
"    y_predicted = list(y_predicted)\n", | |
"    y_true = list(y_true)\n", | |
"    # zip() would silently truncate to the shorter input and skew the metric.\n", | |
"    if len(y_predicted) != len(y_true):\n", | |
"        raise ValueError(\"y_predicted and y_true must have the same length\")\n", | |
"    if not y_true:\n", | |
"        raise ValueError(\"cannot evaluate empty inputs\")\n", | |
"    Ns = sum(1 for p, t in zip(y_predicted, y_true) if p == 0 and t == 1)\n", | |
"    Ng = sum(1 for p, t in zip(y_predicted, y_true) if p == 1 and t == 0)\n", | |
"    T = len(y_true)\n", | |
"    return (8*Ns + 2*Ng)/float(T)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#finding the ideal learning rate and num_rounds\n", | |
"params = {}\n", | |
"params[\"objective\"] = \"binary:logistic\"\n", | |
"params[\"max_depth\"] = 10\n", | |
"params[\"eta\"] = 0.01 #lower is more conservative, range [0,1]; if reduced then increase num_rounds\n", | |
"# The key must be exactly \"eval_metric\"; with a trailing space it is a different, unrecognised name.\n", | |
"params[\"eval_metric\"] = 'logloss'\n", | |
"params[\"seed\"] = 0\n", | |
"params[\"silent\"] = 1\n", | |
"plst = list(params.items())\n", | |
"num_rounds = 10000\n", | |
"\n", | |
"xgtrain = xgb.DMatrix(X_train, label=y_train) #weight= trainX_mobN_weight\n", | |
"#xgb.cv(params, xgtrain, num_rounds, nfold=4, metrics={'logloss'})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Running the commented-out `xgb.cv` call above gives the following output (last few rounds shown):" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"[1074]\tcv-test-logloss:0.097280+0.003721\tcv-train-logloss:0.028058+0.001354 \n", | |
"[1075]\tcv-test-logloss:0.097280+0.003718\tcv-train-logloss:0.028017+0.001332 \n", | |
"[1076]\tcv-test-logloss:0.097291+0.003725\tcv-train-logloss:0.027987+0.001338 \n", | |
"[1077]\tcv-test-logloss:0.097279+0.003724\tcv-train-logloss:0.027958+0.001339 \n", | |
"[1078]\tcv-test-logloss:0.097273+0.003725\tcv-train-logloss:0.027924+0.001347 " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
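"As one can see from the log above, the cross-validated test log loss stops improving at around tree number 1077 while the training log loss keeps falling, i.e. this is where overfitting starts." | |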
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#give higher weights to the labels 1 as the target variable is skewed\n", | |
"# The loop variable is deliberately not named `data`: under this Python 2 kernel a\n", | |
"# list-comprehension variable leaks into the notebook namespace and would overwrite the `data` DataFrame.\n", | |
"Xtrain_weight = [2 if lbl == 1 else 1 for lbl in y_train]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Having determined the ideal learning rate and num_rounds, we tune the other parameters\n", | |
"params = {}\n", | |
"params[\"objective\"] = \"binary:logistic\"\n", | |
"#To avoid overfitting: The first way is to directly control model complexity\n", | |
"params[\"min_child_weight\"] = 3 #The larger, the more conservative the algorithm will be.\n", | |
"params[\"max_depth\"] = 10\n", | |
"#params[\"gamma\"] = 0 #The larger, the more conservative the algorithm will be.\n", | |
"params[\"eta\"] = 0.01 #lower is more conservative, range [0,1]; if reduced then increase num_rounds\n", | |
"#The second way is to add randomness to make training robust to noise\n", | |
"params[\"subsample\"] = 0.9\n", | |
"params[\"colsample_bytree\"] = 0.9\n", | |
"\n", | |
"#Handle Imbalanced Dataset\n", | |
"#If you care only about the ranking order (AUC) of your prediction\n", | |
"#params[\"scale_pos_weight\"] = 1 #ratio of labels in target variable\n", | |
"# The key must be exactly \"eval_metric\"; with a trailing space it is a different, unrecognised name.\n", | |
"params[\"eval_metric\"] = 'logloss'\n", | |
"#If you care about predicting the right probability\n", | |
"params[\"max_delta_step\"]= 8 #should be high for skewed data\n", | |
"\n", | |
"params[\"seed\"] = 0\n", | |
"params[\"silent\"] = 1\n", | |
"params[\"nthread\"] = 4\n", | |
"plst = list(params.items())\n", | |
"num_rounds = 1100" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"xgtrain = xgb.DMatrix(X_train, label=y_train, weight= Xtrain_weight)\n", | |
"xgtest = xgb.DMatrix(X_test)\n", | |
"model = xgb.train(plst, xgtrain, num_rounds)\n", | |
"pred_ytest = model.predict(xgtest)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"In our evaluation function we have a higher penalty for misclassifying an account as silver (i.e. 0) than for misclassifying it as gold (i.e. 1). So it is better for our model to predict gold more often and be wrong on some of those than to predict silver more often and be wrong on those. We therefore chose a low probability threshold for predicting gold (found by trial and error); in our case it is 0.30. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"predictions = [1 if pred > 0.30 else 0 for pred in pred_ytest]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.12506154603643527" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"evaluation(predictions,y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#Predict on the test set\n", | |
"# Refit on the full training data with the same class weights (label 1 -> 2, label 0 -> 1).\n", | |
"# The loop variable is deliberately not named `data`: under this Python 2 kernel a\n", | |
"# list-comprehension variable leaks into the notebook namespace and would overwrite the `data` DataFrame.\n", | |
"X_weight = [2 if lbl == 1 else 1 for lbl in y]\n", | |
"xgtrain = xgb.DMatrix(X, label=y, weight= X_weight)\n", | |
"xgtest = xgb.DMatrix(Final_test)\n", | |
"model_full = xgb.train(plst, xgtrain, num_rounds)\n", | |
"pred_Finaltest = model_full.predict(xgtest)\n", | |
"# Same 0.30 threshold as used on the hold-out set.\n", | |
"predictions_final = ['Gold' if pred > 0.30 else 'Silver' for pred in pred_Finaltest]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We save the predicted probabilities for ensembling with the other XGB model that we built. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Write the raw model outputs (not the thresholded labels) keyed by REF_NO.\n", | |
"to_ensemble = pd.DataFrame(\n", | |
"    {'Account.Status': pred_Finaltest, 'REF_NO': test['REF_NO']},\n", | |
"    columns=['REF_NO', 'Account.Status']\n", | |
")\n", | |
"to_ensemble.to_csv(\"data/subXGB2.csv\", index = False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment