Created
March 20, 2014 12:50
-
-
Save minya/9663080 to your computer and use it in GitHub Desktop.
Contest Brel
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "contest" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import pandas as pd", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 73 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "def dummify2(df, columns):\n for column in columns:\n dummies = pd.get_dummies(df[column], prefix=column)\n df = df.drop(column, axis=1)\n df = df.join(dummies)\n return df", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 74 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "def convert_class(cls):\n if cls.find('+') >=0:\n return 1\n else:\n return 0", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 75 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df = pd.read_csv('income_train.csv')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 76 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df.CLASS = df.CLASS.apply(convert_class)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 77 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "\u0421\u043e\u0441\u0442\u0430\u0432\u0438\u043c \u0441\u043f\u0438\u0441\u043e\u043a \u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0430\u043b\u044c\u043d\u044b\u0445 \u043a\u043e\u043b\u043e\u043d\u043e\u043a" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "categorial = []\ncnt = 0\nfor t in df.dtypes:\n if (t == 'object'):\n categorial.append(df.columns[cnt])\n cnt += 1", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 78 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "cat_not_str = []\nfor column in df.columns:\n if column not in categorial and column != 'CLASS':\n desc = df.loc[:,[column]].describe()\n if len(pd.unique(df.loc[:,[column]].values)) <= 7:\n cat_not_str.append(column)\n ", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 79 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df2 = dummify2(df, categorial + cat_not_str)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 82 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "\u041f\u043e\u0434\u0431\u0435\u0440\u0435\u043c \u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440 max_depth \u0434\u043b\u044f \u043e\u0431\u0443\u0447\u0430\u044e\u0449\u0435\u0433\u043e \u0434\u0435\u0440\u0435\u0432\u0430" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from sklearn import tree\nfrom sklearn import cross_validation\nfrom sklearn.preprocessing import scale\ndef preidict_with_tree(max_depth):\n X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(scale(df2.drop('CLASS', axis=1).values), df2.CLASS, test_size=1/4)\n clf = tree.DecisionTreeClassifier(max_depth=max_depth)\n clf = clf.fit(X_train, Y_train)\n Y_predicted = clf.predict_proba(X_test)\n from sklearn.metrics import mean_absolute_error as MAE\n e = MAE(Y_test, Y_predicted[:, 1])\n print(\"Error: {0}\".format(e))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 239 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from sklearn.grid_search import GridSearchCV\nsearch = GridSearchCV(tree.DecisionTreeClassifier(), param_grid=[{'max_depth' : list(range(7, 25, 1))} ])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 305 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search.fit(X_train, Y_train)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 306, | |
"text": "GridSearchCV(cv=None,\n estimator=DecisionTreeClassifier(compute_importances=None, criterion='gini',\n max_depth=None, max_features=None, min_density=None,\n min_samples_leaf=1, min_samples_split=2, random_state=None,\n splitter='best'),\n fit_params={}, iid=True, loss_func=None, n_jobs=1,\n param_grid=[{'max_depth': [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]}],\n pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n verbose=0)" | |
} | |
], | |
"prompt_number": 306 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search.best_params_", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 307, | |
"text": "{'max_depth': 8}" | |
} | |
], | |
"prompt_number": 307 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "preidict_with_tree(15)\n# preidict_with_tree(15)\n# preidict_with_tree(17)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Error: 0.1781860220338418\n" | |
} | |
], | |
"prompt_number": 304 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "GridSearchCV \u0433\u043e\u0432\u043e\u0440\u0438\u0442, \u0447\u0442\u043e 8, \u043d\u043e \u044f \u043f\u043e\u0442\u044b\u043a\u0430\u043b \u0440\u0443\u043a\u0430\u043c\u0438 \u0438 \u0443 \u043c\u0435\u043d\u044f \u043f\u043e\u043b\u0443\u0447\u0438\u043b\u043e\u0441\u044c, \u0447\u0442\u043e 15 \u043b\u0443\u0447\u0448\u0435 (WTF?)" | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "\u041f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c, \u043b\u0443\u0447\u0448\u0435 \u043b\u0438 KNN" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from sklearn.neighbors import KNeighborsClassifier as KNN\nfrom sklearn.grid_search import GridSearchCV\nsearch = GridSearchCV(KNN(weights='distance'),param_grid=[{'n_neighbors' : list(range(1, 25, 5))} ])\nsearch = search.fit(X_train, Y_train)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 83 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search.best_params_", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 84, | |
"text": "{'n_neighbors': 21}" | |
} | |
], | |
"prompt_number": 84 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search = GridSearchCV(KNN(weights='distance'),param_grid=[{'n_neighbors' : list(range(17, 25, 3))} ])\nsearch.fit(X_train, Y_train)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 85, | |
"text": "GridSearchCV(cv=None,\n estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n n_neighbors=5, p=2, weights='distance'),\n fit_params={}, iid=True, loss_func=None, n_jobs=1,\n param_grid=[{'n_neighbors': [21, 23]}], pre_dispatch='2*n_jobs',\n refit=True, score_func=None, scoring=None, verbose=0)" | |
} | |
], | |
"prompt_number": 85 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search.best_params_", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 86, | |
"text": "{'n_neighbors': 23}" | |
} | |
], | |
"prompt_number": 86 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search = GridSearchCV(KNN(weights='distance'),param_grid=[{'n_neighbors' : list(range(21, 23, 1))} ])\nsearch.fit(X_train, Y_train)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 87, | |
"text": "GridSearchCV(cv=None,\n estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n n_neighbors=5, p=2, weights='distance'),\n fit_params={}, iid=True, loss_func=None, n_jobs=1,\n param_grid=[{'n_neighbors': [21, 22]}], pre_dispatch='2*n_jobs',\n refit=True, score_func=None, scoring=None, verbose=0)" | |
} | |
], | |
"prompt_number": 87 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "search.best_params_", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 88, | |
"text": "{'n_neighbors': 22}" | |
} | |
], | |
"prompt_number": 88 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from sklearn import cross_validation\ndef preidict_with_knn(nn):\n X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(scale(df2.drop('CLASS', axis=1).values), df2.CLASS, test_size=1/4)\n clf = KNN(n_neighbors=nn, weights='distance')\n clf = clf.fit(X_train, Y_train)\n Y_predicted = clf.predict_proba(X_test)\n from sklearn.metrics import mean_absolute_error as MAE\n e = MAE(Y_test, Y_predicted[:, 1])\n print(\"Error: {0}\".format(e))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 93 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "preidict_with_knn(25)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Error: 0.2713978826288885\n" | |
} | |
], | |
"prompt_number": 94 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "preidict_with_knn(22)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Error: 0.26926506682476214\n" | |
} | |
], | |
"prompt_number": 96 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "preidict_with_knn(19)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Error: 0.2730720793439653\n" | |
} | |
], | |
"prompt_number": 99 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "\u0417\u0434\u0435\u0441\u044c \u0443\u0436\u0435 \u0431\u043b\u0438\u0436\u0435 \u043a \u0438\u0441\u0442\u0438\u043d\u0435. \u041d\u043e \u0434\u0435\u0440\u0435\u0432\u044c\u044f \u043b\u0443\u0447\u0448\u0435. \u041f\u043e\u044d\u0442\u043e\u043c\u0443, \u043f\u0440\u0438\u043c\u0435\u043d\u044f\u0435\u043c \u0434\u0435\u0440\u0435\u0432\u044c\u044f." | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df_train = pd.read_csv('income_train.csv')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 61 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df_test = pd.read_csv('income_test.csv')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 62 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df_joined = pd.concat([df_train.drop('CLASS', axis=1), df_test.drop('ID', axis=1)]) ", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 63 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "def dummify3(df, columns):\n for column in columns:\n dummies = pd.get_dummies(df[column], prefix=column)\n df = pd.concat([df, dummies], axis=1)\n del df[column]\n return df", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 64 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "X_joined = dummify3(df_joined, categorial + cat_not_str)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 65 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "X_joined.shape", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 66, | |
"text": "(14800, 386)" | |
} | |
], | |
"prompt_number": 66 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "X_joined = scale(X_joined)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 67 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "X_train = X_joined[:len(df_train), :]\nY_train = df_train.CLASS.apply(convert_class).values.astype(np.float32)\nX_test = X_joined[len(df_train):, :]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 68 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "clf = tree.DecisionTreeClassifier(max_depth=15)\nclf = clf.fit(X_train, Y_train)\n", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 69 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "Y_predicted = clf.predict_proba(X_test)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 70 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "df_test['Prediction'] = Y_predicted[:, 1]\ndf_test.loc[:, ['ID', 'Prediction']].to_csv('prediction.csv', index=False)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/site-packages/pandas/core/internals.py:1190: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n values = values[:, slicer]\n/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/site-packages/pandas/core/internals.py:552: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n values = values[:, slicer]\n/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/site-packages/pandas/core/index.py:624: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n result = arr_idx[key]\n" | |
} | |
], | |
"prompt_number": 71 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 71 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment