Created
March 11, 2015 04:06
-
-
Save hagino3000/9c8c0b71b6302ca28f25 to your computer and use it in GitHub Desktop.
Kaggle Titanic Competition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 358, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.cross_validation import train_test_split, cross_val_score, KFold\n", | |
| "from sklearn.linear_model import LogisticRegression\n", | |
| "from sklearn.linear_model import LinearRegression\n", | |
| "from sklearn.svm import LinearSVC\n", | |
| "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", | |
| "from sklearn.tree import DecisionTreeClassifier\n", | |
| "from sklearn.ensemble import RandomForestClassifier" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 342, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def plot_confusion_matrix(cm):\n", | |
| " fig, ax = plt.subplots()\n", | |
| " im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n", | |
| " ax.set_title('Confusion Matrix')\n", | |
| " fig.colorbar(im)\n", | |
| "\n", | |
| " target_names = ['not survived', 'survived']\n", | |
| "\n", | |
| " tick_marks = np.arange(len(target_names))\n", | |
| " ax.set_xticks(tick_marks)\n", | |
| " ax.set_xticklabels(target_names, rotation=45)\n", | |
| " ax.set_yticks(tick_marks)\n", | |
| " ax.set_yticklabels(target_names)\n", | |
| " ax.set_ylabel('True label')\n", | |
| " ax.set_xlabel('Predicted label')\n", | |
| " fig.tight_layout()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 343, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Load the Kaggle Titanic training and test sets from the working directory.\n", | |
| "df_train = pd.read_csv('./train.csv')\n", | |
| "df_test = pd.read_csv('./test.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 344, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Survived</th>\n", | |
| " <th>Pclass</th>\n", | |
| " <th>Name</th>\n", | |
| " <th>Sex</th>\n", | |
| " <th>Age</th>\n", | |
| " <th>SibSp</th>\n", | |
| " <th>Parch</th>\n", | |
| " <th>Ticket</th>\n", | |
| " <th>Fare</th>\n", | |
| " <th>Cabin</th>\n", | |
| " <th>Embarked</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 3</td>\n", | |
| " <td> Braund, Mr. Owen Harris</td>\n", | |
| " <td> male</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> A/5 21171</td>\n", | |
| " <td> 7.2500</td>\n", | |
| " <td> NaN</td>\n", | |
| " <td> S</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", | |
| " <td> female</td>\n", | |
| " <td> 38</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> PC 17599</td>\n", | |
| " <td> 71.2833</td>\n", | |
| " <td> C85</td>\n", | |
| " <td> C</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Survived Pclass Name \\\n", | |
| "0 0 3 Braund, Mr. Owen Harris \n", | |
| "1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... \n", | |
| "\n", | |
| " Sex Age SibSp Parch Ticket Fare Cabin Embarked \n", | |
| "0 male 22 1 0 A/5 21171 7.2500 NaN S \n", | |
| "1 female 38 1 0 PC 17599 71.2833 C85 C " | |
| ] | |
| }, | |
| "execution_count": 344, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Remove PassengerId from the training frame in place, then show the first\n", | |
| "# two rows as the cell output.\n", | |
| "# NOTE(review): because the drop is in place, re-running this cell without\n", | |
| "# reloading df_train presumably fails (the column is already gone) - confirm.\n", | |
| "df_train.drop('PassengerId', axis=1, inplace=True)\n", | |
| "df_train.head(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 345, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def _extract_title(name):\n", | |
| " if name.find('Mr.') > 0:\n", | |
| " return 'Mr'\n", | |
| " elif name.find('Mrs.') > 0:\n", | |
| " return 'Mrs'\n", | |
| " elif name.find('Master.') > 0:\n", | |
| " return 'Master'\n", | |
| " elif name.find('Miss.') > 0:\n", | |
| " return 'Miss'\n", | |
| " else:\n", | |
| " return None\n", | |
| " \n", | |
| "def extract_title(df):\n", | |
| " df['Title'] = df.Name.apply(lambda n: _extract_title(n))\n", | |
| " title_bin = pd.get_dummies(df.Title)\n", | |
| " title_bin.rename(columns=lambda x: 'title' + \"_\" + str(x), inplace=True)\n", | |
| " df = df.join(title_bin)\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 346, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def fill_fare(df):\n", | |
| " df['Fare'].fillna(0, inplace=True)\n", | |
| " df['FareFill'] = df.Fare\n", | |
| " df.FareFill[(df.Fare == 0) & (df.Pclass == 1)] = 86\n", | |
| " df.FareFill[(df.Fare == 0) & (df.Pclass == 2)] = 21\n", | |
| " df.FareFill[(df.Fare == 0) & (df.Pclass == 3)] = 13\n", | |
| " df.FareFill = df.FareFill.apply(lambda f:np.log(f))\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 347, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def fill_age(df):\n", | |
| " df['AgeFill'] = df.Age\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 1)] = 40\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 2)] = 31\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 3)] = 26\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Title == 'Master')] = 3.5\n", | |
| " \n", | |
| " df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 1)] = 41.5\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 2)] = 32\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 3)] = 31\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 1)] = 30\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 2)] = 24\n", | |
| " df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 3)] = 18\n", | |
| " df.AgeFill[df.AgeFill.isnull() & (df.Sex == 'female')] = 30\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 348, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def extract_pclass(df):\n", | |
| " pclass_new = pd.get_dummies(df.Pclass)\n", | |
| " pclass_new.rename(columns=lambda x: 'pclass' + \"_\" + str(x), inplace=True)\n", | |
| " df = df.join(pclass_new)\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 349, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def extract_parch(df):\n", | |
| " dm = pd.get_dummies(df.Parch.apply(lambda p: min(p, 4)))\n", | |
| " dm.rename(columns=lambda x: 'parch' + \"_\" + str(x), inplace=True)\n", | |
| " df = df.join(dm)\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 350, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def extract_sibsp(df):\n", | |
| " dm = pd.get_dummies(df.SibSp.apply(lambda s: min(s, 4)))\n", | |
| " dm.rename(columns=lambda x: 'sibsp' + \"_\" + str(x), inplace=True)\n", | |
| " df = df.join(dm)\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 351, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def convert_sex(df):\n", | |
| " df['male'] = df.Sex.apply(lambda s: 0 if s == 'male' else 1)\n", | |
| " df['female'] = df.Sex.apply(lambda s: 1 if s == 'male' else 0)\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 352, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def extract_feature(df):\n", | |
| " df = extract_title(df)\n", | |
| " df = fill_age(df)\n", | |
| " df = extract_pclass(df)\n", | |
| " df = extract_sibsp(df)\n", | |
| " df = extract_parch(df)\n", | |
| " df = convert_sex(df)\n", | |
| " df = fill_fare(df)\n", | |
| " cols = df.columns\n", | |
| " drop_cols = set(cols).intersection(set(['PassengerId', 'Title', 'Name', 'SibSp', 'Ticket', 'Fare', 'Pclass', 'Survived', 'Parch', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked', 'CCabin']))\n", | |
| " return df.drop(drop_cols, axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 353, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_classifier():\n", | |
| " clf = LogisticRegression(C=100, penalty='l2', tol=0.01)\n", | |
| " #clf = RandomForestClassifier()\n", | |
| " #clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=2)\n", | |
| " return clf" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 354, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def calc_classifier(df, clf=None):\n", | |
| " X_train = extract_feature(df)\n", | |
| " y_train = df['Survived']\n", | |
| " X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=42)\n", | |
| " print('Num of Training Samples: {}'.format(len(X_train)))\n", | |
| " print('Num of Validation Samples: {}'.format(len(X_val)))\n", | |
| " \n", | |
| " if clf is None:\n", | |
| " clf = get_classifier()\n", | |
| " clf.fit(X_train, y_train)\n", | |
| " y_train_pred = clf.predict(X_train)\n", | |
| " y_val_pred = clf.predict(X_val)\n", | |
| " print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))\n", | |
| " print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))\n", | |
| " cm = confusion_matrix(y_val, y_val_pred)\n", | |
| " return clf" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 355, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def cross_val(X, y, K, random_state=0, clf=None, ):\n", | |
| " if clf is None:\n", | |
| " clf = get_classifier()\n", | |
| " cv = KFold(len(y), K, shuffle=True, random_state=random_state)\n", | |
| " scores = cross_val_score(clf, X, y, cv=cv)\n", | |
| " print('Scores:', scores)\n", | |
| " print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))\n", | |
| " return scores" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 356, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Engineered feature matrix and target vector for the whole training set,\n", | |
| "# used by the cross-validation comparison.\n", | |
| "X_train = extract_feature(df_train)\n", | |
| "y_train = df_train.Survived" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 389, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Logistic Regression\n", | |
| "('Scores:', array([ 0.78212291, 0.78089888, 0.79213483, 0.8258427 , 0.83707865]))\n", | |
| "Mean Score: 0.804 (+/-0.047)\n", | |
| "Linear Regression\n", | |
| "('Scores:', array([ 0.44844448, 0.38164783, 0.40170421, 0.36806075, 0.48549067]))\n", | |
| "Mean Score: 0.417 (+/-0.087)\n", | |
| "Random Forest\n", | |
| "('Scores:', array([ 0.79888268, 0.79213483, 0.82022472, 0.8258427 , 0.78089888]))\n", | |
| "Mean Score: 0.804 (+/-0.034)\n", | |
| "SVN (L1 regression)\n", | |
| "('Scores:', array([ 0.81005587, 0.82022472, 0.8258427 , 0.81460674, 0.84269663]))\n", | |
| "Mean Score: 0.823 (+/-0.023)\n", | |
| "SVN (L2 regression and L1 loss)\n", | |
| "('Scores:', array([ 0.82122905, 0.82022472, 0.8258427 , 0.8258427 , 0.84269663]))\n", | |
| "Mean Score: 0.827 (+/-0.016)\n", | |
| "SVN (L2)\n", | |
| "('Scores:', array([ 0.82122905, 0.75842697, 0.82022472, 0.85393258, 0.84269663]))\n", | |
| "Mean Score: 0.819 (+/-0.066)\n", | |
| "SVN\n", | |
| "('Scores:', array([ 0.79888268, 0.71910112, 0.75280899, 0.82022472, 0.83707865]))\n", | |
| "Mean Score: 0.786 (+/-0.087)\n", | |
| "Decision Tree\n", | |
| "('Scores:', array([ 0.7877095 , 0.80898876, 0.80898876, 0.80337079, 0.8258427 ]))\n", | |
| "Mean Score: 0.807 (+/-0.024)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([ 0.7877095 , 0.80898876, 0.80898876, 0.80337079, 0.8258427 ])" | |
| ] | |
| }, | |
| "execution_count": 389, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "print \"Logistic Regression\"\n", | |
| "cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.1, penalty='l2', tol=0.01))\n", | |
| "print \"Linear Regression\"\n", | |
| "cross_val(X_train, y_train, 5, clf=LinearRegression())\n", | |
| "print \"Random Forest\"\n", | |
| "cross_val(X_train, y_train, 5, clf=RandomForestClassifier())\n", | |
| "print \"SVN (L1 regression)\"\n", | |
| "cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l1', dual=False))\n", | |
| "print \"SVN (L2 regression and L1 loss)\"\n", | |
| "cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l2', loss='l1'))\n", | |
| "print \"SVN (L2)\"\n", | |
| "cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l2'))\n", | |
| "print \"SVN\"\n", | |
| "cross_val(X_train, y_train, 5, clf=LinearSVC())\n", | |
| "print \"Decision Tree\"\n", | |
| "cross_val(X_train, y_train, 5, clf=DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=2))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 384, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Num of Training Samples: 712\n", | |
| "Num of Validation Samples: 179\n", | |
| "Accuracy on Training Set: 0.833\n", | |
| "Accuracy on Validation Set: 0.821\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Fit the L1-penalised LinearSVC on the hold-out split; this fitted `clf` is\n", | |
| "# the model used for the test-set predictions further down.\n", | |
| "clf = calc_classifier(df_train, clf=LinearSVC(penalty='l1', dual=False))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 385, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title_Master</th>\n", | |
| " <th>title_Miss</th>\n", | |
| " <th>title_Mr</th>\n", | |
| " <th>title_Mrs</th>\n", | |
| " <th>AgeFill</th>\n", | |
| " <th>pclass_1</th>\n", | |
| " <th>pclass_2</th>\n", | |
| " <th>pclass_3</th>\n", | |
| " <th>sibsp_0</th>\n", | |
| " <th>sibsp_1</th>\n", | |
| " <th>...</th>\n", | |
| " <th>sibsp_3</th>\n", | |
| " <th>sibsp_4</th>\n", | |
| " <th>parch_0</th>\n", | |
| " <th>parch_1</th>\n", | |
| " <th>parch_2</th>\n", | |
| " <th>parch_3</th>\n", | |
| " <th>parch_4</th>\n", | |
| " <th>male</th>\n", | |
| " <th>female</th>\n", | |
| " <th>FareFill</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1.981001</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 38</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 4.266662</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 26</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 2.070022</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 35</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 3.972177</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 35</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 2.085672</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5 rows × 21 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " title_Master title_Miss title_Mr title_Mrs AgeFill pclass_1 pclass_2 \\\n", | |
| "0 0 0 1 0 22 0 0 \n", | |
| "1 0 0 0 1 38 1 0 \n", | |
| "2 0 1 0 0 26 0 0 \n", | |
| "3 0 0 0 1 35 1 0 \n", | |
| "4 0 0 1 0 35 0 0 \n", | |
| "\n", | |
| " pclass_3 sibsp_0 sibsp_1 ... sibsp_3 sibsp_4 parch_0 parch_1 \\\n", | |
| "0 1 0 1 ... 0 0 1 0 \n", | |
| "1 0 0 1 ... 0 0 1 0 \n", | |
| "2 1 1 0 ... 0 0 1 0 \n", | |
| "3 0 0 1 ... 0 0 1 0 \n", | |
| "4 1 1 0 ... 0 0 1 0 \n", | |
| "\n", | |
| " parch_2 parch_3 parch_4 male female FareFill \n", | |
| "0 0 0 0 0 1 1.981001 \n", | |
| "1 0 0 0 1 0 4.266662 \n", | |
| "2 0 0 0 1 0 2.070022 \n", | |
| "3 0 0 0 1 0 3.972177 \n", | |
| "4 0 0 0 0 1 2.085672 \n", | |
| "\n", | |
| "[5 rows x 21 columns]" | |
| ] | |
| }, | |
| "execution_count": 385, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Preview the engineered training features.\n", | |
| "X_train.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 386, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Despite its name, Y holds the engineered *features* of the test set.\n", | |
| "Y = extract_feature(df_test)\n", | |
| "# Store the predictions on the test frame and keep only the two columns that\n", | |
| "# are written to the submission file.\n", | |
| "df_test['Survived'] = clf.predict(Y)\n", | |
| "submit_data = df_test[['PassengerId', 'Survived']]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 387, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title_Master</th>\n", | |
| " <th>title_Miss</th>\n", | |
| " <th>title_Mr</th>\n", | |
| " <th>title_Mrs</th>\n", | |
| " <th>AgeFill</th>\n", | |
| " <th>pclass_1</th>\n", | |
| " <th>pclass_2</th>\n", | |
| " <th>pclass_3</th>\n", | |
| " <th>sibsp_0</th>\n", | |
| " <th>sibsp_1</th>\n", | |
| " <th>...</th>\n", | |
| " <th>sibsp_3</th>\n", | |
| " <th>sibsp_4</th>\n", | |
| " <th>parch_0</th>\n", | |
| " <th>parch_1</th>\n", | |
| " <th>parch_2</th>\n", | |
| " <th>parch_3</th>\n", | |
| " <th>parch_4</th>\n", | |
| " <th>male</th>\n", | |
| " <th>female</th>\n", | |
| " <th>FareFill</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 34.5</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 2.057860</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 47.0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1.945910</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 62.0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 2.270836</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 27.0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 2.159003</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 22.0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td>...</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 1</td>\n", | |
| " <td> 0</td>\n", | |
| " <td> 2.508582</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5 rows × 21 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " title_Master title_Miss title_Mr title_Mrs AgeFill pclass_1 pclass_2 \\\n", | |
| "0 0 0 1 0 34.5 0 0 \n", | |
| "1 0 0 0 1 47.0 0 0 \n", | |
| "2 0 0 1 0 62.0 0 1 \n", | |
| "3 0 0 1 0 27.0 0 0 \n", | |
| "4 0 0 0 1 22.0 0 0 \n", | |
| "\n", | |
| " pclass_3 sibsp_0 sibsp_1 ... sibsp_3 sibsp_4 parch_0 parch_1 \\\n", | |
| "0 1 1 0 ... 0 0 1 0 \n", | |
| "1 1 0 1 ... 0 0 1 0 \n", | |
| "2 0 1 0 ... 0 0 1 0 \n", | |
| "3 1 1 0 ... 0 0 1 0 \n", | |
| "4 1 0 1 ... 0 0 0 1 \n", | |
| "\n", | |
| " parch_2 parch_3 parch_4 male female FareFill \n", | |
| "0 0 0 0 0 1 2.057860 \n", | |
| "1 0 0 0 1 0 1.945910 \n", | |
| "2 0 0 0 0 1 2.270836 \n", | |
| "3 0 0 0 0 1 2.159003 \n", | |
| "4 0 0 0 1 0 2.508582 \n", | |
| "\n", | |
| "[5 rows x 21 columns]" | |
| ] | |
| }, | |
| "execution_count": 387, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Preview the engineered test-set features.\n", | |
| "Y.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 388, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Write the PassengerId / Survived pairs as the submission CSV (no index column).\n", | |
| "submit_data.to_csv('./submit_20150312_3.csv', index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment