Skip to content

Instantly share code, notes, and snippets.

@dansondergaard
Created November 16, 2017 12:33

Revisions

  1. dansondergaard created this gist Nov 16, 2017.
    347 changes: 347 additions & 0 deletions Pandas DataFrame Views.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,347 @@
    {
    "cells": [
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "# Pandas DataFrame Views"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": [
    "import numpy as np\n",
    "import pandas as pd"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "text/html": [
    "<div>\n",
    "<style>\n",
    " .dataframe thead tr:only-child th {\n",
    " text-align: right;\n",
    " }\n",
    "\n",
    " .dataframe thead th {\n",
    " text-align: left;\n",
    " }\n",
    "\n",
    " .dataframe tbody tr th {\n",
    " vertical-align: top;\n",
    " }\n",
    "</style>\n",
    "<table border=\"1\" class=\"dataframe\">\n",
    " <thead>\n",
    " <tr style=\"text-align: right;\">\n",
    " <th></th>\n",
    " <th>A</th>\n",
    " <th>B</th>\n",
    " </tr>\n",
    " </thead>\n",
    " <tbody>\n",
    " <tr>\n",
    " <th>0</th>\n",
    " <td>0.557771</td>\n",
    " <td>0.260547</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>1</th>\n",
    " <td>0.016835</td>\n",
    " <td>0.656023</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>2</th>\n",
    " <td>0.564032</td>\n",
    " <td>0.094023</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>3</th>\n",
    " <td>0.163993</td>\n",
    " <td>0.473556</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>4</th>\n",
    " <td>0.026951</td>\n",
    " <td>0.899029</td>\n",
    " </tr>\n",
    " </tbody>\n",
    "</table>\n",
    "</div>"
    ],
    "text/plain": [
    " A B\n",
    "0 0.557771 0.260547\n",
    "1 0.016835 0.656023\n",
    "2 0.564032 0.094023\n",
    "3 0.163993 0.473556\n",
    "4 0.026951 0.899029"
    ]
    },
    "execution_count": 2,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "df = pd.DataFrame(np.random.rand(100, 2), columns=['A', 'B'])\n",
    "df.head()"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "This doesn't work because the first line returns a view of the DataFrame, not a copy, and the behavior of assignment to views is undefined."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 32,
    "metadata": {},
    "outputs": [
    {
    "name": "stderr",
    "output_type": "stream",
    "text": [
    "/Users/das/Anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
    "A value is trying to be set on a copy of a slice from a DataFrame.\n",
    "Try using .loc[row_indexer,col_indexer] = value instead\n",
    "\n",
    "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
    " \n"
    ]
    }
    ],
    "source": [
    "df_new = df[df.A < 0.5]\n",
    "df_new['C'] = df_new['B'] + 1"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "If we explicitly specify the columns and use `.loc[]` we get a copy which we can assign to:"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 31,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "text/html": [
    "<div>\n",
    "<style>\n",
    " .dataframe thead tr:only-child th {\n",
    " text-align: right;\n",
    " }\n",
    "\n",
    " .dataframe thead th {\n",
    " text-align: left;\n",
    " }\n",
    "\n",
    " .dataframe tbody tr th {\n",
    " vertical-align: top;\n",
    " }\n",
    "</style>\n",
    "<table border=\"1\" class=\"dataframe\">\n",
    " <thead>\n",
    " <tr style=\"text-align: right;\">\n",
    " <th></th>\n",
    " <th>A</th>\n",
    " <th>B</th>\n",
    " <th>C</th>\n",
    " </tr>\n",
    " </thead>\n",
    " <tbody>\n",
    " <tr>\n",
    " <th>1</th>\n",
    " <td>0.016835</td>\n",
    " <td>0.656023</td>\n",
    " <td>1.656023</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>3</th>\n",
    " <td>0.163993</td>\n",
    " <td>0.473556</td>\n",
    " <td>1.473556</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>4</th>\n",
    " <td>0.026951</td>\n",
    " <td>0.899029</td>\n",
    " <td>1.899029</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>5</th>\n",
    " <td>0.145798</td>\n",
    " <td>0.533323</td>\n",
    " <td>1.533323</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>6</th>\n",
    " <td>0.448204</td>\n",
    " <td>0.139057</td>\n",
    " <td>1.139057</td>\n",
    " </tr>\n",
    " </tbody>\n",
    "</table>\n",
    "</div>"
    ],
    "text/plain": [
    " A B C\n",
    "1 0.016835 0.656023 1.656023\n",
    "3 0.163993 0.473556 1.473556\n",
    "4 0.026951 0.899029 1.899029\n",
    "5 0.145798 0.533323 1.533323\n",
    "6 0.448204 0.139057 1.139057"
    ]
    },
    "execution_count": 31,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "df_new = df.loc[df.A < 0.5, ['A', 'B']]\n",
    "df_new['C'] = df_new['B'] + 1\n",
    "df_new.head()"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Alternatively one can use the `assign()` method to assign a new column to the DataFrame. This is also shorter:"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 28,
    "metadata": {},
    "outputs": [
    {
    "data": {
    "text/html": [
    "<div>\n",
    "<style>\n",
    " .dataframe thead tr:only-child th {\n",
    " text-align: right;\n",
    " }\n",
    "\n",
    " .dataframe thead th {\n",
    " text-align: left;\n",
    " }\n",
    "\n",
    " .dataframe tbody tr th {\n",
    " vertical-align: top;\n",
    " }\n",
    "</style>\n",
    "<table border=\"1\" class=\"dataframe\">\n",
    " <thead>\n",
    " <tr style=\"text-align: right;\">\n",
    " <th></th>\n",
    " <th>A</th>\n",
    " <th>B</th>\n",
    " <th>C</th>\n",
    " </tr>\n",
    " </thead>\n",
    " <tbody>\n",
    " <tr>\n",
    " <th>1</th>\n",
    " <td>0.016835</td>\n",
    " <td>0.656023</td>\n",
    " <td>1.656023</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>3</th>\n",
    " <td>0.163993</td>\n",
    " <td>0.473556</td>\n",
    " <td>1.473556</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>4</th>\n",
    " <td>0.026951</td>\n",
    " <td>0.899029</td>\n",
    " <td>1.899029</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>5</th>\n",
    " <td>0.145798</td>\n",
    " <td>0.533323</td>\n",
    " <td>1.533323</td>\n",
    " </tr>\n",
    " <tr>\n",
    " <th>6</th>\n",
    " <td>0.448204</td>\n",
    " <td>0.139057</td>\n",
    " <td>1.139057</td>\n",
    " </tr>\n",
    " </tbody>\n",
    "</table>\n",
    "</div>"
    ],
    "text/plain": [
    " A B C\n",
    "1 0.016835 0.656023 1.656023\n",
    "3 0.163993 0.473556 1.473556\n",
    "4 0.026951 0.899029 1.899029\n",
    "5 0.145798 0.533323 1.533323\n",
    "6 0.448204 0.139057 1.139057"
    ]
    },
    "execution_count": 28,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "df_new = df[df.A < 0.5].assign(C=lambda x: x.B + 1)\n",
    "df_new.head()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.6.3"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 2
    }