MikeDacre · May 25, 2017 16:34
diff --git a/Filter by multiple columns.ipynb b/Filter by multiple columns.ipynb
 {
  "cells": [
    {
      "metadata": {
        "run_control": {
          "read_only": false,
          "frozen": false
        }
      },
      "cell_type": "markdown",
      "source": "This function allows you to filter any DataFrame to make it unique on multiple columns. It will use a boolean column to filter, and then will sort the results by some other column and return the lowest hit.\n\nThe use case for this is a DataFrame of genetic data with multiple sources of overlapping information, this will allow you to collapse the DataFrame into a single piece of information per datapoint with the lowest p-value being the representative data source.\n\nThe function alone is here: https://gist.github.com/MikeDacre/bce7c7a4fd0d9457db47a3c199c2a45a"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-05-25T09:33:42.670388",
          "end_time": "2017-05-25T09:33:42.785107"
        },
        "collapsed": true,
        "run_control": {
          "frozen": false,
          "read_only": false
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "import pandas as pd",
      "execution_count": 1,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-05-25T09:33:42.858337",
          "end_time": "2017-05-25T09:33:42.978498"
        },
        "collapsed": false,
        "run_control": {
          "frozen": false,
          "read_only": false
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "def run_filter(df, group_cols: list, merge_col: str, sort_col: str, cutoff: float = 0.6):\n    \"\"\"Make a DataFrame unique on group_cols based on boolean merge_col and float sort_col.\n    \n    Picks most frequent result of boolean merge_col if the result represents a portion of the\n    total greater than cutoff. Then sorts the results ascending by sort_col and returns the\n    first row only.\n    \n    If filter fails (i.e. top hit is less than cutoff of total) then the entire group is dropped.\n    \n    Parameters\n    ----------\n    df: pandas.core.frame.DataFrame\n    group_cols: list_of_str\n        A list of the column names to group by\n    merge_col: str\n        The name of the boolean column to filter\n    sort_col: str\n        The name of the column to sort by when picking top hit\n    \n    Returns\n    -------\n    df: pandas.core.frame.DataFrame\n        DataFrame with the same columns as the starting df, but with rows unique on group_cols.\n    \"\"\"\n    \n    def filter_group(d):\n        \"\"\"Use with groupby().apply(), filter DataFrame by boolean column and make unique.\n\n        Will sort ascending by sort_col and return the first result.\n        \"\"\"\n        match_info = d[merge_col].value_counts().to_dict()\n        # Pick best match result if it represents more than 60%\n        if len(match_info) == 1:\n            match_result = list(match_info.keys())[0]\n        else:\n            if (max(match_info[True], match_info[False])/\n                (match_info[True]+match_info[False])) < cutoff:\n                match_result = None\n            else:\n                if match_info[True] > match_info[False]:\n                    match_result = True\n                else:\n                    match_result = False\n        if match_result is None:\n            # Drop all somehow, e.g.\n            return pd.DataFrame()\n        new_d = d[d[merge_col] == match_result]\n        new_d.sort_values(sort_col, ascending=True, inplace=True)\n        # Return the one row with the lowest p as a dataframe\n        return new_d.iloc[0:1]\n\n    cols = df.columns\n    res = df.groupby(group_cols).apply(filter_group)\n    return res.reset_index(level=0, drop=True).reset_index(level=0, drop=True)[cols]",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-05-25T09:33:43.129528",
          "end_time": "2017-05-25T09:33:43.216959"
        },
        "collapsed": true,
        "run_control": {
          "frozen": false,
          "read_only": false
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.DataFrame(\n    [['n1', 'Height', 'b1', 1.23e-6, True],\n     ['n1', 'Height', 'b2', 3e-2, False],\n     ['n1', 'Height', 'b3', 2.16e-2, True],\n     ['n2', 'BMI', 'b2', 3.2e-2, True],\n     ['n2', 'BMI', 'b6', 1.2e-3, True],\n     ['n3', 'Height', 'b4', 0.126, False],\n     ['n3', 'Height', 'b6', 0.026, False],\n     ['n3', 'Height', 'b7', 1.43e-3, False],\n     ['n3', 'Height', 'b8', 2.41e-2, True],\n     ['n3', 'Height', 'b9', 0.987, False],\n     ['n3', 'BMI', 'b1', 1.2e-6, True],\n     ['n3', 'BMI', 'b2', 1.43e-5, True],\n     ['n3', 'BMI', 'b3', 1.43e-4, True],\n     ['n3', 'BMI', 'b6', 0.00216, False],\n     ['n4', 'BMI', 'b3', 1.43e-4, True],\n     ['n4', 'BMI', 'b6', 0.00216, False],\n    ],\n    columns=['name', 'trait', 'proxy', 'p', 'match']\n)\ndf['t'] = 'hi'",
      "execution_count": 3,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-05-25T09:34:14.724027",
          "end_time": "2017-05-25T09:34:14.944459"
        },
        "collapsed": false,
        "run_control": {
          "frozen": false,
          "read_only": false
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "run_filter(df, ['name', 'trait'], 'match', 'p')",
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": "   name   trait proxy         p  match   t\n0    n1  Height    b1  0.000001   True  hi\n4    n2     BMI    b6  0.001200   True  hi\n10   n3     BMI    b1  0.000001   True  hi\n7    n3  Height    b7  0.001430  False  hi",
            "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>name</th>\n      <th>trait</th>\n      <th>proxy</th>\n      <th>p</th>\n      <th>match</th>\n      <th>t</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>n1</td>\n      <td>Height</td>\n      <td>b1</td>\n      <td>0.000001</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>n2</td>\n      <td>BMI</td>\n      <td>b6</td>\n      <td>0.001200</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>n3</td>\n      <td>BMI</td>\n      <td>b1</td>\n      <td>0.000001</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>n3</td>\n      <td>Height</td>\n      <td>b7</td>\n      <td>0.001430</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {},
          "execution_count": 5
        }
      ]
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-05-25T09:34:17.672520",
          "end_time": "2017-05-25T09:34:17.770001"
        },
        "collapsed": false,
        "run_control": {
          "frozen": false,
          "read_only": false
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": "   name   trait proxy         p  match   t\n0    n1  Height    b1  0.000001   True  hi\n1    n1  Height    b2  0.030000  False  hi\n2    n1  Height    b3  0.021600   True  hi\n3    n2     BMI    b2  0.032000   True  hi\n4    n2     BMI    b6  0.001200   True  hi\n5    n3  Height    b4  0.126000  False  hi\n6    n3  Height    b6  0.026000  False  hi\n7    n3  Height    b7  0.001430  False  hi\n8    n3  Height    b8  0.024100   True  hi\n9    n3  Height    b9  0.987000  False  hi\n10   n3     BMI    b1  0.000001   True  hi\n11   n3     BMI    b2  0.000014   True  hi\n12   n3     BMI    b3  0.000143   True  hi\n13   n3     BMI    b6  0.002160  False  hi\n14   n4     BMI    b3  0.000143   True  hi\n15   n4     BMI    b6  0.002160  False  hi",
            "text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>name</th>\n      <th>trait</th>\n      <th>proxy</th>\n      <th>p</th>\n      <th>match</th>\n      <th>t</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>n1</td>\n      <td>Height</td>\n      <td>b1</td>\n      <td>0.000001</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>n1</td>\n      <td>Height</td>\n      <td>b2</td>\n      <td>0.030000</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>n1</td>\n      <td>Height</td>\n      <td>b3</td>\n      <td>0.021600</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>n2</td>\n      <td>BMI</td>\n      <td>b2</td>\n      <td>0.032000</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>n2</td>\n      <td>BMI</td>\n      <td>b6</td>\n      <td>0.001200</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>n3</td>\n      <td>Height</td>\n      <td>b4</td>\n      <td>0.126000</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>n3</td>\n      <td>Height</td>\n      <td>b6</td>\n      <td>0.026000</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>n3</td>\n      <td>Height</td>\n      <td>b7</td>\n      <td>0.001430</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>n3</td>\n      <td>Height</td>\n      <td>b8</td>\n      <td>0.024100</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>n3</td>\n      <td>Height</td>\n      <td>b9</td>\n      <td>0.987000</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>n3</td>\n      <td>BMI</td>\n      <td>b1</td>\n      <td>0.000001</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>n3</td>\n      <td>BMI</td>\n      <td>b2</td>\n      <td>0.000014</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>n3</td>\n      <td>BMI</td>\n      <td>b3</td>\n      <td>0.000143</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>13</th>\n      <td>n3</td>\n      <td>BMI</td>\n      <td>b6</td>\n      <td>0.002160</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>14</th>\n      <td>n4</td>\n      <td>BMI</td>\n      <td>b3</td>\n      <td>0.000143</td>\n      <td>True</td>\n      <td>hi</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>n4</td>\n      <td>BMI</td>\n      <td>b6</td>\n      <td>0.002160</td>\n      <td>False</td>\n      <td>hi</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {},
          "execution_count": 6
        }
      ]
    }
  ],
  "metadata": {
    "_draft": {
      "nbviewer_url": "https://gist.github.com/17526340291d56d66210c9a6315f29bb"
    },
    "gist": {
      "id": "17526340291d56d66210c9a6315f29bb",
      "data": {
        "description": "Make a pandas DataFrame unique on multiple columns while preserving other columns and filtering",
        "public": true
      }
    },
    "hide_input": false,
    "kernelspec": {
      "name": "anaconda3",
      "display_name": "anaconda",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.0",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "toc": {
      "threshold": 4,
      "number_sections": true,
      "toc_cell": false,
      "toc_window_display": false,
      "toc_section_display": "block",
      "sideBar": true,
      "navigate_menu": true,
      "moveMenuLeft": true,
      "colors": {
        "hover_highlight": "#DAA520",
        "selected_highlight": "#FFD700",
        "running_highlight": "#FF0000"
      },
      "nav_menu": {
        "height": "12px",
        "width": "252px"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"metadata": {
	"run_control": {
	"read_only": false,
	"frozen": false
	}
	},
	"cell_type": "markdown",
	"source": "This function allows you to filter any DataFrame to make it unique on multiple columns. It will use a boolean column to filter, and then will sort the results by some other column and return the lowest hit.\n\nThe use case for this is a DataFrame of genetic data with multiple sources of overlapping information, this will allow you to collapse the DataFrame into a single piece of information per datapoint with the lowest p-value being the representative data source.\n\nThe function alone is here: https://gist.github.com/MikeDacre/bce7c7a4fd0d9457db47a3c199c2a45a"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-05-25T09:33:42.670388",
	"end_time": "2017-05-25T09:33:42.785107"
	},
	"collapsed": true,
	"run_control": {
	"frozen": false,
	"read_only": false
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "import pandas as pd",
	"execution_count": 1,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-05-25T09:33:42.858337",
	"end_time": "2017-05-25T09:33:42.978498"
	},
	"collapsed": false,
	"run_control": {
	"frozen": false,
	"read_only": false
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "def run_filter(df, group_cols: list, merge_col: str, sort_col: str, cutoff: float = 0.6):\n \"\"\"Make a DataFrame unique on group_cols based on boolean merge_col and float sort_col.\n \n Picks most frequent result of boolean merge_col if the result represents a portion of the\n total greater than cutoff. Then sorts the results ascending by sort_col and returns the\n first row only.\n \n If filter fails (i.e. top hit is less than cutoff of total) then the entire group is dropped.\n \n Parameters\n ----------\n df: pandas.core.frame.DataFrame\n group_cols: list_of_str\n A list of the column names to group by\n merge_col: str\n The name of the boolean column to filter\n sort_col: str\n The name of the column to sort by when picking top hit\n \n Returns\n -------\n df: pandas.core.frame.DataFrame\n DataFrame with the same columns as the starting df, but with rows unique on group_cols.\n \"\"\"\n \n def filter_group(d):\n \"\"\"Use with groupby().apply(), filter DataFrame by boolean column and make unique.\n\n Will sort ascending by sort_col and return the first result.\n \"\"\"\n match_info = d[merge_col].value_counts().to_dict()\n # Pick best match result if it represents more than 60%\n if len(match_info) == 1:\n match_result = list(match_info.keys())[0]\n else:\n if (max(match_info[True], match_info[False])/\n (match_info[True]+match_info[False])) < cutoff:\n match_result = None\n else:\n if match_info[True] > match_info[False]:\n match_result = True\n else:\n match_result = False\n if match_result is None:\n # Drop all somehow, e.g.\n return pd.DataFrame()\n new_d = d[d[merge_col] == match_result]\n new_d.sort_values(sort_col, ascending=True, inplace=True)\n # Return the one row with the lowest p as a dataframe\n return new_d.iloc[0:1]\n\n cols = df.columns\n res = df.groupby(group_cols).apply(filter_group)\n return res.reset_index(level=0, drop=True).reset_index(level=0, drop=True)[cols]",
	"execution_count": 2,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-05-25T09:33:43.129528",
	"end_time": "2017-05-25T09:33:43.216959"
	},
	"collapsed": true,
	"run_control": {
	"frozen": false,
	"read_only": false
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "df = pd.DataFrame(\n [['n1', 'Height', 'b1', 1.23e-6, True],\n ['n1', 'Height', 'b2', 3e-2, False],\n ['n1', 'Height', 'b3', 2.16e-2, True],\n ['n2', 'BMI', 'b2', 3.2e-2, True],\n ['n2', 'BMI', 'b6', 1.2e-3, True],\n ['n3', 'Height', 'b4', 0.126, False],\n ['n3', 'Height', 'b6', 0.026, False],\n ['n3', 'Height', 'b7', 1.43e-3, False],\n ['n3', 'Height', 'b8', 2.41e-2, True],\n ['n3', 'Height', 'b9', 0.987, False],\n ['n3', 'BMI', 'b1', 1.2e-6, True],\n ['n3', 'BMI', 'b2', 1.43e-5, True],\n ['n3', 'BMI', 'b3', 1.43e-4, True],\n ['n3', 'BMI', 'b6', 0.00216, False],\n ['n4', 'BMI', 'b3', 1.43e-4, True],\n ['n4', 'BMI', 'b6', 0.00216, False],\n ],\n columns=['name', 'trait', 'proxy', 'p', 'match']\n)\ndf['t'] = 'hi'",
	"execution_count": 3,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-05-25T09:34:14.724027",
	"end_time": "2017-05-25T09:34:14.944459"
	},
	"collapsed": false,
	"run_control": {
	"frozen": false,
	"read_only": false
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "run_filter(df, ['name', 'trait'], 'match', 'p')",
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": " name trait proxy p match t\n0 n1 Height b1 0.000001 True hi\n4 n2 BMI b6 0.001200 True hi\n10 n3 BMI b1 0.000001 True hi\n7 n3 Height b7 0.001430 False hi",
	"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>name</th>\n <th>trait</th>\n <th>proxy</th>\n <th>p</th>\n <th>match</th>\n <th>t</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>n1</td>\n <td>Height</td>\n <td>b1</td>\n <td>0.000001</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>4</th>\n <td>n2</td>\n <td>BMI</td>\n <td>b6</td>\n <td>0.001200</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>10</th>\n <td>n3</td>\n <td>BMI</td>\n <td>b1</td>\n <td>0.000001</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>7</th>\n <td>n3</td>\n <td>Height</td>\n <td>b7</td>\n <td>0.001430</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n </tbody>\n</table>\n</div>"
	},
	"metadata": {},
	"execution_count": 5
	}
	]
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-05-25T09:34:17.672520",
	"end_time": "2017-05-25T09:34:17.770001"
	},
	"collapsed": false,
	"run_control": {
	"frozen": false,
	"read_only": false
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "df",
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": " name trait proxy p match t\n0 n1 Height b1 0.000001 True hi\n1 n1 Height b2 0.030000 False hi\n2 n1 Height b3 0.021600 True hi\n3 n2 BMI b2 0.032000 True hi\n4 n2 BMI b6 0.001200 True hi\n5 n3 Height b4 0.126000 False hi\n6 n3 Height b6 0.026000 False hi\n7 n3 Height b7 0.001430 False hi\n8 n3 Height b8 0.024100 True hi\n9 n3 Height b9 0.987000 False hi\n10 n3 BMI b1 0.000001 True hi\n11 n3 BMI b2 0.000014 True hi\n12 n3 BMI b3 0.000143 True hi\n13 n3 BMI b6 0.002160 False hi\n14 n4 BMI b3 0.000143 True hi\n15 n4 BMI b6 0.002160 False hi",
	"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>name</th>\n <th>trait</th>\n <th>proxy</th>\n <th>p</th>\n <th>match</th>\n <th>t</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>n1</td>\n <td>Height</td>\n <td>b1</td>\n <td>0.000001</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>1</th>\n <td>n1</td>\n <td>Height</td>\n <td>b2</td>\n <td>0.030000</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>2</th>\n <td>n1</td>\n <td>Height</td>\n <td>b3</td>\n <td>0.021600</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>3</th>\n <td>n2</td>\n <td>BMI</td>\n <td>b2</td>\n <td>0.032000</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>4</th>\n <td>n2</td>\n <td>BMI</td>\n <td>b6</td>\n <td>0.001200</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>5</th>\n <td>n3</td>\n <td>Height</td>\n <td>b4</td>\n <td>0.126000</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>6</th>\n <td>n3</td>\n <td>Height</td>\n <td>b6</td>\n <td>0.026000</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>7</th>\n <td>n3</td>\n <td>Height</td>\n <td>b7</td>\n <td>0.001430</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>8</th>\n <td>n3</td>\n <td>Height</td>\n <td>b8</td>\n <td>0.024100</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>9</th>\n <td>n3</td>\n <td>Height</td>\n <td>b9</td>\n <td>0.987000</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>10</th>\n <td>n3</td>\n <td>BMI</td>\n <td>b1</td>\n <td>0.000001</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>11</th>\n <td>n3</td>\n <td>BMI</td>\n <td>b2</td>\n <td>0.000014</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>12</th>\n <td>n3</td>\n <td>BMI</td>\n <td>b3</td>\n <td>0.000143</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>13</th>\n <td>n3</td>\n <td>BMI</td>\n <td>b6</td>\n <td>0.002160</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>14</th>\n <td>n4</td>\n <td>BMI</td>\n <td>b3</td>\n <td>0.000143</td>\n <td>True</td>\n <td>hi</td>\n </tr>\n <tr>\n <th>15</th>\n <td>n4</td>\n <td>BMI</td>\n <td>b6</td>\n <td>0.002160</td>\n <td>False</td>\n <td>hi</td>\n </tr>\n </tbody>\n</table>\n</div>"
	},
	"metadata": {},
	"execution_count": 6
	}
	]
	}
	],
	"metadata": {
	"_draft": {
	"nbviewer_url": "https://gist.github.com/17526340291d56d66210c9a6315f29bb"
	},
	"gist": {
	"id": "17526340291d56d66210c9a6315f29bb",
	"data": {
	"description": "Make a pandas DataFrame unique on multiple columns while preserving other columns and filtering",
	"public": true
	}
	},
	"hide_input": false,
	"kernelspec": {
	"name": "anaconda3",
	"display_name": "anaconda",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.6.0",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"toc": {
	"threshold": 4,
	"number_sections": true,
	"toc_cell": false,
	"toc_window_display": false,
	"toc_section_display": "block",
	"sideBar": true,
	"navigate_menu": true,
	"moveMenuLeft": true,
	"colors": {
	"hover_highlight": "#DAA520",
	"selected_highlight": "#FFD700",
	"running_highlight": "#FF0000"
	},
	"nav_menu": {
	"height": "12px",
	"width": "252px"
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}