Forked from hrbrmstr/Customer Segmentation in Python.ipynb
Created August 21, 2016 21:23
For RPubs post
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>offer_id</th>\n",
"      <th>campaign</th>\n",
"      <th>varietal</th>\n",
"      <th>min_qty</th>\n",
"      <th>discount</th>\n",
"      <th>origin</th>\n",
"      <th>past_peak</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1</td>\n",
"      <td>January</td>\n",
"      <td>Malbec</td>\n",
"      <td>72</td>\n",
"      <td>56</td>\n",
"      <td>France</td>\n",
"      <td>False</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>2</td>\n",
"      <td>January</td>\n",
"      <td>Pinot Noir</td>\n",
"      <td>72</td>\n",
"      <td>17</td>\n",
"      <td>France</td>\n",
"      <td>False</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>3</td>\n",
"      <td>February</td>\n",
"      <td>Espumante</td>\n",
"      <td>144</td>\n",
"      <td>32</td>\n",
"      <td>Oregon</td>\n",
"      <td>True</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4</td>\n",
"      <td>February</td>\n",
"      <td>Champagne</td>\n",
"      <td>72</td>\n",
"      <td>48</td>\n",
"      <td>France</td>\n",
"      <td>True</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>5</td>\n",
"      <td>February</td>\n",
"      <td>Cabernet Sauvignon</td>\n",
"      <td>144</td>\n",
"      <td>44</td>\n",
"      <td>New Zealand</td>\n",
"      <td>True</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"   offer_id  campaign            varietal  min_qty  discount       origin  \\\n",
"0         1   January              Malbec       72        56       France   \n",
"1         2   January          Pinot Noir       72        17       France   \n",
"2         3  February           Espumante      144        32       Oregon   \n",
"3         4  February           Champagne       72        48       France   \n",
"4         5  February  Cabernet Sauvignon      144        44  New Zealand   \n",
"\n",
"  past_peak  \n",
"0     False  \n",
"1     False  \n",
"2      True  \n",
"3      True  \n",
"4      True  "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import wget\n",
"import os\n",
"\n",
"if (not os.path.isfile(\"WineKMC.xlsx\")):\n",
"    wget.download(\"http://blog.yhathq.com/static/misc/data/WineKMC.xlsx\")\n",
"\n",
"df_offers = pd.read_excel(\"WineKMC.xlsx\", sheetname=0)\n",
"df_offers.columns = [\"offer_id\", \"campaign\", \"varietal\", \"min_qty\", \"discount\", \"origin\", \"past_peak\"]\n",
"df_offers.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>customer_name</th>\n",
"      <th>offer_id</th>\n",
"      <th>n</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>Smith</td>\n",
"      <td>2</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>Smith</td>\n",
"      <td>24</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>Johnson</td>\n",
"      <td>17</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>Johnson</td>\n",
"      <td>24</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>Johnson</td>\n",
"      <td>26</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"  customer_name  offer_id  n\n",
"0         Smith         2  1\n",
"1         Smith        24  1\n",
"2       Johnson        17  1\n",
"3       Johnson        24  1\n",
"4       Johnson        26  1"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_transactions = pd.read_excel(\"WineKMC.xlsx\", sheetname=1)\n",
"df_transactions.columns = [\"customer_name\", \"offer_id\"]\n",
"df_transactions['n'] = 1\n",
"df_transactions.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# join the offers and transactions table\n",
"df = pd.merge(df_offers, df_transactions)\n",
"# create a \"pivot table\" which will give us the number of times each \n",
"# customer responded to a given offer\n",
"matrix = df.pivot_table(index=['customer_name'], columns=['offer_id'], values='n')\n",
"# a little tidying up. fill NA values with 0 and make the index into a column\n",
"matrix = matrix.fillna(0).reset_index()\n",
"x_cols = matrix.columns[1:]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2    34\n",
"0    21\n",
"3    17\n",
"1    16\n",
"4    12\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster = KMeans(n_clusters=5)\n",
"# slice matrix so we only include the 0/1 indicator columns in the clustering\n",
"matrix['cluster'] = cluster.fit_predict(matrix[x_cols])\n",
"matrix.cluster.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pca = PCA(n_components=2)\n",
"# fit PCA once and keep both components\n",
"pca_xy = pca.fit_transform(matrix[x_cols])\n",
"matrix['x'] = pca_xy[:, 0]\n",
"matrix['y'] = pca_xy[:, 1]\n",
"matrix = matrix.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th>offer_id</th>\n",
"      <th>customer_name</th>\n",
"      <th>cluster</th>\n",
"      <th>x</th>\n",
"      <th>y</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>Adams</td>\n",
"      <td>3</td>\n",
"      <td>-1.007580</td>\n",
"      <td>0.108215</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>Allen</td>\n",
"      <td>2</td>\n",
"      <td>0.287539</td>\n",
"      <td>0.044715</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>Anderson</td>\n",
"      <td>1</td>\n",
"      <td>0.392032</td>\n",
"      <td>1.038391</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>Bailey</td>\n",
"      <td>4</td>\n",
"      <td>-0.699477</td>\n",
"      <td>-0.022542</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>Baker</td>\n",
"      <td>4</td>\n",
"      <td>-0.088183</td>\n",
"      <td>-0.471695</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"offer_id customer_name  cluster         x         y\n",
"0                Adams        3 -1.007580  0.108215\n",
"1                Allen        2  0.287539  0.044715\n",
"2             Anderson        1  0.392032  1.038391\n",
"3               Bailey        4 -0.699477 -0.022542\n",
"4                Baker        4 -0.088183 -0.471695"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']]\n",
"customer_clusters.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.merge(df_transactions, customer_clusters)\n",
"df = pd.merge(df_offers, df)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"is_4   \n",
"False  Champagne             79\n",
"       Pinot Noir            44\n",
"       Espumante             36\n",
"       Cabernet Sauvignon    32\n",
"       Prosecco              28\n",
"       Malbec                28\n",
"       Merlot                18\n",
"       Chardonnay            15\n",
"       Pinot Grigio          11\n",
"True   Prosecco              15\n",
"       Pinot Grigio           6\n",
"       Espumante              4\n",
"       Malbec                 4\n",
"       Merlot                 2\n",
"       Champagne              2\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['is_4'] = df.cluster==4\n",
"df.groupby(\"is_4\").varietal.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>min_qty</th>\n",
"      <th>discount</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>is_4</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>False</th>\n",
"      <td>62.948454</td>\n",
"      <td>59.945017</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>True</th>\n",
"      <td>18.363636</td>\n",
"      <td>55.393939</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"         min_qty   discount\n",
"is_4                       \n",
"False  62.948454  59.945017\n",
"True   18.363636  55.393939"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"is_4\")[['min_qty', 'discount']].mean()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
---
title: "Customer Segmentation in R (Riffing off of @YhatHQ's Python Post) #rstats"
author: "Bob Rudis (@hrbrmstr)"
output:
  html_document:
    theme: spacelab
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
                      collapse = TRUE,
                      fig.retina = 2,
                      warning = FALSE,
                      message = FALSE,
                      error = FALSE)
library(pander)
library(DT)
```

Greg, over at $\hat{y}hat$, did a [really nice job](http://blog.yhathq.com/posts/customer-segmentation-using-python.html) introducing basic K-Means clustering with Principal Component Analysis to look at commonalities between sets of customers.

I thought it would be interesting to have an R version of it available in the event someone wanted to see a comparison of the solution between the two languages (and it's a really compact example that I might be able to use in the classes I'm teaching at some point).

You'll need to keep [their post handy](http://blog.yhathq.com/posts/customer-segmentation-using-python.html) since I'm only riffing off of them, not pilfering their content or idea (and they deserve your eyeballs as they contribute quite a bit to the data science community on a regular basis).

## The Data

We'll start by reading in the same data set, but first we'll load some packages we'll need to help us with the analyses & visualizations:

```{r pkgs}
library(readxl)    # free data from excel hades
library(dplyr)     # sane data manipulation
library(tidyr)     # sane data munging
library(viridis)   # sane colors
library(ggplot2)   # needs no introduction
library(ggfortify) # super-helpful for plotting non-"standard" stats objects
```

> NOTE: To use `ggfortify` you'll need to `devtools::install_github("sinhrks/ggfortify")` since it's not in CRAN.

Now, we'll read in the file, taking care to only download it once from the $\hat{y}hat$ servers:

```{r dl}
url <- "http://blog.yhathq.com/static/misc/data/WineKMC.xlsx"
fil <- basename(url)
if (!file.exists(fil)) download.file(url, fil)
```

Reading in and cleaning up the sheets looks _really_ similar to the Python example:

```{r rd1}
offers <- read_excel(fil, sheet = 1)
colnames(offers) <- c("offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak")
```

```{r rd1_hd, eval=FALSE}
head(offers)
```

```{r rd1_hd_pander, echo=FALSE}
pander(head(offers))
```

```{r rd2}
transactions <- read_excel(fil, sheet = 2)
colnames(transactions) <- c("customer_name", "offer_id")
transactions$n <- 1
```

```{r rd2_hd, eval=FALSE}
head(transactions)
```

```{r rd2_hd_pander, echo=FALSE}
pander(head(transactions))
```

The `dplyr` package offers a very SQL-esque way of manipulating data and—combined with the `tidyr` package—makes quick work of getting the data into the binary wide-form we need:

```{r munge}
# join the offers and transactions table
left_join(offers, transactions, by="offer_id") %>%
  # get the number of times each customer responded to a given offer
  count(customer_name, offer_id, wt=n) %>%
  # change it from long to wide
  spread(offer_id, n) %>%
  # and fill in the NAs that get generated as a result
  mutate_each(funs(ifelse(is.na(.), 0, .))) -> dat
```

## Clustering our customers

With the data in shape we can perform the same K-Means clustering:

```{r clus1}
fit <- kmeans(dat[,-1], 5, iter.max=1000)
```

```{r clus1_tab, eval=FALSE}
table(fit$cluster)
```

```{r clus1_pander, echo=FALSE}
pander(table(fit$cluster))
```

```{r clus1_barplot}
barplot(table(fit$cluster), col="maroon")
```
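
Like the Python post, we fix the number of clusters at 5. If you're curious how defensible that choice is, one quick way to eyeball it is an elbow-style scan of the total within-cluster sum of squares over a range of k (a sketch of mine, not part of the original analysis; the `1:10` range and `nstart=25` are arbitrary, and the chunk is left as `eval=FALSE`):

```{r choose_k, eval=FALSE}
# look for an "elbow" in the total within-cluster sum of squares as k grows
wss <- sapply(1:10, function(k) kmeans(dat[,-1], k, iter.max=1000, nstart=25)$tot.withinss)
plot(1:10, wss, type="b", xlab="k (number of clusters)", ylab="total within-cluster SS")
```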

## Visualizing the clusters

We use the same Principal Component Analysis to get the scatterplot:

```{r pcamds}
pca <- prcomp(dat[,-1])
pca_dat <- mutate(fortify(pca), col=fit$cluster)
ggplot(pca_dat) +
  geom_point(aes(x=PC1, y=PC2, fill=factor(col)), size=3, col="#7f7f7f", shape=21) +
  scale_fill_viridis(name="Cluster", discrete=TRUE) + theme_bw(base_family="Helvetica")
```
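
Since the scatterplot is only a two-dimensional shadow of the full offer space, it's worth a glance at how much variance those first two components actually capture (a quick aside of mine rather than something from the original post):

```{r pca_var, eval=FALSE}
# proportion of variance explained by the first two principal components
summary(pca)$importance[, 1:2]
```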

We can use the handy `autoplot` feature of `ggfortify` to do all that for us, tho:

```{r kmeans_pca}
autoplot(fit, data=dat[,-1], frame=TRUE, frame.type='norm')
```

## Digging deeper into the clusters

The cluster-based customer introspection is equally easy:

```{r intro1}
transactions %>%
  left_join(data_frame(customer_name=dat$customer_name,
                       cluster=fit$cluster)) %>%
  left_join(offers) -> customer_clusters

customer_clusters %>%
  mutate(is_4=(cluster==4)) %>%
  count(is_4, varietal) -> varietal_4
```

```{r nodo, eval=FALSE}
varietal_4
```

```{r intro_dt, echo=FALSE}
datatable(varietal_4, options=list(pageLength=nrow(varietal_4)))
```

```{r ismean}
customer_clusters %>%
  mutate(is_4=(cluster==4)) %>%
  group_by(is_4) %>%
  summarise_each(funs(mean), min_qty, discount) -> mean_4
```

```{r nodo2, eval=FALSE}
mean_4
```

```{r mean_pan, echo=FALSE}
pander(mean_4)
```
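
The same pipeline generalises if you'd rather see the varietal breakdown for every cluster at once instead of just the cluster-4-versus-everyone-else split (my extension, not something from the original post; `eval=FALSE` so it doesn't alter the knitted output):

```{r all_clusters, eval=FALSE}
# varietal counts for each cluster, spread into one column per cluster
customer_clusters %>%
  count(cluster, varietal) %>%
  spread(cluster, n, fill = 0)
```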

## Fin

Remember to check out all the [things](http://blog.yhathq.com/posts/customer-segmentation-using-python.html) the $\hat{y}hat$ folks had to say about the analyses and the links they provided.