Forked from hrbrmstr/Customer Segmentation in Python.ipynb
Created August 21, 2016 21:23
For RPubs post
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>offer_id</th>\n",
"      <th>campaign</th>\n",
"      <th>varietal</th>\n",
"      <th>min_qty</th>\n",
"      <th>discount</th>\n",
"      <th>origin</th>\n",
"      <th>past_peak</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1</td>\n",
"      <td>January</td>\n",
"      <td>Malbec</td>\n",
"      <td>72</td>\n",
"      <td>56</td>\n",
"      <td>France</td>\n",
"      <td>False</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>2</td>\n",
"      <td>January</td>\n",
"      <td>Pinot Noir</td>\n",
"      <td>72</td>\n",
"      <td>17</td>\n",
"      <td>France</td>\n",
"      <td>False</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>3</td>\n",
"      <td>February</td>\n",
"      <td>Espumante</td>\n",
"      <td>144</td>\n",
"      <td>32</td>\n",
"      <td>Oregon</td>\n",
"      <td>True</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4</td>\n",
"      <td>February</td>\n",
"      <td>Champagne</td>\n",
"      <td>72</td>\n",
"      <td>48</td>\n",
"      <td>France</td>\n",
"      <td>True</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>5</td>\n",
"      <td>February</td>\n",
"      <td>Cabernet Sauvignon</td>\n",
"      <td>144</td>\n",
"      <td>44</td>\n",
"      <td>New Zealand</td>\n",
"      <td>True</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"   offer_id  campaign            varietal  min_qty  discount       origin  \\\n",
"0         1   January              Malbec       72        56       France   \n",
"1         2   January          Pinot Noir       72        17       France   \n",
"2         3  February           Espumante      144        32       Oregon   \n",
"3         4  February           Champagne       72        48       France   \n",
"4         5  February  Cabernet Sauvignon      144        44  New Zealand   \n",
"\n",
"  past_peak  \n",
"0     False  \n",
"1     False  \n",
"2      True  \n",
"3      True  \n",
"4      True  "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import wget\n",
"import os\n",
"\n",
"if (not os.path.isfile(\"WineKMC.xlsx\")):\n",
"    wget.download(\"http://blog.yhathq.com/static/misc/data/WineKMC.xlsx\")\n",
"\n",
"df_offers = pd.read_excel(\"WineKMC.xlsx\", sheetname=0)\n",
"df_offers.columns = [\"offer_id\", \"campaign\", \"varietal\", \"min_qty\", \"discount\", \"origin\", \"past_peak\"]\n",
"df_offers.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>customer_name</th>\n",
"      <th>offer_id</th>\n",
"      <th>n</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>Smith</td>\n",
"      <td>2</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>Smith</td>\n",
"      <td>24</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>Johnson</td>\n",
"      <td>17</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>Johnson</td>\n",
"      <td>24</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>Johnson</td>\n",
"      <td>26</td>\n",
"      <td>1</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"  customer_name  offer_id  n\n",
"0         Smith         2  1\n",
"1         Smith        24  1\n",
"2       Johnson        17  1\n",
"3       Johnson        24  1\n",
"4       Johnson        26  1"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_transactions = pd.read_excel(\"WineKMC.xlsx\", sheetname=1)\n",
"df_transactions.columns = [\"customer_name\", \"offer_id\"]\n",
"df_transactions['n'] = 1\n",
"df_transactions.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# join the offers and transactions table\n",
"df = pd.merge(df_offers, df_transactions)\n",
"# create a \"pivot table\" which will give us the number of times each \n",
"# customer responded to a given offer\n",
"matrix = df.pivot_table(index=['customer_name'], columns=['offer_id'], values='n')\n",
"# a little tidying up. fill NA values with 0 and make the index into a column\n",
"matrix = matrix.fillna(0).reset_index()\n",
"x_cols = matrix.columns[1:]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2    34\n",
"0    21\n",
"3    17\n",
"1    16\n",
"4    12\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster = KMeans(n_clusters=5)\n",
"# slice matrix so we only include the 0/1 indicator columns in the clustering\n",
"matrix['cluster'] = cluster.fit_predict(matrix[x_cols])\n",
"matrix.cluster.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pca = PCA(n_components=2)\n",
"# fit PCA once and keep both components\n",
"pca_xy = pca.fit_transform(matrix[x_cols])\n",
"matrix['x'] = pca_xy[:, 0]\n",
"matrix['y'] = pca_xy[:, 1]\n",
"matrix = matrix.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th>offer_id</th>\n",
"      <th>customer_name</th>\n",
"      <th>cluster</th>\n",
"      <th>x</th>\n",
"      <th>y</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>Adams</td>\n",
"      <td>3</td>\n",
"      <td>-1.007580</td>\n",
"      <td>0.108215</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>Allen</td>\n",
"      <td>2</td>\n",
"      <td>0.287539</td>\n",
"      <td>0.044715</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>Anderson</td>\n",
"      <td>1</td>\n",
"      <td>0.392032</td>\n",
"      <td>1.038391</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>Bailey</td>\n",
"      <td>4</td>\n",
"      <td>-0.699477</td>\n",
"      <td>-0.022542</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>Baker</td>\n",
"      <td>4</td>\n",
"      <td>-0.088183</td>\n",
"      <td>-0.471695</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"offer_id customer_name  cluster         x         y\n",
"0                Adams        3 -1.007580  0.108215\n",
"1                Allen        2  0.287539  0.044715\n",
"2             Anderson        1  0.392032  1.038391\n",
"3               Bailey        4 -0.699477 -0.022542\n",
"4                Baker        4 -0.088183 -0.471695"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']]\n",
"customer_clusters.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.merge(df_transactions, customer_clusters)\n",
"df = pd.merge(df_offers, df)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"is_4   \n",
"False  Champagne             79\n",
"       Pinot Noir            44\n",
"       Espumante             36\n",
"       Cabernet Sauvignon    32\n",
"       Prosecco              28\n",
"       Malbec                28\n",
"       Merlot                18\n",
"       Chardonnay            15\n",
"       Pinot Grigio          11\n",
"True   Prosecco              15\n",
"       Pinot Grigio           6\n",
"       Espumante              4\n",
"       Malbec                 4\n",
"       Merlot                 2\n",
"       Champagne              2\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['is_4'] = df.cluster==4\n",
"df.groupby(\"is_4\").varietal.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>min_qty</th>\n",
"      <th>discount</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>is_4</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>False</th>\n",
"      <td>62.948454</td>\n",
"      <td>59.945017</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>True</th>\n",
"      <td>18.363636</td>\n",
"      <td>55.393939</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"         min_qty   discount\n",
"is_4                       \n",
"False  62.948454  59.945017\n",
"True   18.363636  55.393939"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"is_4\")[['min_qty', 'discount']].mean()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
---
title: "Customer Segmentation in R (Riffing off of @YhatHQ's Python Post) #rstats"
author: "Bob Rudis (@hrbrmstr)"
output:
  html_document:
    theme: spacelab
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
                      collapse = TRUE,
                      fig.retina = 2,
                      warning = FALSE,
                      message = FALSE,
                      error = FALSE)
library(pander)
library(DT)
```

Greg, over at $\hat{y}hat$, did a [really nice job](http://blog.yhathq.com/posts/customer-segmentation-using-python.html) introducing basic K-Means clustering with Principal Component Analysis to look at commonalities between sets of customers.

I thought it would be interesting to have an R version of it available in the event someone wanted to see a comparison of the solution between the two languages (and it's a really compact example that I might be able to use in the classes I'm teaching at some point).

You'll need to keep [their post handy](http://blog.yhathq.com/posts/customer-segmentation-using-python.html) since I'm only riffing off of them, not pilfering their content or idea (and they deserve your eyeballs as they contribute quite a bit to the data science community on a regular basis).

## The Data

We'll start by reading in the same data set, but first we'll load some packages we'll need to help us with the analyses & visualizations:

```{r pkgs}
library(readxl)    # free data from excel hades
library(dplyr)     # sane data manipulation
library(tidyr)     # sane data munging
library(viridis)   # sane colors
library(ggplot2)   # needs no introduction
library(ggfortify) # super-helpful for plotting non-"standard" stats objects
```

> NOTE: To use `ggfortify` you'll need to `devtools::install_github("sinhrks/ggfortify")` since it's not in CRAN.

Now, we'll read in the file, taking care to only download it once from the $\hat{y}hat$ servers:

```{r dl}
url <- "http://blog.yhathq.com/static/misc/data/WineKMC.xlsx"
fil <- basename(url)
if (!file.exists(fil)) download.file(url, fil)
```

Reading in and cleaning up the sheets looks _really_ similar to the Python example:

```{r rd1}
offers <- read_excel(fil, sheet = 1)
colnames(offers) <- c("offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak")
```

```{r rd1_hd, eval=FALSE}
head(offers)
```

```{r rd1_hd_pander, echo=FALSE}
pander(head(offers))
```

```{r rd2}
transactions <- read_excel(fil, sheet = 2)
colnames(transactions) <- c("customer_name", "offer_id")
transactions$n <- 1
```

```{r rd2_hd, eval=FALSE}
head(transactions)
```

```{r rd2_hd_pander, echo=FALSE}
pander(head(transactions))
```

The `dplyr` package offers a very SQL-esque way of manipulating data and—combined with the `tidyr` package—makes quick work of getting the data into the binary wide-form we need:

```{r munge}
# join the offers and transactions table
left_join(offers, transactions, by="offer_id") %>%
  # get the number of times each customer responded to a given offer
  count(customer_name, offer_id, wt=n) %>%
  # change it from long to wide
  spread(offer_id, n) %>%
  # and fill in the NAs that get generated as a result
  mutate_each(funs(ifelse(is.na(.), 0, .))) -> dat
```

## Clustering our customers

With the data in shape we can perform the same K-Means clustering:

```{r clus1}
fit <- kmeans(dat[,-1], 5, iter.max=1000)
```

```{r clus1_tab, eval=FALSE}
table(fit$cluster)
```

```{r clus1_pander, echo=FALSE}
pander(table(fit$cluster))
```

```{r clus1_barplot}
barplot(table(fit$cluster), col="maroon")
```
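
Like the Python post, we fix the number of clusters at 5. If you're curious how defensible that choice is, one quick way to eyeball it is an elbow-style scan of the total within-cluster sum of squares over a range of k (a sketch of mine, not part of the original analysis; the `1:10` range and `nstart=25` are arbitrary, and the chunk is left as `eval=FALSE`):

```{r choose_k, eval=FALSE}
# look for an "elbow" in the total within-cluster sum of squares as k grows
wss <- sapply(1:10, function(k) kmeans(dat[,-1], k, iter.max=1000, nstart=25)$tot.withinss)
plot(1:10, wss, type="b", xlab="k (number of clusters)", ylab="total within-cluster SS")
```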

## Visualizing the clusters

We use the same Principal Component Analysis to get the scatterplot:

```{r pcamds}
pca <- prcomp(dat[,-1])
pca_dat <- mutate(fortify(pca), col=fit$cluster)
ggplot(pca_dat) +
  geom_point(aes(x=PC1, y=PC2, fill=factor(col)), size=3, col="#7f7f7f", shape=21) +
  scale_fill_viridis(name="Cluster", discrete=TRUE) + theme_bw(base_family="Helvetica")
```
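
Since the scatterplot is only a two-dimensional shadow of the full offer space, it's worth a glance at how much variance those first two components actually capture (a quick aside of mine rather than something from the original post):

```{r pca_var, eval=FALSE}
# proportion of variance explained by the first two principal components
summary(pca)$importance[, 1:2]
```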

We can use the handy `autoplot` feature of `ggfortify` to do all that for us, tho:

```{r kmeans_pca}
autoplot(fit, data=dat[,-1], frame=TRUE, frame.type='norm')
```

## Digging deeper into the clusters

The cluster-based customer introspection is equally easy:

```{r intro1}
transactions %>%
  left_join(data_frame(customer_name=dat$customer_name,
                       cluster=fit$cluster)) %>%
  left_join(offers) -> customer_clusters

customer_clusters %>%
  mutate(is_4=(cluster==4)) %>%
  count(is_4, varietal) -> varietal_4
```

```{r nodo, eval=FALSE}
varietal_4
```

```{r intro_dt, echo=FALSE}
datatable(varietal_4, options=list(pageLength=nrow(varietal_4)))
```

```{r ismean}
customer_clusters %>%
  mutate(is_4=(cluster==4)) %>%
  group_by(is_4) %>%
  summarise_each(funs(mean), min_qty, discount) -> mean_4
```

```{r nodo2, eval=FALSE}
mean_4
```

```{r mean_pan, echo=FALSE}
pander(mean_4)
```
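
The same pipeline generalises if you'd rather see the varietal breakdown for every cluster at once instead of just the cluster-4-versus-everyone-else split (my extension, not something from the original post; `eval=FALSE` so it doesn't alter the knitted output):

```{r all_clusters, eval=FALSE}
# varietal counts for each cluster, spread into one column per cluster
customer_clusters %>%
  count(cluster, varietal) %>%
  spread(cluster, n, fill = 0)
```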

## Fin

Remember to check out all the [things](http://blog.yhathq.com/posts/customer-segmentation-using-python.html) the $\hat{y}hat$ folks had to say about the analyses and the links they provided.