Last active
September 30, 2020 10:40
-
-
Save tuhulab/c5e419b861533e5255d838f0816f58f6 to your computer and use it in GitHub Desktop.
Performing cluster analysis in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cluster analysis in R
# inspired by Dima Gorenshteyn, DataCamp
# Expected in the workspace: `df` (data frame; scale() requires numeric
# columns) and `the_height` (dendrogram height at which to cut the tree).

## Standardize data
# Centre and scale every column so no single variable dominates distances.
df_st <- scale(df)

## Hierarchical clustering
# Distances are computed on the standardized data, not the raw `df`.
d <- dist(df_st)
# Linkage method: one of "complete", "average", "single".
hc <- hclust(d, method = "complete")
c <- cutree(hc, h = the_height)  # cluster id per row, cutting at the_height
s_df <- dplyr::mutate(df, c = c)  # segmented data frame
dplyr::count(s_df, c)  # number of observations per cluster
dend_g <- as.dendrogram(hc)  # dendrogram built from hc
# Colour the branches by the clusters obtained at the same cut height.
dend_g_c <- dendextend::color_branches(dend_g, h = the_height)
plot(dend_g_c)
## K-means clustering
# `k` is the number of clusters and must be defined beforehand.
# kmeans() starts from random centres; call set.seed() first if the
# assignment needs to be reproducible.
k_m <- kmeans(df, centers = k)
# Attach the assignment under an explicit column name; an unnamed
# mutate() expression would create a column literally called `k_m$cluster`.
k_df <- dplyr::mutate(df, cluster = k_m$cluster)
### K-means: Elbow method
# Fit k-means once per candidate k and record the total within-cluster sum
# of squares; the bend ("elbow") in the resulting curve suggests a
# reasonable k.
# Candidate values are 1 .. (n_var - 1), the upper bound chosen by the
# original author. seq_len() yields an empty range instead of c(1, 0)
# when n_var is 1.
k_values <- seq_len(n_var - 1)
tot_withinss <- purrr::map_dbl(k_values, function(k) {
  model <- kmeans(x = df, centers = k)
  model$tot.withinss
})
elbow_df <- data.frame(k = k_values, tot_withinss = tot_withinss)
ggplot2::ggplot(elbow_df, ggplot2::aes(x = k, y = tot_withinss)) +
  ggplot2::geom_line() +
  # One axis break per evaluated k rather than a fixed 1:10.
  ggplot2::scale_x_continuous(breaks = k_values)
### K-means: Silhouette [silwɛt] analysis
# The silhouette width of an observation compares its average distance to
# the members of its own cluster with its average distance to the nearest
# other cluster. Values near 1 indicate a good fit, values near 0 a
# borderline observation, and negative values a likely misassignment.
# pam() (partitioning around medoids) is used here because
# cluster::silhouette() accepts its result directly.
m_pan <- cluster::pam(df, k)
plot(cluster::silhouette(m_pan))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment