Created
July 11, 2025 13:45
-
-
Save thedivtagguy/009e84189c624121609e81e0bb7c7cb1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "WhatsApp Group Chat Analysis" | |
output: | |
pdf_document: default | |
--- | |
Hello folks, glad you could make it. | |
This is the entire code (well, almost entire) used to create a visual essay on group | |
WhatsApp chats. You can use bits of this to analyze any chat you want. | |
The code is commented to the best of my abilities but if something is | |
unclear, feel free to reach out! | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE) | |
#Read in the libraries | |
library(rwhatsapp) #for cleaning the chats and making them into a dataframe | |
library(tidytext) #for text processing | |
library(tidyverse) #It'll be stupid not to use this. | |
library(tidyr) #More data wrangling | |
library(plotly) #Making interactive charts! | |
library(qdap) #Among other text analysis things, I used this for counting questions | |
library(lubridate) #date handling | |
library(data.table) | |
library(digest) | |
library(htmlwidgets) | |
library(stringr) | |
library(textdata) | |
library(prettydoc) | |
``` | |
```{r whatsapp read in} | |
# Replace with your own file path | |
chat <- rwa_read("path/to/your/whatsapp/export.txt") %>% | |
filter(!is.na(author)) # remove messages without author | |
chat <- chat %>% select(-emoji, - source, -emoji_name) | |
``` | |
The below chunk anonymizes the chats. This has already been done for the provided dataset. | |
For your own chat, specify the column names you want to hash in the cols_to_mask variable. | |
With hashing, names are converted to alphanumeric strings for privacy protection. | |
```{r anonymize chats} | |
#Which Columns Should This Affect? | |
cols_to_mask <- c("author") | |
anonymize <- function(x, algo="sha256") { | |
sapply(x, function(y) if(y == "" | is.na(y)) "" else digest(y, algo = algo)) | |
} | |
setDT(chat) | |
chat[, (cols_to_mask) := lapply(.SD, anonymize), .SDcols = cols_to_mask] | |
``` | |
```{r data wrangling} | |
#Separate Date components | |
chat <- chat %>% | |
mutate(day = day(time)) %>% #Pull out the day component from the time column | |
mutate(times = strftime(chat$time, format="%H:%M:%S")) %>% #Pull out the time | |
mutate(date = date(time)) #Pull out the date component | |
#Separate by custom time periods (adjust dates as needed for your analysis) | |
chat <- chat %>% | |
mutate( | |
# Message Segments - customize these time periods for your own analysis | |
timeblock = case_when( | |
time >= dmy("01/01/2023") & time <= dmy("31/01/2023") ~ "Period 1", | |
time >= dmy("01/02/2023") & time <= dmy("28/02/2023") ~ "Period 2", | |
time >= dmy("01/03/2023") & time <= dmy("31/03/2023") ~ "Period 3", | |
time >= dmy("01/04/2023") & time <= dmy("30/04/2023") ~ "Period 4", | |
time >= dmy("01/05/2023") & time <= dmy("31/05/2023") ~ "Period 5", | |
time >= dmy("01/06/2023") & time <= dmy("30/06/2023") ~ "Period 6", | |
time >= dmy("01/07/2023") & time <= dmy("31/07/2023") ~ "Period 7", | |
time >= dmy("01/08/2023") & time <= dmy("31/08/2023") ~ "Period 8", | |
time >= dmy("01/09/2023") & time <= dmy("30/09/2023") ~ "Period 9", | |
time >= dmy("01/10/2023") & time <= dmy("31/10/2023") ~ "Period 10", | |
time >= dmy("01/11/2023") & time <= dmy("30/11/2023") ~ "Period 11", | |
time >= dmy("01/12/2023") & time <= dmy("31/12/2023") ~ "Period 12", | |
T ~ "Other")) %>% | |
mutate(timeblock = factor(timeblock)) #make this a factor so that you can count it | |
#Classify the times into hourly intervals | |
chat <- chat %>% | |
mutate(dayblock = format(time, "%I %p")) %>% | |
mutate(dayblock = factor(dayblock)) | |
``` | |
```{r timeline} | |
#Timeline of Messages with Plotly | |
counts <- data.frame(table(chat$day)) | |
fig <- plot_ly(counts, x = ~Var1, y = ~Freq, type = 'scatter', mode = 'lines+markers', | |
line = list(color = 'royalblue', width = 4), | |
marker = list(color = 'royalblue', width = .4), | |
width=1500, height=500) %>% | |
layout(title = "Messages over time - Group Chat Analysis", | |
xaxis = list(title = "Date"), | |
yaxis = list(title = "No. of Messages")) | |
fig | |
#Timeline by Time Periods | |
units <- chat %>% | |
group_by(date) %>% | |
count(timeblock) | |
``` | |
```{r people} | |
people <- chat %>% | |
count(author) | |
#Plot messaging activity by user | |
people$author <- factor(people$author, levels = unique(people$author)[order(people$n, decreasing = FALSE)]) | |
ax_x <- list( | |
showticklabels = FALSE, | |
title = "Author (anonymized for privacy)" | |
) | |
ax_y <- list(title = "Number of Messages") | |
people_fig <- plot_ly(people, x = ~author, y = ~n, type = "bar", name = 'Messaging Activity') | |
people_fig <- people_fig %>% layout(xaxis = ax_x, yaxis = ax_y, title = "Number of Messages by Group Members") | |
people_fig | |
#Export As HTML Widget (specify your own path) | |
# saveWidget(people_fig, file="output/message_activity.html") | |
``` | |
Sentiment analysis using the tidytext approach. | |
This code is adapted from Julia Silge's excellent tidytextmining book: | |
https://www.tidytextmining.com/sentiment.html | |
```{r sentiment} | |
#Unnest chat into one word per row, excluding WhatsApp service messages | |
tidy_chat <- chat %>% | |
filter(text != "This message was deleted|<Media omitted>") %>% | |
group_by(author, day) %>% | |
ungroup() %>% | |
unnest_tokens(word, text) | |
#Use the Bing emotion lexicon to calculate sentiment | |
chat_sentiment <- tidy_chat %>% | |
inner_join(get_sentiments("bing")) %>% | |
count(index = date, sentiment) %>% | |
spread(sentiment, n, fill = 0) %>% | |
mutate(sentiment = positive - negative) | |
#Plot sentiment over time | |
fig_sent <- plot_ly(chat_sentiment, x = ~index, y = ~sentiment, type = 'scatter', mode = 'lines+markers', | |
line = list(color = 'royalblue', width = 4), | |
marker = list(color = 'royalblue', width = .4), | |
width=1500, height=500) %>% | |
layout(title = "Sentiment Analysis Over Time", | |
xaxis = list(title = "Date"), | |
yaxis = list(title = "Sentiment Score")) | |
fig_sent | |
``` | |
Analysis of agreement patterns in the group chat. | |
```{r agreement_analysis} | |
#Count the Number of Messages per person | |
chat_count <- chat %>% | |
group_by(author) %>% | |
count(author, name = "n_texts") | |
#Group agreement expressions by Author and Time Period | |
agreement_count <- chat %>% | |
filter(text == "+1" | text == "agree" | text == "yes" | text == "👍") %>% | |
group_by(timeblock, author) %>% | |
count(author, name = "agreements") | |
#Percentage of agreement by author | |
counts_by_author <- left_join(chat_count, agreement_count, by = c("author")) %>% | |
filter(!is.na(agreements)) %>% | |
mutate(agreement_perc = (agreements/n_texts)*100) | |
#Round percentages | |
counts_by_author$agreement_perc <- counts_by_author$agreement_perc %>% round(2) | |
#Find users who only send agreement messages | |
only_agreement_users <- counts_by_author %>% | |
filter(agreements != "NA") %>% | |
mutate(other_messages = n_texts - agreements) %>% | |
filter(other_messages == 0) | |
print("Users who only send agreement messages:") | |
print(nrow(only_agreement_users)) | |
``` | |
Counting shared links and surveys in the group | |
```{r links_and_surveys} | |
#Number of WhatsApp group invite links | |
chat %>% | |
filter(str_detect(text, "^(https?://)?chat.whatsapp.com/(?:invite/)?([a-zA-Z0-9_-]{22})$")==TRUE) %>% | |
count() | |
#Survey links (most survey platforms use 'forms' in their URLs) | |
chat %>% | |
filter(str_detect(text, "forms\\.")==TRUE) %>% | |
count() | |
surveys <- chat %>% | |
filter(str_detect(text, "forms\\.")== TRUE) | |
print("Survey links shared:") | |
print(nrow(surveys)) | |
``` | |
Analysis of frequently mentioned topics or keywords | |
```{r keyword_analysis} | |
# Define keywords to search for (customize based on your group's context) | |
keywords <- c("meeting", "deadline", "project", "update", "help", "question") | |
keyword_mentions <- data.frame( | |
keyword = character(), | |
mentions = numeric(), | |
stringsAsFactors = FALSE | |
) | |
for(keyword in keywords) { | |
count <- chat %>% | |
filter(str_detect(tolower(text), tolower(keyword))) %>% | |
nrow() | |
keyword_mentions <- rbind(keyword_mentions, data.frame(keyword = keyword, mentions = count)) | |
} | |
# Plot keyword frequency | |
fig_keywords <- plot_ly(keyword_mentions, x = ~mentions, y = ~keyword, type = 'scatter', mode = 'markers', | |
color = ~keyword, size = ~mentions, | |
marker = list(opacity = 1, sizemode = 'diameter')) | |
fig_keywords <- fig_keywords %>% layout( | |
xaxis = list(showgrid = FALSE, title = "Number of Mentions"), | |
yaxis = list(showgrid = TRUE, title = "Keywords"), | |
backgroundcolor = "#f2f2f2", | |
showlegend = FALSE, | |
title = "Keyword Frequency Analysis" | |
) | |
fig_keywords | |
``` | |
Messages by Day of the Week | |
```{r day_analysis} | |
# MESSAGES PER DAY OF THE WEEK | |
chat %>% | |
mutate(wday.num = wday(day), | |
wday.name = weekdays(day)) %>% | |
group_by(timeblock, wday.num, wday.name) %>% | |
count() %>% | |
ggplot(aes(x = reorder(wday.name, -wday.num), y = n, fill=timeblock)) + | |
geom_bar(stat = "identity") + | |
ylab("Number of Messages") + xlab("Day of Week") + | |
coord_flip() + | |
ggtitle("Message Activity by Day of Week and Time Period") + | |
theme_minimal() + | |
theme(legend.title = element_blank(), | |
legend.position = "bottom") | |
``` | |
Question Analysis | |
```{r question_analysis} | |
questions <- question_type(chat$text) #Extract all questions automatically | |
questions_text <- questions[["raw"]][["strip.text"]] #store questions separately | |
questions_text <- as.data.frame(questions_text) #convert to dataframe | |
#Categorize questions by type | |
questions_type <- questions_text %>% | |
mutate(qtype = case_when( | |
str_detect(questions_text, "who") ~ "who", | |
str_detect(questions_text,"how") ~ "how", | |
str_detect(questions_text, "whose") ~ "whose", | |
str_detect(questions_text, "whom") ~ "whom", | |
str_detect(questions_text, "where") ~ "where", | |
str_detect(questions_text, "what") ~ "what", | |
str_detect(questions_text, "which") ~ "which", | |
str_detect(questions_text, "why") ~ "why", | |
str_detect(questions_text, "when") ~ "when", | |
str_detect(questions_text, "were") ~ "were", | |
str_detect(questions_text, "was") ~ "was", | |
str_detect(questions_text, "does") ~ "does", | |
str_detect(questions_text, "did") ~ "did", | |
str_detect(questions_text, "do") ~ "do", | |
str_detect(questions_text, "\\bis\\b") ~ "is", | |
str_detect(questions_text, "are") ~ "are", | |
str_detect(questions_text, "will") ~ "will", | |
str_detect(questions_text, "should") ~ "should", | |
str_detect(questions_text, "could") ~ "could", | |
str_detect(questions_text, "shall") ~ "shall", | |
str_detect(questions_text, "might") ~ "might", | |
str_detect(questions_text, "can") ~ "can", | |
str_detect(questions_text, "has") ~ "has", | |
str_detect(questions_text, "had") ~ "had", | |
str_detect(questions_text, "huh") ~ "huh", | |
str_detect(questions_text, "anyone") ~ "anyone", | |
T ~ "other")) %>% | |
mutate(qtype = factor(qtype)) | |
#Summary of question types | |
questions_breakdown <- questions_type %>% | |
count(qtype) %>% | |
arrange(-n) | |
print("Question type breakdown:") | |
print(questions_breakdown) | |
``` | |
N-gram Analysis for Common Phrases | |
```{r ngram_analysis} | |
# Remove URLs and media messages for cleaner analysis | |
url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" | |
chat_clean <- chat %>% | |
filter(str_detect(text, "<Media omitted>|This message was deleted|\\.vcf") == FALSE) %>% | |
filter(str_detect(text, url_pattern) == FALSE) | |
# Generate trigrams (3-word phrases) | |
trigrams <- chat_clean %>% | |
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>% | |
filter(!is.na(trigram)) %>% | |
count(trigram, sort = TRUE) | |
print("Most common 3-word phrases:") | |
print(head(trigrams, 10)) | |
# Generate bigrams (2-word phrases) | |
bigrams <- chat_clean %>% | |
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% | |
filter(!is.na(bigram)) %>% | |
count(bigram, sort = TRUE) | |
print("Most common 2-word phrases:") | |
print(head(bigrams, 10)) | |
``` | |
Daily Active Users Analysis | |
```{r daily_active_users} | |
# Group by date and count number of distinct users | |
daily_active_users <- chat %>% | |
group_by(date) %>% | |
summarise(active_users = n_distinct(author), | |
total_messages = n()) %>% | |
arrange(date) | |
# Plot daily active users over time | |
fig_dau <- plot_ly(daily_active_users, x = ~date, y = ~active_users, type = 'scatter', mode = 'lines+markers', | |
line = list(color = 'green', width = 3), | |
marker = list(color = 'green', size = 6), | |
width=1500, height=500) %>% | |
layout(title = "Daily Active Users Over Time", | |
xaxis = list(title = "Date"), | |
yaxis = list(title = "Number of Active Users")) | |
fig_dau | |
``` | |
Hourly Activity Pattern | |
```{r hourly_activity} | |
# Analyze messaging patterns by hour of day | |
hourly_activity <- chat %>% | |
mutate(hour = hour(time)) %>% | |
group_by(hour) %>% | |
count() %>% | |
arrange(hour) | |
# Plot hourly activity | |
fig_hourly <- plot_ly(hourly_activity, x = ~hour, y = ~n, type = 'bar', | |
marker = list(color = 'orange')) %>% | |
layout(title = "Message Activity by Hour of Day", | |
xaxis = list(title = "Hour of Day (24-hour format)"), | |
yaxis = list(title = "Number of Messages")) | |
fig_hourly | |
``` | |
Message Length Analysis | |
```{r message_length} | |
# Calculate message length statistics | |
chat_with_length <- chat %>% | |
mutate(message_length = nchar(text)) %>% | |
filter(message_length > 0) # Remove empty messages | |
length_stats <- chat_with_length %>% | |
summarise( | |
avg_length = mean(message_length, na.rm = TRUE), | |
median_length = median(message_length, na.rm = TRUE), | |
max_length = max(message_length, na.rm = TRUE), | |
min_length = min(message_length, na.rm = TRUE) | |
) | |
print("Message length statistics:") | |
print(length_stats) | |
# Plot message length distribution | |
fig_length <- plot_ly(chat_with_length, x = ~message_length, type = 'histogram', | |
marker = list(color = 'purple', opacity = 0.7)) %>% | |
layout(title = "Distribution of Message Lengths", | |
xaxis = list(title = "Message Length (characters)"), | |
yaxis = list(title = "Frequency")) | |
fig_length | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment