thedivtagguy · July 11, 2025 13:45
diff --git a/chat-analysis.Rmd b/chat-analysis.Rmd
 ---
 title: "WhatsApp Group Chat Analysis"
 output:
  pdf_document: default
 ---

 Hello folks, glad you could make it. 
 This is the entire code (well, almost entire) used to create a visual essay on group
 WhatsApp chats. You can use bits of this to analyze any chat you want. 

 The code is commented to the best of my abilities but if something is
 unclear, feel free to reach out!

 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 #Read in the libraries
 library(rwhatsapp)     #for cleaning the chats and making them into a dataframe
 library(tidytext)      #for text processing
 library(tidyverse)     #It'll be stupid not to use this.
 library(tidyr)         #More data wrangling
 library(plotly)        #Making interactive charts!
 library(qdap)          #Among other text analysis things, I used this for counting questions
 library(lubridate)     #date handling
 library(data.table)
 library(digest)
 library(htmlwidgets)   
 library(stringr)
 library(textdata)
 library(prettydoc)
 ```

 ```{r whatsapp read in}
 # Replace with your own file path
 chat <- rwa_read("path/to/your/whatsapp/export.txt") %>%
  filter(!is.na(author)) # remove messages without author
 chat <- chat %>% select(-emoji, - source, -emoji_name)
 ```

 The below chunk anonymizes the chats. This has already been done for the provided dataset.
 For your own chat, specify the column names you want to hash in the cols_to_mask variable.

 With hashing, names are converted to alphanumeric strings for privacy protection.

 ```{r anonymize chats}
 #Which Columns Should This Affect?
 cols_to_mask <- c("author")

 anonymize <- function(x, algo="sha256") {
    sapply(x, function(y) if(y == "" | is.na(y)) "" else digest(y, algo = algo))
 }

 setDT(chat)
 chat[, (cols_to_mask) := lapply(.SD, anonymize), .SDcols = cols_to_mask]
 ```

 ```{r data wrangling}
 #Separate Date components
 chat <- chat %>%
  mutate(day = day(time)) %>%  #Pull out the day component from the time column
  mutate(times = strftime(chat$time, format="%H:%M:%S")) %>% #Pull out the time
  mutate(date = date(time))   #Pull out the date component

 #Separate by custom time periods (adjust dates as needed for your analysis)
 chat <- chat %>%  
 mutate(
 # Message Segments - customize these time periods for your own analysis
 timeblock = case_when(
 time >= dmy("01/01/2023") & time <= dmy("31/01/2023") ~ "Period 1",
 time >= dmy("01/02/2023") & time <= dmy("28/02/2023") ~ "Period 2",
 time >= dmy("01/03/2023") & time <= dmy("31/03/2023") ~ "Period 3",
 time >= dmy("01/04/2023") & time <= dmy("30/04/2023") ~ "Period 4",
 time >= dmy("01/05/2023") & time <= dmy("31/05/2023") ~ "Period 5",
 time >= dmy("01/06/2023") & time <= dmy("30/06/2023") ~ "Period 6",
 time >= dmy("01/07/2023") & time <= dmy("31/07/2023") ~ "Period 7",
 time >= dmy("01/08/2023") & time <= dmy("31/08/2023") ~ "Period 8",
 time >= dmy("01/09/2023") & time <= dmy("30/09/2023") ~ "Period 9",
 time >= dmy("01/10/2023") & time <= dmy("31/10/2023") ~ "Period 10",
 time >= dmy("01/11/2023") & time <= dmy("30/11/2023") ~ "Period 11",
 time >= dmy("01/12/2023") & time <= dmy("31/12/2023") ~ "Period 12",
 T ~ "Other")) %>% 
 mutate(timeblock = factor(timeblock))   #make this a factor so that you can count it

 #Classify the times into hourly intervals
 chat <- chat %>%
  mutate(dayblock = format(time, "%I %p")) %>%  
  mutate(dayblock = factor(dayblock))
 ```

 ```{r timeline}
 #Timeline of Messages with Plotly
 counts <- data.frame(table(chat$day))
 fig <- plot_ly(counts, x = ~Var1, y = ~Freq, type = 'scatter', mode = 'lines+markers',
               line = list(color = 'royalblue', width = 4), 
               marker = list(color = 'royalblue', width = .4), 
               width=1500, height=500) %>% 
  layout(title = "Messages over time - Group Chat Analysis",
         xaxis = list(title = "Date"),
         yaxis = list(title = "No. of Messages"))
 fig

 #Timeline by Time Periods
 units <- chat %>% 
  group_by(date) %>% 
  count(timeblock)
 ```

 ```{r people}
 people <- chat %>%  
  count(author) 

 #Plot messaging activity by user
 people$author <- factor(people$author, levels = unique(people$author)[order(people$n, decreasing = FALSE)])

 ax_x <- list(
  showticklabels = FALSE,
  title = "Author (anonymized for privacy)"
 )

 ax_y <- list(title = "Number of Messages")

 people_fig <- plot_ly(people, x = ~author, y = ~n, type = "bar", name = 'Messaging Activity')
 people_fig <- people_fig %>% layout(xaxis = ax_x, yaxis = ax_y, title = "Number of Messages by Group Members") 

 people_fig

 #Export As HTML Widget (specify your own path)
 # saveWidget(people_fig, file="output/message_activity.html")
 ```

 Sentiment analysis using the tidytext approach.
 This code is adapted from Julia Silge's excellent tidytextmining book:
 https://www.tidytextmining.com/sentiment.html

 ```{r sentiment}
 #Unnest chat into one word per row, excluding WhatsApp service messages
 tidy_chat <- chat %>%
  filter(text != "This message was deleted|<Media omitted>") %>% 
  group_by(author, day) %>%
  ungroup() %>%
  unnest_tokens(word, text)

 #Use the Bing emotion lexicon to calculate sentiment
 chat_sentiment <- tidy_chat %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = date, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

 #Plot sentiment over time
 fig_sent <- plot_ly(chat_sentiment, x = ~index, y = ~sentiment, type = 'scatter', mode = 'lines+markers',
                    line = list(color = 'royalblue', width = 4), 
                    marker = list(color = 'royalblue', width = .4), 
                    width=1500, height=500) %>% 
  layout(title = "Sentiment Analysis Over Time",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Sentiment Score"))
 fig_sent
 ```

 Analysis of agreement patterns in the group chat.

 ```{r agreement_analysis}
 #Count the Number of Messages per person
 chat_count <- chat %>% 
  group_by(author) %>% 
  count(author, name = "n_texts")

 #Group agreement expressions by Author and Time Period
 agreement_count <- chat %>% 
  filter(text == "+1" | text == "agree" | text == "yes" | text == "👍") %>% 
  group_by(timeblock, author) %>% 
  count(author, name = "agreements")

 #Percentage of agreement by author
 counts_by_author <- left_join(chat_count, agreement_count, by = c("author")) %>%                       
  filter(!is.na(agreements)) %>% 
  mutate(agreement_perc = (agreements/n_texts)*100)

 #Round percentages
 counts_by_author$agreement_perc <- counts_by_author$agreement_perc %>% round(2)

 #Find users who only send agreement messages
 only_agreement_users <- counts_by_author %>% 
  filter(agreements != "NA") %>% 
  mutate(other_messages = n_texts - agreements) %>% 
  filter(other_messages == 0)

 print("Users who only send agreement messages:")
 print(nrow(only_agreement_users))
 ```

 Counting shared links and surveys in the group

 ```{r links_and_surveys}
 #Number of WhatsApp group invite links
 chat %>% 
  filter(str_detect(text, "^(https?://)?chat.whatsapp.com/(?:invite/)?([a-zA-Z0-9_-]{22})$")==TRUE) %>% 
  count()

 #Survey links (most survey platforms use 'forms' in their URLs)
 chat %>% 
  filter(str_detect(text, "forms\\.")==TRUE) %>% 
  count()

 surveys <- chat %>%
  filter(str_detect(text, "forms\\.")== TRUE)

 print("Survey links shared:")
 print(nrow(surveys))
 ```

 Analysis of frequently mentioned topics or keywords

 ```{r keyword_analysis}
 # Define keywords to search for (customize based on your group's context)
 keywords <- c("meeting", "deadline", "project", "update", "help", "question")

 keyword_mentions <- data.frame(
  keyword = character(),
  mentions = numeric(),
  stringsAsFactors = FALSE
 )

 for(keyword in keywords) {
  count <- chat %>% 
    filter(str_detect(tolower(text), tolower(keyword))) %>% 
    nrow()
  
  keyword_mentions <- rbind(keyword_mentions, data.frame(keyword = keyword, mentions = count))
 }

 # Plot keyword frequency
 fig_keywords <- plot_ly(keyword_mentions, x = ~mentions, y = ~keyword, type = 'scatter', mode = 'markers', 
                        color = ~keyword, size = ~mentions,
                        marker = list(opacity = 1, sizemode = 'diameter'))
 fig_keywords <- fig_keywords %>% layout(
  xaxis = list(showgrid = FALSE, title = "Number of Mentions"),
  yaxis = list(showgrid = TRUE, title = "Keywords"),
  backgroundcolor = "#f2f2f2",
  showlegend = FALSE,
  title = "Keyword Frequency Analysis"
 )

 fig_keywords
 ```

 Messages by Day of the Week

 ```{r day_analysis}
 # MESSAGES PER DAY OF THE WEEK
 chat %>% 
  mutate(wday.num = wday(day),
         wday.name = weekdays(day)) %>% 
  group_by(timeblock, wday.num, wday.name) %>% 
  count() %>% 
  ggplot(aes(x = reorder(wday.name, -wday.num), y = n, fill=timeblock)) +
  geom_bar(stat = "identity") +
  ylab("Number of Messages") + xlab("Day of Week") +
  coord_flip() +
  ggtitle("Message Activity by Day of Week and Time Period") +
  theme_minimal() +
  theme(legend.title = element_blank(), 
        legend.position = "bottom")
 ```

 Question Analysis

 ```{r question_analysis}
 questions <- question_type(chat$text)  #Extract all questions automatically
 questions_text <- questions[["raw"]][["strip.text"]]  #store questions separately
 questions_text <- as.data.frame(questions_text)       #convert to dataframe

 #Categorize questions by type
 questions_type <- questions_text %>% 
  mutate(qtype = case_when(
    str_detect(questions_text, "who") ~ "who",
    str_detect(questions_text,"how") ~ "how",
    str_detect(questions_text, "whose") ~ "whose",
    str_detect(questions_text, "whom") ~ "whom",
    str_detect(questions_text, "where") ~ "where",
    str_detect(questions_text, "what") ~ "what",
    str_detect(questions_text, "which") ~ "which",
    str_detect(questions_text, "why") ~ "why",
    str_detect(questions_text, "when") ~ "when",
    str_detect(questions_text, "were") ~ "were",
    str_detect(questions_text, "was") ~ "was",
    str_detect(questions_text, "does") ~ "does",
    str_detect(questions_text, "did") ~ "did",
    str_detect(questions_text, "do") ~ "do",
    str_detect(questions_text, "\\bis\\b") ~ "is",
    str_detect(questions_text, "are") ~ "are",
    str_detect(questions_text, "will") ~ "will",
    str_detect(questions_text, "should") ~ "should",
    str_detect(questions_text, "could") ~ "could",
    str_detect(questions_text, "shall") ~ "shall",
    str_detect(questions_text, "might") ~ "might",
    str_detect(questions_text, "can") ~ "can",
    str_detect(questions_text, "has") ~ "has",
    str_detect(questions_text, "had") ~ "had",
    str_detect(questions_text, "huh") ~ "huh",
    str_detect(questions_text, "anyone") ~ "anyone",
    T ~ "other")) %>% 
  mutate(qtype = factor(qtype))

 #Summary of question types
 questions_breakdown <- questions_type %>% 
  count(qtype) %>% 
  arrange(-n)

 print("Question type breakdown:")
 print(questions_breakdown)
 ```

 N-gram Analysis for Common Phrases

 ```{r ngram_analysis}
 # Remove URLs and media messages for cleaner analysis
 url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

 chat_clean <- chat %>% 
  filter(str_detect(text, "<Media omitted>|This message was deleted|\\.vcf") == FALSE) %>% 
  filter(str_detect(text, url_pattern) == FALSE)

 # Generate trigrams (3-word phrases)
 trigrams <- chat_clean %>% 
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>% 
  count(trigram, sort = TRUE)

 print("Most common 3-word phrases:")
 print(head(trigrams, 10))

 # Generate bigrams (2-word phrases)
 bigrams <- chat_clean %>% 
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>% 
  count(bigram, sort = TRUE)

 print("Most common 2-word phrases:")
 print(head(bigrams, 10))
 ```

 Daily Active Users Analysis

 ```{r daily_active_users}
 # Group by date and count number of distinct users
 daily_active_users <- chat %>%
  group_by(date) %>%
  summarise(active_users = n_distinct(author),
            total_messages = n()) %>%
  arrange(date)

 # Plot daily active users over time
 fig_dau <- plot_ly(daily_active_users, x = ~date, y = ~active_users, type = 'scatter', mode = 'lines+markers',
                   line = list(color = 'green', width = 3), 
                   marker = list(color = 'green', size = 6), 
                   width=1500, height=500) %>% 
  layout(title = "Daily Active Users Over Time",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Number of Active Users"))

 fig_dau
 ```

 Hourly Activity Pattern

 ```{r hourly_activity}
 # Analyze messaging patterns by hour of day
 hourly_activity <- chat %>%
  mutate(hour = hour(time)) %>%
  group_by(hour) %>%
  count() %>%
  arrange(hour)

 # Plot hourly activity
 fig_hourly <- plot_ly(hourly_activity, x = ~hour, y = ~n, type = 'bar',
                      marker = list(color = 'orange')) %>%
  layout(title = "Message Activity by Hour of Day",
         xaxis = list(title = "Hour of Day (24-hour format)"),
         yaxis = list(title = "Number of Messages"))

 fig_hourly
 ```

 Message Length Analysis

 ```{r message_length}
 # Calculate message length statistics
 chat_with_length <- chat %>%
  mutate(message_length = nchar(text)) %>%
  filter(message_length > 0)  # Remove empty messages

 length_stats <- chat_with_length %>%
  summarise(
    avg_length = mean(message_length, na.rm = TRUE),
    median_length = median(message_length, na.rm = TRUE),
    max_length = max(message_length, na.rm = TRUE),
    min_length = min(message_length, na.rm = TRUE)
  )

 print("Message length statistics:")
 print(length_stats)

 # Plot message length distribution
 fig_length <- plot_ly(chat_with_length, x = ~message_length, type = 'histogram',
                      marker = list(color = 'purple', opacity = 0.7)) %>%
  layout(title = "Distribution of Message Lengths",
         xaxis = list(title = "Message Length (characters)"),
         yaxis = list(title = "Frequency"))

 fig_length
 ```
	---
	title: "WhatsApp Group Chat Analysis"
	output:
	pdf_document: default
	---

	Hello folks, glad you could make it.
	This is the entire code (well, almost entire) used to create a visual essay on group
	WhatsApp chats. You can use bits of this to analyze any chat you want.

	The code is commented to the best of my abilities but if something is
	unclear, feel free to reach out!

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	#Read in the libraries
	library(rwhatsapp) #for cleaning the chats and making them into a dataframe
	library(tidytext) #for text processing
	library(tidyverse) #It'll be stupid not to use this.
	library(tidyr) #More data wrangling
	library(plotly) #Making interactive charts!
	library(qdap) #Among other text analysis things, I used this for counting questions
	library(lubridate) #date handling
	library(data.table)
	library(digest)
	library(htmlwidgets)
	library(stringr)
	library(textdata)
	library(prettydoc)
	```

	```{r whatsapp read in}
	# Replace with your own file path
	chat <- rwa_read("path/to/your/whatsapp/export.txt") %>%
	filter(!is.na(author)) # remove messages without author
	chat <- chat %>% select(-emoji, - source, -emoji_name)
	```

	The below chunk anonymizes the chats. This has already been done for the provided dataset.
	For your own chat, specify the column names you want to hash in the cols_to_mask variable.

	With hashing, names are converted to alphanumeric strings for privacy protection.

	```{r anonymize chats}
	#Which Columns Should This Affect?
	cols_to_mask <- c("author")

	anonymize <- function(x, algo="sha256") {
	sapply(x, function(y) if(y == "" \| is.na(y)) "" else digest(y, algo = algo))
	}

	setDT(chat)
	chat[, (cols_to_mask) := lapply(.SD, anonymize), .SDcols = cols_to_mask]
	```

	```{r data wrangling}
	#Separate Date components
	chat <- chat %>%
	mutate(day = day(time)) %>% #Pull out the day component from the time column
	mutate(times = strftime(chat$time, format="%H:%M:%S")) %>% #Pull out the time
	mutate(date = date(time)) #Pull out the date component

	#Separate by custom time periods (adjust dates as needed for your analysis)
	chat <- chat %>%
	mutate(
	# Message Segments - customize these time periods for your own analysis
	timeblock = case_when(
	time >= dmy("01/01/2023") & time <= dmy("31/01/2023") ~ "Period 1",
	time >= dmy("01/02/2023") & time <= dmy("28/02/2023") ~ "Period 2",
	time >= dmy("01/03/2023") & time <= dmy("31/03/2023") ~ "Period 3",
	time >= dmy("01/04/2023") & time <= dmy("30/04/2023") ~ "Period 4",
	time >= dmy("01/05/2023") & time <= dmy("31/05/2023") ~ "Period 5",
	time >= dmy("01/06/2023") & time <= dmy("30/06/2023") ~ "Period 6",
	time >= dmy("01/07/2023") & time <= dmy("31/07/2023") ~ "Period 7",
	time >= dmy("01/08/2023") & time <= dmy("31/08/2023") ~ "Period 8",
	time >= dmy("01/09/2023") & time <= dmy("30/09/2023") ~ "Period 9",
	time >= dmy("01/10/2023") & time <= dmy("31/10/2023") ~ "Period 10",
	time >= dmy("01/11/2023") & time <= dmy("30/11/2023") ~ "Period 11",
	time >= dmy("01/12/2023") & time <= dmy("31/12/2023") ~ "Period 12",
	T ~ "Other")) %>%
	mutate(timeblock = factor(timeblock)) #make this a factor so that you can count it

	#Classify the times into hourly intervals
	chat <- chat %>%
	mutate(dayblock = format(time, "%I %p")) %>%
	mutate(dayblock = factor(dayblock))
	```

	```{r timeline}
	#Timeline of Messages with Plotly
	counts <- data.frame(table(chat$day))
	fig <- plot_ly(counts, x = ~Var1, y = ~Freq, type = 'scatter', mode = 'lines+markers',
	line = list(color = 'royalblue', width = 4),
	marker = list(color = 'royalblue', width = .4),
	width=1500, height=500) %>%
	layout(title = "Messages over time - Group Chat Analysis",
	xaxis = list(title = "Date"),
	yaxis = list(title = "No. of Messages"))
	fig

	#Timeline by Time Periods
	units <- chat %>%
	group_by(date) %>%
	count(timeblock)
	```

	```{r people}
	people <- chat %>%
	count(author)

	#Plot messaging activity by user
	people$author <- factor(people$author, levels = unique(people$author)[order(people$n, decreasing = FALSE)])

	ax_x <- list(
	showticklabels = FALSE,
	title = "Author (anonymized for privacy)"
	)

	ax_y <- list(title = "Number of Messages")

	people_fig <- plot_ly(people, x = ~author, y = ~n, type = "bar", name = 'Messaging Activity')
	people_fig <- people_fig %>% layout(xaxis = ax_x, yaxis = ax_y, title = "Number of Messages by Group Members")

	people_fig

	#Export As HTML Widget (specify your own path)
	# saveWidget(people_fig, file="output/message_activity.html")
	```

	Sentiment analysis using the tidytext approach.
	This code is adapted from Julia Silge's excellent tidytextmining book:
	https://www.tidytextmining.com/sentiment.html

	```{r sentiment}
	#Unnest chat into one word per row, excluding WhatsApp service messages
	tidy_chat <- chat %>%
	filter(text != "This message was deleted\|<Media omitted>") %>%
	group_by(author, day) %>%
	ungroup() %>%
	unnest_tokens(word, text)

	#Use the Bing emotion lexicon to calculate sentiment
	chat_sentiment <- tidy_chat %>%
	inner_join(get_sentiments("bing")) %>%
	count(index = date, sentiment) %>%
	spread(sentiment, n, fill = 0) %>%
	mutate(sentiment = positive - negative)

	#Plot sentiment over time
	fig_sent <- plot_ly(chat_sentiment, x = ~index, y = ~sentiment, type = 'scatter', mode = 'lines+markers',
	line = list(color = 'royalblue', width = 4),
	marker = list(color = 'royalblue', width = .4),
	width=1500, height=500) %>%
	layout(title = "Sentiment Analysis Over Time",
	xaxis = list(title = "Date"),
	yaxis = list(title = "Sentiment Score"))
	fig_sent
	```

	Analysis of agreement patterns in the group chat.

	```{r agreement_analysis}
	#Count the Number of Messages per person
	chat_count <- chat %>%
	group_by(author) %>%
	count(author, name = "n_texts")

	#Group agreement expressions by Author and Time Period
	agreement_count <- chat %>%
	filter(text == "+1" \| text == "agree" \| text == "yes" \| text == "👍") %>%
	group_by(timeblock, author) %>%
	count(author, name = "agreements")

	#Percentage of agreement by author
	counts_by_author <- left_join(chat_count, agreement_count, by = c("author")) %>%
	filter(!is.na(agreements)) %>%
	mutate(agreement_perc = (agreements/n_texts)*100)

	#Round percentages
	counts_by_author$agreement_perc <- counts_by_author$agreement_perc %>% round(2)

	#Find users who only send agreement messages
	only_agreement_users <- counts_by_author %>%
	filter(agreements != "NA") %>%
	mutate(other_messages = n_texts - agreements) %>%
	filter(other_messages == 0)

	print("Users who only send agreement messages:")
	print(nrow(only_agreement_users))
	```

	Counting shared links and surveys in the group

	```{r links_and_surveys}
	#Number of WhatsApp group invite links
	chat %>%
	filter(str_detect(text, "^(https?://)?chat.whatsapp.com/(?:invite/)?([a-zA-Z0-9_-]{22})$")==TRUE) %>%
	count()

	#Survey links (most survey platforms use 'forms' in their URLs)
	chat %>%
	filter(str_detect(text, "forms\\.")==TRUE) %>%
	count()

	surveys <- chat %>%
	filter(str_detect(text, "forms\\.")== TRUE)

	print("Survey links shared:")
	print(nrow(surveys))
	```

	Analysis of frequently mentioned topics or keywords

	```{r keyword_analysis}
	# Define keywords to search for (customize based on your group's context)
	keywords <- c("meeting", "deadline", "project", "update", "help", "question")

	keyword_mentions <- data.frame(
	keyword = character(),
	mentions = numeric(),
	stringsAsFactors = FALSE
	)

	for(keyword in keywords) {
	count <- chat %>%
	filter(str_detect(tolower(text), tolower(keyword))) %>%
	nrow()

	keyword_mentions <- rbind(keyword_mentions, data.frame(keyword = keyword, mentions = count))
	}

	# Plot keyword frequency
	fig_keywords <- plot_ly(keyword_mentions, x = ~mentions, y = ~keyword, type = 'scatter', mode = 'markers',
	color = ~keyword, size = ~mentions,
	marker = list(opacity = 1, sizemode = 'diameter'))
	fig_keywords <- fig_keywords %>% layout(
	xaxis = list(showgrid = FALSE, title = "Number of Mentions"),
	yaxis = list(showgrid = TRUE, title = "Keywords"),
	backgroundcolor = "#f2f2f2",
	showlegend = FALSE,
	title = "Keyword Frequency Analysis"
	)

	fig_keywords
	```

	Messages by Day of the Week

	```{r day_analysis}
	# MESSAGES PER DAY OF THE WEEK
	chat %>%
	mutate(wday.num = wday(day),
	wday.name = weekdays(day)) %>%
	group_by(timeblock, wday.num, wday.name) %>%
	count() %>%
	ggplot(aes(x = reorder(wday.name, -wday.num), y = n, fill=timeblock)) +
	geom_bar(stat = "identity") +
	ylab("Number of Messages") + xlab("Day of Week") +
	coord_flip() +
	ggtitle("Message Activity by Day of Week and Time Period") +
	theme_minimal() +
	theme(legend.title = element_blank(),
	legend.position = "bottom")
	```

	Question Analysis

	```{r question_analysis}
	questions <- question_type(chat$text) #Extract all questions automatically
	questions_text <- questions[["raw"]][["strip.text"]] #store questions separately
	questions_text <- as.data.frame(questions_text) #convert to dataframe

	#Categorize questions by type
	questions_type <- questions_text %>%
	mutate(qtype = case_when(
	str_detect(questions_text, "who") ~ "who",
	str_detect(questions_text,"how") ~ "how",
	str_detect(questions_text, "whose") ~ "whose",
	str_detect(questions_text, "whom") ~ "whom",
	str_detect(questions_text, "where") ~ "where",
	str_detect(questions_text, "what") ~ "what",
	str_detect(questions_text, "which") ~ "which",
	str_detect(questions_text, "why") ~ "why",
	str_detect(questions_text, "when") ~ "when",
	str_detect(questions_text, "were") ~ "were",
	str_detect(questions_text, "was") ~ "was",
	str_detect(questions_text, "does") ~ "does",
	str_detect(questions_text, "did") ~ "did",
	str_detect(questions_text, "do") ~ "do",
	str_detect(questions_text, "\\bis\\b") ~ "is",
	str_detect(questions_text, "are") ~ "are",
	str_detect(questions_text, "will") ~ "will",
	str_detect(questions_text, "should") ~ "should",
	str_detect(questions_text, "could") ~ "could",
	str_detect(questions_text, "shall") ~ "shall",
	str_detect(questions_text, "might") ~ "might",
	str_detect(questions_text, "can") ~ "can",
	str_detect(questions_text, "has") ~ "has",
	str_detect(questions_text, "had") ~ "had",
	str_detect(questions_text, "huh") ~ "huh",
	str_detect(questions_text, "anyone") ~ "anyone",
	T ~ "other")) %>%
	mutate(qtype = factor(qtype))

	#Summary of question types
	questions_breakdown <- questions_type %>%
	count(qtype) %>%
	arrange(-n)

	print("Question type breakdown:")
	print(questions_breakdown)
	```

	N-gram Analysis for Common Phrases

	```{r ngram_analysis}
	# Remove URLs and media messages for cleaner analysis
	url_pattern <- "http[s]?://(?:[a-zA-Z]\|[0-9]\|[$-_@.&+]\|[!*\\(\\),]\|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

	chat_clean <- chat %>%
	filter(str_detect(text, "<Media omitted>\|This message was deleted\|\\.vcf") == FALSE) %>%
	filter(str_detect(text, url_pattern) == FALSE)

	# Generate trigrams (3-word phrases)
	trigrams <- chat_clean %>%
	unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
	filter(!is.na(trigram)) %>%
	count(trigram, sort = TRUE)

	print("Most common 3-word phrases:")
	print(head(trigrams, 10))

	# Generate bigrams (2-word phrases)
	bigrams <- chat_clean %>%
	unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
	filter(!is.na(bigram)) %>%
	count(bigram, sort = TRUE)

	print("Most common 2-word phrases:")
	print(head(bigrams, 10))
	```

	Daily Active Users Analysis

	```{r daily_active_users}
	# Group by date and count number of distinct users
	daily_active_users <- chat %>%
	group_by(date) %>%
	summarise(active_users = n_distinct(author),
	total_messages = n()) %>%
	arrange(date)

	# Plot daily active users over time
	fig_dau <- plot_ly(daily_active_users, x = ~date, y = ~active_users, type = 'scatter', mode = 'lines+markers',
	line = list(color = 'green', width = 3),
	marker = list(color = 'green', size = 6),
	width=1500, height=500) %>%
	layout(title = "Daily Active Users Over Time",
	xaxis = list(title = "Date"),
	yaxis = list(title = "Number of Active Users"))

	fig_dau
	```

	Hourly Activity Pattern

	```{r hourly_activity}
	# Analyze messaging patterns by hour of day
	hourly_activity <- chat %>%
	mutate(hour = hour(time)) %>%
	group_by(hour) %>%
	count() %>%
	arrange(hour)

	# Plot hourly activity
	fig_hourly <- plot_ly(hourly_activity, x = ~hour, y = ~n, type = 'bar',
	marker = list(color = 'orange')) %>%
	layout(title = "Message Activity by Hour of Day",
	xaxis = list(title = "Hour of Day (24-hour format)"),
	yaxis = list(title = "Number of Messages"))

	fig_hourly
	```

	Message Length Analysis

	```{r message_length}
	# Calculate message length statistics
	chat_with_length <- chat %>%
	mutate(message_length = nchar(text)) %>%
	filter(message_length > 0) # Remove empty messages

	length_stats <- chat_with_length %>%
	summarise(
	avg_length = mean(message_length, na.rm = TRUE),
	median_length = median(message_length, na.rm = TRUE),
	max_length = max(message_length, na.rm = TRUE),
	min_length = min(message_length, na.rm = TRUE)
	)

	print("Message length statistics:")
	print(length_stats)

	# Plot message length distribution
	fig_length <- plot_ly(chat_with_length, x = ~message_length, type = 'histogram',
	marker = list(color = 'purple', opacity = 0.7)) %>%
	layout(title = "Distribution of Message Lengths",
	xaxis = list(title = "Message Length (characters)"),
	yaxis = list(title = "Frequency"))

	fig_length
	```