andrewheiss · November 16, 2025 08:33
diff --git a/the_office_sentiment_all_episodes.R b/the_office_sentiment_all_episodes.R
 library(tidyverse)
 library(tidytext)
 library(schrute)
 library(rcartocolor)
 library(ggh4x)

 # Tokenize the transcript so that every word sits on its own row, keeping a
 # factor copy of the season and discarding anything listed in stop_words
 office_lines <- mutate(schrute::theoffice, season_cat = factor(season))
 office_tokens <- unnest_tokens(office_lines, output = word, input = text)
 all_words <- anti_join(office_tokens, stop_words, by = join_by(word))

 # Attach an AFINN score to each word; the inner join keeps only the words
 # that appear in the lexicon
 afinn_lexicon <- get_sentiments("afinn")
 words_afinn <- inner_join(all_words, afinn_lexicon, by = join_by(word))


 # Make the manual facet grid

 # PHEW this is convoluted! The seasons have different numbers of episodes *and*
 # some episodes are doubled (like S3E10 and S3E11). To get the grid to look nice
 # with one row per season, we need to make a matrix that has a value for each
 # episode and NA for gaps. Like, if there were three short seasons with 2, 3,
 # and 5 episodes each (and one of the episodes in the first season was doubled),
 # the matrix would need to look like this:
 #
 # 1  1  2  NA NA
 # 3  4  5  NA NA
 # 6  7  8  9  10
 #

 # Find all the seasons and episodes and make a sequential index.
 # row_number() is used instead of 1:n() so that an empty input yields zero
 # rows rather than an error from the length-two sequence c(1, 0).
 episode_details <- schrute::theoffice |>
  distinct(season, episode) |>
  mutate(episode_number = row_number())

 # Record the highest episode number that appears within each season
 episodes_by_season <- group_by(episode_details, season)
 season_lengths <- summarize(episodes_by_season, max_ep = max(episode))

 # Make a new skeleton dataframe with one row for every episode slot from 1 to
 # each season's highest episode number. Slots with no row of their own in
 # episode_details get NA from the join and then inherit the index of the row
 # above (so S3E10 and S3E11 are both overall episode 38).
 episode_details_doubled <- season_lengths |>
  rowwise() |>
  reframe(
    season = season,
    # seq_len() never counts backwards the way 1:max_ep would for max_ep = 0
    episode = seq_len(max_ep)
  ) |>
  left_join(episode_details, by = join_by(season, episode)) |>
  fill(episode_number, .direction = "down")

 # Build a matrix with rows for seasons and columns for episodes. Each cell
 # holds the sequential episode index for that season/episode slot; slots past
 # the end of a shorter season have no match in the join and stay NA.
 layout_matrix <- expand_grid(
  # Use the season values themselves rather than 1:nrow() so the join still
  # matches when the seasons are not numbered exactly 1..N
  season = season_lengths$season,
  episode = seq_len(max(season_lengths$max_ep))
 ) |>
  left_join(
    episode_details_doubled,
    by = join_by(season, episode)
  ) |>
  arrange(season, episode) |>
  pull(episode_number) |>
  matrix(
    nrow = nrow(season_lengths),
    ncol = max(season_lengths$max_ep),
    byrow = TRUE
  )


 # Finally plot this thing using that layout for facets.
 # Rows are binned by integer-dividing the index column by 30, the AFINN values
 # in each bin are averaged, and every episode is drawn in its own panel whose
 # position comes from layout_matrix.
 # NOTE(review): the integers in layout_matrix appear to be matched to the
 # season_episode panels by position, which relies on the "SxEyy" labels
 # sorting in broadcast order; confirm against the ggh4x documentation.
 words_afinn |>
  group_by(season_cat, episode, index = index %/% 30) |>
  # Drop the grouping here: otherwise summarize() prints a regrouping message
  # and passes on a tibble that is still grouped by season_cat and episode
  summarize(avg_sentiment = mean(value), .groups = "drop") |>
  mutate(season_episode = paste0(
    "S", season_cat, "E", str_pad(episode, width = 2, pad = "0")
  )) |>
  ggplot(aes(x = index, y = avg_sentiment, fill = avg_sentiment)) +
  geom_col() +
  scale_fill_carto_c(
    palette = "Temps",
    direction = -1,
    limits = c(-5, 5),
    guide = "none"
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Average sentiment across all 9 seasons of The Office",
    subtitle = "Each bar represents the average sentiment over 30 lines of dialogue",
    caption = "Source: {schrute}"
  ) +
  facet_manual(vars(season_episode), design = layout_matrix, scales = "free_x") +
  coord_cartesian(ylim = c(-1.5, 2.5)) +
  theme_void(base_family = "Archivo Narrow") +
  theme(
    strip.text = element_text(size = 6, hjust = 0),
    panel.background = element_rect(fill = "grey97"),
    plot.title = element_text(face = "bold", margin = margin(b = 4)),
    plot.subtitle = element_text(size = rel(0.9), margin = margin(b = 8)),
    plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "lines"),
    plot.caption = element_text(size = rel(0.7), hjust = 0)
  ) +
  ggview::canvas(width = 10, height = 5.5)
	library(tidyverse)
	library(tidytext)
	library(schrute)
	library(rcartocolor)
	library(ggh4x)

	# Get all the words as single rows: one row per word of dialogue, with a
	# factor copy of the season and anything listed in stop_words removed.
	# (The pipes were markdown-escaped as "\|>", which R cannot parse.)
	all_words <- schrute::theoffice |>
	  mutate(season_cat = factor(season)) |>
	  unnest_tokens(output = word, input = text) |>
	  anti_join(stop_words, by = join_by(word))

	# Join the AFINN sentiment values to the words; the inner join keeps only
	# the words that appear in the lexicon.
	# (The pipe was markdown-escaped as "\|>", which R cannot parse.)
	words_afinn <- all_words |>
	  inner_join(get_sentiments("afinn"), by = join_by(word))


	# Make the manual facet grid

	# PHEW this is convoluted! The seasons have different numbers of episodes and
	# some episodes are doubled (like S3E10 and S3E11). To get the grid to look nice
	# with one row per season, we need to make a matrix that has a value for each
	# episode and NA for gaps. Like, if there were three short seasons with 2, 3,
	# and 5 episodes each (and one of the episodes in the first season was doubled),
	# the matrix would need to look like this:
	#
	# 1  1  2  NA NA
	# 3  4  5  NA NA
	# 6  7  8  9  10
	#

	# Find all the seasons and episodes and make a sequential index.
	# row_number() is used instead of 1:n() so that an empty input yields zero
	# rows rather than an error from the length-two sequence c(1, 0).
	# (The pipes were markdown-escaped as "\|>", which R cannot parse.)
	episode_details <- schrute::theoffice |>
	  distinct(season, episode) |>
	  mutate(episode_number = row_number())

	# Find the highest episode number that appears within each season.
	# (The pipes were markdown-escaped as "\|>", which R cannot parse.)
	season_lengths <- episode_details |>
	  group_by(season) |>
	  summarize(max_ep = max(episode))

	# Make a new skeleton dataframe with one row for every episode slot from 1
	# to each season's highest episode number. Slots with no row of their own in
	# episode_details get NA from the join and then inherit the index of the row
	# above (so S3E10 and S3E11 are both overall episode 38).
	# (The pipes were markdown-escaped as "\|>", which R cannot parse.)
	episode_details_doubled <- season_lengths |>
	  rowwise() |>
	  reframe(
	    season = season,
	    # seq_len() never counts backwards the way 1:max_ep would for max_ep = 0
	    episode = seq_len(max_ep)
	  ) |>
	  left_join(episode_details, by = join_by(season, episode)) |>
	  fill(episode_number, .direction = "down")

	# Build a matrix with rows for seasons and columns for episodes. Each cell
	# holds the sequential episode index for that season/episode slot; slots
	# past the end of a shorter season have no match in the join and stay NA.
	# (The pipes were markdown-escaped as "\|>", which R cannot parse.)
	layout_matrix <- expand_grid(
	  # Use the season values themselves rather than 1:nrow() so the join still
	  # matches when the seasons are not numbered exactly 1..N
	  season = season_lengths$season,
	  episode = seq_len(max(season_lengths$max_ep))
	) |>
	  left_join(
	    episode_details_doubled,
	    by = join_by(season, episode)
	  ) |>
	  arrange(season, episode) |>
	  pull(episode_number) |>
	  matrix(
	    nrow = nrow(season_lengths),
	    ncol = max(season_lengths$max_ep),
	    byrow = TRUE
	  )


	# Finally plot this thing using that layout for facets.
	# Rows are binned by integer-dividing the index column by 30, the AFINN
	# values in each bin are averaged, and every episode is drawn in its own
	# panel whose position comes from layout_matrix.
	# (The pipes were markdown-escaped as "\|>", which R cannot parse.)
	# NOTE(review): the integers in layout_matrix appear to be matched to the
	# season_episode panels by position, which relies on the "SxEyy" labels
	# sorting in broadcast order; confirm against the ggh4x documentation.
	words_afinn |>
	  group_by(season_cat, episode, index = index %/% 30) |>
	  # Drop the grouping here: otherwise summarize() prints a regrouping message
	  # and passes on a tibble that is still grouped by season_cat and episode
	  summarize(avg_sentiment = mean(value), .groups = "drop") |>
	  mutate(season_episode = paste0(
	    "S", season_cat, "E", str_pad(episode, width = 2, pad = "0")
	  )) |>
	  ggplot(aes(x = index, y = avg_sentiment, fill = avg_sentiment)) +
	  geom_col() +
	  scale_fill_carto_c(
	    palette = "Temps",
	    direction = -1,
	    limits = c(-5, 5),
	    guide = "none"
	  ) +
	  labs(
	    x = NULL,
	    y = NULL,
	    title = "Average sentiment across all 9 seasons of The Office",
	    subtitle = "Each bar represents the average sentiment over 30 lines of dialogue",
	    caption = "Source: {schrute}"
	  ) +
	  facet_manual(vars(season_episode), design = layout_matrix, scales = "free_x") +
	  coord_cartesian(ylim = c(-1.5, 2.5)) +
	  theme_void(base_family = "Archivo Narrow") +
	  theme(
	    strip.text = element_text(size = 6, hjust = 0),
	    panel.background = element_rect(fill = "grey97"),
	    plot.title = element_text(face = "bold", margin = margin(b = 4)),
	    plot.subtitle = element_text(size = rel(0.9), margin = margin(b = 8)),
	    plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "lines"),
	    plot.caption = element_text(size = rel(0.7), hjust = 0)
	  ) +
	  ggview::canvas(width = 10, height = 5.5)
No results found