Skip to content

Instantly share code, notes, and snippets.

@andrewheiss
Last active November 16, 2025 08:33
Show Gist options
  • Select an option

  • Save andrewheiss/32c824a542b3fdea064e61c39edada81 to your computer and use it in GitHub Desktop.

Select an option

Save andrewheiss/32c824a542b3fdea064e61c39edada81 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(tidytext)
library(schrute)
library(rcartocolor)
library(ggh4x)
# Tokenize the dialogue: one row per word, with common stop words removed.
# season_cat is a factor copy of season, used later to build facet labels
all_words <- schrute::theoffice |>
  mutate(season_cat = factor(season)) |>
  unnest_tokens(word, text) |>
  anti_join(stop_words, by = "word")
# Keep only the words that appear in the AFINN lexicon, attaching the lexicon's
# columns to each matching word
words_afinn <- inner_join(
  all_words,
  get_sentiments("afinn"),
  by = "word"
)
# Make the manual facet grid
# PHEW this is convoluted! The seasons have different numbers of episodes *and*
# some episodes are doubled (like S3E10 and S3E11). To get the grid to look nice
# with one row per season, we need to make a matrix that has a value for each
# episode and NA for gaps. Like, if there were three short seasons with 2, 3,
# and 5 episodes each (and one of the episodes in the first season was doubled),
# the matrix would need to look like this:
#
# 1 1 2 NA NA
# 3 4 5 NA NA
# 6 7 8 9 10
#
# List every season/episode pair once, in the order it appears in the data, and
# number the pairs sequentially (1, 2, 3, ...) to get an overall episode index.
# row_number() is used instead of 1:n() so an empty table yields zero rows
# rather than the sequence c(1, 0)
episode_details <- theoffice |>
  distinct(season, episode) |>
  mutate(episode_number = row_number())
# Highest episode number within each season, i.e. how many episode slots that
# season needs in the grid. Sorted by season so row order is deterministic
season_lengths <- episode_details |>
  summarize(max_ep = max(episode), .by = season) |>
  arrange(season)
# Build a skeleton with one row for every episode slot from 1 up to each
# season's highest episode number, then attach the overall episode index.
# Slots with no matching episode (the second half of a doubled episode) inherit
# the index of the slot before them, so S3E10 and S3E11 are both overall
# episode 38
episode_details_doubled <- season_lengths |>
  # seq_len() avoids the c(1, 0) surprise that 1:max_ep gives when max_ep is 0
  reframe(episode = seq_len(max_ep), .by = season) |>
  left_join(episode_details, by = join_by(season, episode)) |>
  fill(episode_number, .direction = "down")
# Build the facet design matrix: one row per season and one column per episode
# slot of the longest season. Cells hold the overall episode index and stay NA
# where a season has no episode for that slot
layout_matrix <- expand_grid(
  # Use the actual season values rather than 1..n so the join below still lines
  # up if the seasons are not numbered consecutively from 1
  season = season_lengths$season,
  episode = seq_len(max(season_lengths$max_ep))
) |>
  left_join(
    episode_details_doubled,
    by = join_by(season, episode)
  ) |>
  # matrix() is filled row by row below, so values must be in season-major order
  arrange(season, episode) |>
  pull(episode_number) |>
  matrix(
    nrow = nrow(season_lengths),
    ncol = max(season_lengths$max_ep),
    byrow = TRUE
  )
# Finally plot this thing using that layout for facets: average the sentiment
# values within bins of 30 consecutive `index` values per episode, then draw one
# small bar chart per episode, positioned according to layout_matrix
words_afinn |>
  group_by(season_cat, episode, index = index %/% 30) |>
  # Drop the grouping explicitly: nothing downstream needs it, and this avoids
  # the "grouped output" message summarize() prints otherwise
  summarize(avg_sentiment = mean(value), .groups = "drop") |>
  # Facet labels like "S3E07"; the episode number is zero-padded to two digits
  mutate(
    season_episode = paste0(
      "S", season_cat, "E", str_pad(episode, width = 2, pad = "0")
    )
  ) |>
  ggplot(aes(x = index, y = avg_sentiment, fill = avg_sentiment)) +
  geom_col() +
  # Fixed, symmetric fill limits so colors are comparable across all panels
  scale_fill_carto_c(
    palette = "Temps",
    direction = -1,
    limits = c(-5, 5),
    guide = "none"
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Average sentiment across all 9 seasons of The Office",
    subtitle = "Each bar represents the average sentiment over 30 lines of dialogue",
    caption = "Source: {schrute}"
  ) +
  # One panel per episode, placed in the season-by-episode grid built above
  facet_manual(
    vars(season_episode),
    design = layout_matrix,
    scales = "free_x"
  ) +
  # Same visible y range in every panel
  coord_cartesian(ylim = c(-1.5, 2.5)) +
  theme_void(base_family = "Archivo Narrow") +
  theme(
    strip.text = element_text(size = 6, hjust = 0),
    panel.background = element_rect(fill = "grey97"),
    plot.title = element_text(face = "bold", margin = margin(b = 4)),
    plot.subtitle = element_text(size = rel(0.9), margin = margin(b = 8)),
    plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), units = "lines"),
    plot.caption = element_text(size = rel(0.7), hjust = 0)
  ) +
  ggview::canvas(width = 10, height = 5.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment