library(dplyr)
library(ggplot2)
library(ggh4x)
library(geomtextpath)
set.seed(12345)
N <- 150
data <- data.frame(response = c(rnorm(N, mean = 1, sd = 1),
rnorm(N, mean = 3, sd = 1),
rnorm(N, mean = 5, sd = 1)),
group = factor(rep(LETTERS[1:3], each=N)))
model <- lm(response ~ group, data = data)
data %>%
summarize(broom::tidy(shapiro.test(response)), .by = group) %>%
bind_rows(broom::tidy(shapiro.test(residuals(model))) %>%
mutate(group="Residuals")) -> resids
# show the 1:1 mapping
# data %>%
# mutate(group_centered_response = response - mean(response), .by = group) %>%
# mutate(residual = residuals(model)) %>%
# mutate(equal = all.equal(unname(group_centered_response), unname(residual)))
# all.equal needed as OLS estimates are a a little off by -6.661338e-16 ... -1.052491e-13
# it's a numerical, not theoretical issue.
data %>%
mutate(group_centered_response = response - mean(response), .by = group) %>%
ggplot(aes(x=group_centered_response, col=group)) +
geom_density(data = data %>% mutate(residual = residuals(model)),
aes(x=residual), inherit.aes = FALSE, fill="purple", col=NA, alpha=.3) +
geom_textdensity(aes(label= sprintf("group: %s", group)),
size = 5, fontface = 2,
hjust = "ymax", vjust = 0.3,
show.legend = FALSE)+
theme_light() +
geom_label(data = resids %>%
summarize(txt=paste("Shapiro-Wilk\n",
paste(sprintf("%s: %s",
group,
scales::pvalue(p.value, add_p = TRUE)),
collapse = "\n"))) %>%
mutate(group="Residuals"),
aes(x=-Inf, y=Inf, label=txt),
vjust="top", hjust="left",
col="black") +
annotate(geom="text", x=0, y=0.1, label="residuals", col="purple", size=6)+
labs(title = "Group-centered response vs. residuals", subtitle = sprintf("N = %d", N)) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank()) +
geom_vline(xintercept = 0, linetype="dashed", col="gray40") -> p1
bind_rows(
data.frame(distribution = "model residuals",
data = residuals(model)),
data.frame(distribution= "mixture of group raw",
data = data %>%
mutate(group_centered_response = response - mean(response), .by = group) %>%
pull(group_centered_response))) %>%
ggplot(aes(x=data, col=distribution)) +
geom_density() +
stat_theodensity(distri="norm", aes(col="Gaussian"), linewidth=1.5) +
scale_color_manual(values=c("Gaussian" = "red", "model residuals"="green", "mixture of group raw" ="blue")) +
theme_light() +
theme(panel.grid.minor = element_blank(),
legend.position = "top") +
labs(title = "Residuals vs. mixture of group-centered raw responses",
subtitle = "You can see only one empirical density as both 1:1 overlap") -> p2
patchwork::wrap_plots(p1, p2, ncol = 1) +
patchwork::plot_annotation(title="ANOVA: Normality of per-group raw data vs. residuals",
theme = theme(plot.title = element_text(size=16, face = "bold")))
Created
May 30, 2025 15:25
-
-
Save adrianolszewski/78f9d637535438e5b35ea75b491a57b6 to your computer and use it in GitHub Desktop.
ANOVA: distribution of residuals = mixture of distribtions of per-group mean-centered raw data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment