Skip to content

Instantly share code, notes, and snippets.

@tomsing1
Last active March 20, 2026 21:42
Show Gist options
  • Select an option

  • Save tomsing1/2c05a203e48fd8cac03d9a37100b6357 to your computer and use it in GitHub Desktop.

Select an option

Save tomsing1/2c05a203e48fd8cac03d9a37100b6357 to your computer and use it in GitHub Desktop.
ROUT statistical test for outliers in R
library(dplyr)
library(ggplot2)
library(OptimModel)
#------ Univariate: test a vector of values against a constant (e.g. the median)
m <- 33
stdev <- 5
set.seed(345)
df <- data.frame(
value = rnorm(36, mean = m, sd = stdev)
) |>
mutate(index = row_number())
# Inject two outliers
df$value[8] <- df$value[8] + 20
df$value[28] <- df$value[28] - 18
set.seed(123L)
# (ab)use the linear_model with only the intercept and a slope of zero
x_mat <- cbind(1, seq_along(df$value)) # two columns: intercept and index
fit <- rout_fitter(
# intercept, slope, log(sigma) => I am using robust statistics (median, MAD)
theta0 = c(median(df$value), 0, log(mad(df$value))),
f.model = linear_model,
x = x_mat,
y = df$value,
Q = 0.05
)
print(fit)
fit$outlier # nominal p-values
fit$outlier.adj # FDR corrected
point_df <- df |>
mutate(outlier = fit$outlier.adj)
ggplot(point_df, aes(x = "", y = value)) +
geom_boxplot(outlier.shape = NA, width = 0.3, fill = "grey90") +
geom_jitter(aes(shape = outlier, fill = outlier),
width = 0.08, size = 3, color = "black", stroke = 0.5) +
scale_shape_manual(values = c("FALSE" = 21, "TRUE" = 24),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")) +
scale_fill_manual(values = c("FALSE" = "black", "TRUE" = "red"),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")) +
labs(
title = "Univariate ROUT Outlier Detection (via Linear Model)",
x = NULL,
y = "Value",
shape = "Point type",
fill = "Point type"
) +
theme_bw() +
theme(legend.position = "bottom")
#------- Linear regression
# Simulate linear data: y = 2 + 1.5*x + noise
set.seed(42)
n <- 40
x_vec <- seq(1, 20, length.out = n)
y <- 2 + 1.5 * x_vec + rnorm(n, sd = 2)
# Inject two outliers (large effect)
y[38] <- y[38] - 20
y[40] <- y[40] - 28
# linear_model expects x as a matrix: cbind(intercept, slope)
x_mat <- cbind(1, x_vec)
# Fit with ROUT
fit <- rout_fitter(
theta0 = c(2, 1.5, log(2)),
f.model = linear_model,
x = x_mat,
y = y,
Q = 0.05
)
print(fit)
cat("Outlier indices:", which(fit$outlier.adj), "\n")
# --- Fitted curves ---
x_seq <- seq(min(x_vec), max(x_vec), length.out = 200)
# 1. ROUT fit
theta_rout <- fit$par[-length(fit$par)]
curve_rout <- data.frame(
x = x_seq,
y = linear_model(theta_rout, cbind(1, x_seq)),
model = "ROUT fit"
)
# 2. OLS on cleaned data (outliers removed)
x_clean <- x_vec[!fit$outlier.adj]
y_clean <- y[!fit$outlier.adj]
ols_clean <- lm(y_clean ~ x_clean)
curve_clean <- data.frame(
x = x_seq,
y = predict(ols_clean, newdata = data.frame(x_clean = x_seq)),
model = "OLS (outliers removed)"
)
# 3. OLS on full data (outliers retained)
ols_full <- lm(y ~ x_vec)
curve_full <- data.frame(
x = x_seq,
y = predict(ols_full, newdata = data.frame(x_vec = x_seq)),
model = "OLS (all data)"
)
curve_df <- rbind(curve_rout, curve_clean, curve_full)
# Data points
point_df <- data.frame(
x = x_vec,
y = y,
outlier = fit$outlier.adj
)
ggplot() +
geom_line(
data = curve_df,
aes(x = x, y = y, color = model, linetype = model),
linewidth = 1
) +
geom_point(
data = point_df,
aes(x = x, y = y, shape = outlier, fill = outlier),
size = 3,
color = "black",
stroke = 0.5
) +
scale_color_manual(
values = c(
"ROUT fit" = "steelblue",
"OLS (outliers removed)" = "darkorange",
"OLS (all data)" = "firebrick"
)
) +
scale_linetype_manual(
values = c(
"ROUT fit" = "solid",
"OLS (outliers removed)" = "dashed",
"OLS (all data)" = "dotted"
)
) +
scale_shape_manual(
values = c("FALSE" = 21, "TRUE" = 24),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")
) +
scale_fill_manual(
values = c("FALSE" = "black", "TRUE" = "red"),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")
) +
labs(
title = "Linear Model Fit with ROUT Outlier Detection",
x = "x",
y = "y",
color = "Model",
linetype = "Model",
shape = "Point type",
fill = "Point type"
) +
theme_bw() +
theme(legend.position = "bottom")
#------- Polynomial (e.g. quadratic)
# Simulate quadratic data: y = 2 + 1.5*x - 0.1*x^2 + noise
set.seed(42)
n <- 50
x_vec <- seq(1, 20, length.out = n)
y <- 2 + 1.5 * x_vec - 0.1 * x_vec^2 + rnorm(n, sd = 1.5)
# Inject two outliers (larger effect this time)
y[15] <- y[15] + 25
y[38] <- y[38] - 22
# Define quadratic model: theta[1] + theta[2]*x + theta[3]*x^2
quad_model <- function(theta, x) theta[1] + theta[2] * x + theta[3] * x^2
# Fit with ROUT
fit <- rout_fitter(
theta0 = c(2, 1.5, -0.1, log(1.5)),
f.model = quad_model,
x = x_vec,
y = y,
Q = 0.05
)
print(fit)
cat("Outlier indices:", which(fit$outlier.adj), "\n")
# --- Fitted curves ---
x_seq <- seq(min(x_vec), max(x_vec), length.out = 200)
# 1. ROUT fit
theta_rout <- fit$par[-length(fit$par)]
curve_rout <- data.frame(
x = x_seq,
y = quad_model(theta_rout, x_seq),
model = "ROUT fit"
)
# 2. OLS on cleaned data (outliers removed)
x_clean <- x_vec[!fit$outlier.adj]
y_clean <- y[!fit$outlier.adj]
ols_clean <- lm(y_clean ~ x_clean + I(x_clean^2))
curve_clean <- data.frame(
x = x_seq,
y = predict(ols_clean, newdata = data.frame(x_clean = x_seq)),
model = "OLS (outliers removed)"
)
# 3. OLS on full data (outliers retained)
ols_full <- lm(y ~ x_vec + I(x_vec^2))
curve_full <- data.frame(
x = x_seq,
y = predict(ols_full, newdata = data.frame(x_vec = x_seq)),
model = "OLS (all data)"
)
curve_df <- rbind(curve_rout, curve_clean, curve_full)
# Data points
point_df <- data.frame(
x = x_vec,
y = y,
outlier = fit$outlier.adj
)
ggplot() +
geom_line(
data = curve_df,
aes(x = x, y = y, color = model, linetype = model),
linewidth = 1
) +
geom_point(
data = point_df,
aes(x = x, y = y, shape = outlier, fill = outlier),
size = 3,
color = "black",
stroke = 0.5
) +
scale_color_manual(
values = c(
"ROUT fit" = "steelblue",
"OLS (outliers removed)" = "darkorange",
"OLS (all data)" = "firebrick"
)
) +
scale_linetype_manual(
values = c(
"ROUT fit" = "solid",
"OLS (outliers removed)" = "dashed",
"OLS (all data)" = "dotted"
)
) +
scale_shape_manual(
values = c("FALSE" = 21, "TRUE" = 24),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")
) +
scale_fill_manual(
values = c("FALSE" = "black", "TRUE" = "red"),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")
) +
labs(
title = "Quadratic Model Fit with ROUT Outlier Detection",
x = "x",
y = "y",
color = "Model",
linetype = "Model",
shape = "Point type",
fill = "Point type"
) +
theme_bw() +
theme(legend.position = "bottom")
#------- Hill model = 4-parameter logistic model or 4PL
# Note: the hill5_model (5PL) is preferred over 4PL when the upper and lower
# plateaus are approached at different rates.
# Simulate Hill model data
# Parameters: emin=0, emax=100, EC50=0.5 (so log(EC50)=log(0.5)), slope=2
set.seed(42)
theta_true <- c(emin = 0, emax = 100, lec50 = log(0.5), m = 2)
x <- rep(c(0, 2^(-4:4)), each = 4) # doses: 0, 0.0625, 0.125, ..., 16
y <- hill_model(theta_true, x) + rnorm(length(x), sd = 3)
# Inject two outliers
y[8] <- y[8] - 40
y[20] <- y[20] + 100
# Fit with ROUT
fit <- rout_fitter(
theta0 = c(0, 100, log(0.5), 2, log(3)),
f.model = hill_model,
x = x,
y = y,
Q = 0.05
)
print(fit)
which(fit$outlier.adj)
# --- Fitted curves ---
x_seq <- c(0, 2^seq(-4, 4, length.out = 200))
# 1. ROUT fit
theta_rout <- fit$par[-length(fit$par)]
curve_rout <- data.frame(
x = x_seq,
y = hill_model(theta_rout, x_seq),
model = "ROUT fit"
)
# 2. OLS on cleaned data (outliers removed)
fit_clean <- optim_fit(
theta0 = NULL,
f.model = hill_model,
x = x[!fit$outlier.adj],
y = y[!fit$outlier.adj]
)
curve_clean <- data.frame(
x = x_seq,
y = hill_model(coef(fit_clean), x_seq),
model = "OLS (outliers removed)"
)
# 3. OLS on full data (outliers retained)
fit_full <- optim_fit(theta0 = NULL, f.model = hill_model, x = x, y = y)
curve_full <- data.frame(
x = x_seq,
y = hill_model(coef(fit_full), x_seq),
model = "OLS (all data)"
)
curve_df <- rbind(curve_rout, curve_clean, curve_full)
# Data points
point_df <- data.frame(
x = x,
y = y,
outlier = fit$outlier.adj
)
ggplot() +
geom_line(
data = curve_df,
aes(x = x, y = y, color = model, linetype = model),
linewidth = 1
) +
geom_point(
data = point_df,
aes(x = x, y = y, shape = outlier, fill = outlier),
size = 3,
color = "black",
stroke = 0.5
) +
scale_color_manual(
values = c(
"ROUT fit" = "steelblue",
"OLS (outliers removed)" = "darkorange",
"OLS (all data)" = "firebrick"
)
) +
scale_linetype_manual(
values = c(
"ROUT fit" = "solid",
"OLS (outliers removed)" = "dashed",
"OLS (all data)" = "dotted"
)
) +
scale_shape_manual(
values = c("FALSE" = 21, "TRUE" = 24),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")
) +
scale_fill_manual(
values = c("FALSE" = "black", "TRUE" = "red"),
labels = c("FALSE" = "Normal", "TRUE" = "Outlier")
) +
scale_x_log10(breaks = c(0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16)) +
labs(
title = "Hill Model Fit with ROUT Outlier Detection",
x = "Dose (log scale)",
y = "Response",
color = "Model",
linetype = "Model",
shape = "Point type",
fill = "Point type"
) +
theme_bw() +
theme(legend.position = "bottom")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment