Last active
March 20, 2026 21:42
-
-
Save tomsing1/2c05a203e48fd8cac03d9a37100b6357 to your computer and use it in GitHub Desktop.
ROUT statistical test for outliers in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(dplyr) | |
| library(ggplot2) | |
| library(OptimModel) | |
| #------ Univariate: test a vector of values against a constant (e.g. the median) | |
| m <- 33 | |
| stdev <- 5 | |
| set.seed(345) | |
| df <- data.frame( | |
| value = rnorm(36, mean = m, sd = stdev) | |
| ) |> | |
| mutate(index = row_number()) | |
| # Inject two outliers | |
| df$value[8] <- df$value[8] + 20 | |
| df$value[28] <- df$value[28] - 18 | |
| set.seed(123L) | |
| # (ab)use the linear_model with only the intercept and a slope of zero | |
| x_mat <- cbind(1, seq_along(df$value)) # two columns: intercept and index | |
| fit <- rout_fitter( | |
| # intercept, slope, log(sigma) => I am using robust statistics (median, MAD) | |
| theta0 = c(median(df$value), 0, log(mad(df$value))), | |
| f.model = linear_model, | |
| x = x_mat, | |
| y = df$value, | |
| Q = 0.05 | |
| ) | |
| print(fit) | |
| fit$outlier # nominal p-values | |
| fit$outlier.adj # FDR corrected | |
| point_df <- df |> | |
| mutate(outlier = fit$outlier.adj) | |
| ggplot(point_df, aes(x = "", y = value)) + | |
| geom_boxplot(outlier.shape = NA, width = 0.3, fill = "grey90") + | |
| geom_jitter(aes(shape = outlier, fill = outlier), | |
| width = 0.08, size = 3, color = "black", stroke = 0.5) + | |
| scale_shape_manual(values = c("FALSE" = 21, "TRUE" = 24), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier")) + | |
| scale_fill_manual(values = c("FALSE" = "black", "TRUE" = "red"), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier")) + | |
| labs( | |
| title = "Univariate ROUT Outlier Detection (via Linear Model)", | |
| x = NULL, | |
| y = "Value", | |
| shape = "Point type", | |
| fill = "Point type" | |
| ) + | |
| theme_bw() + | |
| theme(legend.position = "bottom") | |
| #------- Linear regression | |
| # Simulate linear data: y = 2 + 1.5*x + noise | |
| set.seed(42) | |
| n <- 40 | |
| x_vec <- seq(1, 20, length.out = n) | |
| y <- 2 + 1.5 * x_vec + rnorm(n, sd = 2) | |
| # Inject two outliers (large effect) | |
| y[38] <- y[38] - 20 | |
| y[40] <- y[40] - 28 | |
| # linear_model expects x as a matrix: cbind(intercept, slope) | |
| x_mat <- cbind(1, x_vec) | |
| # Fit with ROUT | |
| fit <- rout_fitter( | |
| theta0 = c(2, 1.5, log(2)), | |
| f.model = linear_model, | |
| x = x_mat, | |
| y = y, | |
| Q = 0.05 | |
| ) | |
| print(fit) | |
| cat("Outlier indices:", which(fit$outlier.adj), "\n") | |
| # --- Fitted curves --- | |
| x_seq <- seq(min(x_vec), max(x_vec), length.out = 200) | |
| # 1. ROUT fit | |
| theta_rout <- fit$par[-length(fit$par)] | |
| curve_rout <- data.frame( | |
| x = x_seq, | |
| y = linear_model(theta_rout, cbind(1, x_seq)), | |
| model = "ROUT fit" | |
| ) | |
| # 2. OLS on cleaned data (outliers removed) | |
| x_clean <- x_vec[!fit$outlier.adj] | |
| y_clean <- y[!fit$outlier.adj] | |
| ols_clean <- lm(y_clean ~ x_clean) | |
| curve_clean <- data.frame( | |
| x = x_seq, | |
| y = predict(ols_clean, newdata = data.frame(x_clean = x_seq)), | |
| model = "OLS (outliers removed)" | |
| ) | |
| # 3. OLS on full data (outliers retained) | |
| ols_full <- lm(y ~ x_vec) | |
| curve_full <- data.frame( | |
| x = x_seq, | |
| y = predict(ols_full, newdata = data.frame(x_vec = x_seq)), | |
| model = "OLS (all data)" | |
| ) | |
| curve_df <- rbind(curve_rout, curve_clean, curve_full) | |
| # Data points | |
| point_df <- data.frame( | |
| x = x_vec, | |
| y = y, | |
| outlier = fit$outlier.adj | |
| ) | |
| ggplot() + | |
| geom_line( | |
| data = curve_df, | |
| aes(x = x, y = y, color = model, linetype = model), | |
| linewidth = 1 | |
| ) + | |
| geom_point( | |
| data = point_df, | |
| aes(x = x, y = y, shape = outlier, fill = outlier), | |
| size = 3, | |
| color = "black", | |
| stroke = 0.5 | |
| ) + | |
| scale_color_manual( | |
| values = c( | |
| "ROUT fit" = "steelblue", | |
| "OLS (outliers removed)" = "darkorange", | |
| "OLS (all data)" = "firebrick" | |
| ) | |
| ) + | |
| scale_linetype_manual( | |
| values = c( | |
| "ROUT fit" = "solid", | |
| "OLS (outliers removed)" = "dashed", | |
| "OLS (all data)" = "dotted" | |
| ) | |
| ) + | |
| scale_shape_manual( | |
| values = c("FALSE" = 21, "TRUE" = 24), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier") | |
| ) + | |
| scale_fill_manual( | |
| values = c("FALSE" = "black", "TRUE" = "red"), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier") | |
| ) + | |
| labs( | |
| title = "Linear Model Fit with ROUT Outlier Detection", | |
| x = "x", | |
| y = "y", | |
| color = "Model", | |
| linetype = "Model", | |
| shape = "Point type", | |
| fill = "Point type" | |
| ) + | |
| theme_bw() + | |
| theme(legend.position = "bottom") | |
| #------- Polynomial (e.g. quadratic) | |
| # Simulate quadratic data: y = 2 + 1.5*x - 0.1*x^2 + noise | |
| set.seed(42) | |
| n <- 50 | |
| x_vec <- seq(1, 20, length.out = n) | |
| y <- 2 + 1.5 * x_vec - 0.1 * x_vec^2 + rnorm(n, sd = 1.5) | |
| # Inject two outliers (larger effect this time) | |
| y[15] <- y[15] + 25 | |
| y[38] <- y[38] - 22 | |
| # Define quadratic model: theta[1] + theta[2]*x + theta[3]*x^2 | |
| quad_model <- function(theta, x) theta[1] + theta[2] * x + theta[3] * x^2 | |
| # Fit with ROUT | |
| fit <- rout_fitter( | |
| theta0 = c(2, 1.5, -0.1, log(1.5)), | |
| f.model = quad_model, | |
| x = x_vec, | |
| y = y, | |
| Q = 0.05 | |
| ) | |
| print(fit) | |
| cat("Outlier indices:", which(fit$outlier.adj), "\n") | |
| # --- Fitted curves --- | |
| x_seq <- seq(min(x_vec), max(x_vec), length.out = 200) | |
| # 1. ROUT fit | |
| theta_rout <- fit$par[-length(fit$par)] | |
| curve_rout <- data.frame( | |
| x = x_seq, | |
| y = quad_model(theta_rout, x_seq), | |
| model = "ROUT fit" | |
| ) | |
| # 2. OLS on cleaned data (outliers removed) | |
| x_clean <- x_vec[!fit$outlier.adj] | |
| y_clean <- y[!fit$outlier.adj] | |
| ols_clean <- lm(y_clean ~ x_clean + I(x_clean^2)) | |
| curve_clean <- data.frame( | |
| x = x_seq, | |
| y = predict(ols_clean, newdata = data.frame(x_clean = x_seq)), | |
| model = "OLS (outliers removed)" | |
| ) | |
| # 3. OLS on full data (outliers retained) | |
| ols_full <- lm(y ~ x_vec + I(x_vec^2)) | |
| curve_full <- data.frame( | |
| x = x_seq, | |
| y = predict(ols_full, newdata = data.frame(x_vec = x_seq)), | |
| model = "OLS (all data)" | |
| ) | |
| curve_df <- rbind(curve_rout, curve_clean, curve_full) | |
| # Data points | |
| point_df <- data.frame( | |
| x = x_vec, | |
| y = y, | |
| outlier = fit$outlier.adj | |
| ) | |
| ggplot() + | |
| geom_line( | |
| data = curve_df, | |
| aes(x = x, y = y, color = model, linetype = model), | |
| linewidth = 1 | |
| ) + | |
| geom_point( | |
| data = point_df, | |
| aes(x = x, y = y, shape = outlier, fill = outlier), | |
| size = 3, | |
| color = "black", | |
| stroke = 0.5 | |
| ) + | |
| scale_color_manual( | |
| values = c( | |
| "ROUT fit" = "steelblue", | |
| "OLS (outliers removed)" = "darkorange", | |
| "OLS (all data)" = "firebrick" | |
| ) | |
| ) + | |
| scale_linetype_manual( | |
| values = c( | |
| "ROUT fit" = "solid", | |
| "OLS (outliers removed)" = "dashed", | |
| "OLS (all data)" = "dotted" | |
| ) | |
| ) + | |
| scale_shape_manual( | |
| values = c("FALSE" = 21, "TRUE" = 24), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier") | |
| ) + | |
| scale_fill_manual( | |
| values = c("FALSE" = "black", "TRUE" = "red"), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier") | |
| ) + | |
| labs( | |
| title = "Quadratic Model Fit with ROUT Outlier Detection", | |
| x = "x", | |
| y = "y", | |
| color = "Model", | |
| linetype = "Model", | |
| shape = "Point type", | |
| fill = "Point type" | |
| ) + | |
| theme_bw() + | |
| theme(legend.position = "bottom") | |
| #------- Hill model = 4-parameter logistic model or 4PL | |
| # Note: the hill5_model (5PL) is preferred over 4PL when the upper and lower | |
| # plateaus are approached at different rates. | |
| # Simulate Hill model data | |
| # Parameters: emin=0, emax=100, EC50=0.5 (so log(EC50)=log(0.5)), slope=2 | |
| set.seed(42) | |
| theta_true <- c(emin = 0, emax = 100, lec50 = log(0.5), m = 2) | |
| x <- rep(c(0, 2^(-4:4)), each = 4) # doses: 0, 0.0625, 0.125, ..., 16 | |
| y <- hill_model(theta_true, x) + rnorm(length(x), sd = 3) | |
| # Inject two outliers | |
| y[8] <- y[8] - 40 | |
| y[20] <- y[20] + 100 | |
| # Fit with ROUT | |
| fit <- rout_fitter( | |
| theta0 = c(0, 100, log(0.5), 2, log(3)), | |
| f.model = hill_model, | |
| x = x, | |
| y = y, | |
| Q = 0.05 | |
| ) | |
| print(fit) | |
| which(fit$outlier.adj) | |
| # --- Fitted curves --- | |
| x_seq <- c(0, 2^seq(-4, 4, length.out = 200)) | |
| # 1. ROUT fit | |
| theta_rout <- fit$par[-length(fit$par)] | |
| curve_rout <- data.frame( | |
| x = x_seq, | |
| y = hill_model(theta_rout, x_seq), | |
| model = "ROUT fit" | |
| ) | |
| # 2. OLS on cleaned data (outliers removed) | |
| fit_clean <- optim_fit( | |
| theta0 = NULL, | |
| f.model = hill_model, | |
| x = x[!fit$outlier.adj], | |
| y = y[!fit$outlier.adj] | |
| ) | |
| curve_clean <- data.frame( | |
| x = x_seq, | |
| y = hill_model(coef(fit_clean), x_seq), | |
| model = "OLS (outliers removed)" | |
| ) | |
| # 3. OLS on full data (outliers retained) | |
| fit_full <- optim_fit(theta0 = NULL, f.model = hill_model, x = x, y = y) | |
| curve_full <- data.frame( | |
| x = x_seq, | |
| y = hill_model(coef(fit_full), x_seq), | |
| model = "OLS (all data)" | |
| ) | |
| curve_df <- rbind(curve_rout, curve_clean, curve_full) | |
| # Data points | |
| point_df <- data.frame( | |
| x = x, | |
| y = y, | |
| outlier = fit$outlier.adj | |
| ) | |
| ggplot() + | |
| geom_line( | |
| data = curve_df, | |
| aes(x = x, y = y, color = model, linetype = model), | |
| linewidth = 1 | |
| ) + | |
| geom_point( | |
| data = point_df, | |
| aes(x = x, y = y, shape = outlier, fill = outlier), | |
| size = 3, | |
| color = "black", | |
| stroke = 0.5 | |
| ) + | |
| scale_color_manual( | |
| values = c( | |
| "ROUT fit" = "steelblue", | |
| "OLS (outliers removed)" = "darkorange", | |
| "OLS (all data)" = "firebrick" | |
| ) | |
| ) + | |
| scale_linetype_manual( | |
| values = c( | |
| "ROUT fit" = "solid", | |
| "OLS (outliers removed)" = "dashed", | |
| "OLS (all data)" = "dotted" | |
| ) | |
| ) + | |
| scale_shape_manual( | |
| values = c("FALSE" = 21, "TRUE" = 24), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier") | |
| ) + | |
| scale_fill_manual( | |
| values = c("FALSE" = "black", "TRUE" = "red"), | |
| labels = c("FALSE" = "Normal", "TRUE" = "Outlier") | |
| ) + | |
| scale_x_log10(breaks = c(0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16)) + | |
| labs( | |
| title = "Hill Model Fit with ROUT Outlier Detection", | |
| x = "Dose (log scale)", | |
| y = "Response", | |
| color = "Model", | |
| linetype = "Model", | |
| shape = "Point type", | |
| fill = "Point type" | |
| ) + | |
| theme_bw() + | |
| theme(legend.position = "bottom") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment