kylebutts · August 27, 2024 20:15
diff --git a/0.md b/0.md
diff --git a/post_selection_inference.R b/post_selection_inference.R
 #' This code shows the problem of post-selection inference following the
 #' review article [Post-Selection Inference](https://www.annualreviews.org/content/journals/10.1146/annurev-statistics-100421-044639)
 #'
 # %%
 library(tidyverse)
 library(fixest)

 #' Data generation process:
 #' - $X = (X_1, X_2, X_3)'$ is multivariate normal with a non-diagonal covariance
 #' matrix
 #' - $Y$ is generated independently of $X$ (true coefficients of 0)
 #'
 # %%
 # Simulate one dataset in which `y` is drawn independently of three
 # correlated regressors, so every true slope coefficient is zero.
 #
 # n: number of observations to draw (default 100).
 # Returns a tibble with `n` rows and columns `y`, `X1`, `X2`, `X3`.
 dgp <- function(n = 100) {
   # Covariance of (X1, X2, X3): variance 3 on the diagonal and correlations
   # of 0.2, 0.4 and 0.01 off the diagonal.
   X_vcov <- matrix(
     3 * c(
       1, 0.2, 0.4,
       0.2, 1, 0.01,
       0.4, 0.01, 1
     ),
     nrow = 3, ncol = 3, byrow = TRUE
   )
   # X is drawn before y, so a given seed consumes the random-number stream
   # in a fixed order.
   X <- mvtnorm::rmvnorm(n = n, mean = c(0, 0, 0), sigma = X_vcov)
   colnames(X) <- c("X1", "X2", "X3")
   y <- rnorm(n = n, mean = 1, sd = 3)
   # Name every column before converting: as_tibble() on a matrix with blank
   # column names goes through a deprecated name-repair path and warns.
   tibble::as_tibble(cbind(y = y, X))
 }

 # %%
 #' Estimation strategy:
 #' - Do forward stepwise selection (starting from the intercept-only model), adding covariates according to the AIC criterion
 #' - Then estimate linear model selected by forward-step selection
 #'
 # Choose a linear model for `y` by forward stepwise selection.
 #
 # df: data frame with columns `y`, `X1`, `X2` and `X3`.
 # Returns the fitted `lm` object that `step()` settles on.
 forward_selection <- function(df) {
   # Start from the intercept-only fit; the scope formula is the largest
   # model that step() is allowed to grow towards.
   null_fit <- lm(y ~ 1, data = df)
   upper_scope <- y ~ X1 + X2 + X3

   step(null_fit, scope = upper_scope, direction = "forward", trace = 0)
 }

 #' Run this simulation B times
 # %%
 # One replication of the naive procedure: select a model and report its
 # coefficient table, both on the same simulated sample.
 #
 # i: replication index, stored in the `iter` column of the result.
 # Returns a tibble with columns `iter`, `term`, `est`, `se`, `tvalue`, `pvalue`.
 sim <- function(i) {
   sample_df <- dgp(n = 100)
   fit <- forward_selection(sample_df)

   # Coefficient matrix of the selected model; row names are the term labels.
   coef_table <- summary(fit)[["coefficients"]]

   coef_table |>
     as_tibble(rownames = "term") |>
     setNames(c("term", "est", "se", "tvalue", "pvalue")) |>
     mutate(iter = .env$i, .before = 1)
 }

 # Stack the coefficient tables from 2500 replications of the naive procedure.
 res <- list_rbind(map(seq_len(2500), sim))

 #' *Conditional on $X_1$ being selected* as one of the covariates, what is the
 #' distribution of $\hat{\beta}_1$?
 #' Note that this means we throw out some simulated draws because they do not
 #' satisfy the model constraint ($X_1$ being selected)
 # %%
 # Histogram of the X1 estimates, keeping only replications that selected X1.
 ggplot(filter(res, term == "X1")) +
   geom_histogram(mapping = aes(x = est)) +
   theme_bw(base_size = 14)

 #' Now we are going to do sample-splitting to address this issue:
 #' - Using half of the data, we will select the model using forward-selection
 #' - Then, we will estimate the selected linear model using the other half of
 #' the data
 # %%
 # One replication of the sample-splitting procedure: select the model on a
 # random half of the rows and re-estimate it on the remaining half.
 #
 # i: replication index, stored in the `iter` column of the result.
 # Returns a tibble with columns `iter`, `term`, `est`, `se`, `tvalue`, `pvalue`.
 sample_splitting_sim <- function(i) {
   full_df <- dgp(n = 100)
   n_obs <- nrow(full_df)
   all_rows <- seq_len(n_obs)

   # Random half for selection; every other row is held out for estimation.
   train_rows <- sort(sample(all_rows, size = n_obs / 2, replace = FALSE))
   holdout_rows <- setdiff(all_rows, train_rows)

   # Select covariates on the training half only.
   chosen <- forward_selection(full_df[train_rows, ])

   # Re-fit the selected specification on the held-out half.
   refit <- update(chosen, data = full_df[holdout_rows, ])

   summary(refit)[["coefficients"]] |>
     as_tibble(rownames = "term") |>
     setNames(c("term", "est", "se", "tvalue", "pvalue")) |>
     mutate(iter = .env$i, .before = 1)
 }

 # Stack the coefficient tables from 2500 sample-splitting replications.
 res_sample_split <- list_rbind(map(seq_len(2500), sample_splitting_sim))

 #' *Conditional on $X_1$ being selected* as one of the covariates, what is the
 #' distribution of $\hat{\beta}_1$ using this sample-splitting procedure?
 # %%
 # Histogram of the sample-split X1 estimates, keeping only replications
 # whose selection step picked X1.
 ggplot(filter(res_sample_split, term == "X1")) +
   geom_histogram(mapping = aes(x = est)) +
   theme_bw(base_size = 14)


 #' While this helps restore normality of our estimated coefficient,
 #' the main cost is wider confidence intervals since we are fitting the model
 #' with $n / 2$ observations (in this case, the standard errors should be
 #' about $\sqrt{2} \approx 1.414$ times larger, i.e. the sampling variance doubles)
 # %%
 # Average reported standard error of the X1 coefficient across the
 # replications in which X1 was selected, for the naive procedure.
 se <- res |>
   filter(term == "X1") |>
   pull(se) |>
   mean()

 # Same average for the sample-splitting procedure.
 se_sample_split <- res_sample_split |>
   filter(term == "X1") |>
   pull(se) |>
   mean()

 # Report both averages and their ratio. The message ends with a newline so
 # that later console output starts on its own line.
 cat(sprintf(
   paste0(
     "The average standard error in the original simulation is %0.3f. ",
     "The average standard error in our sample-split estimate is %0.3f. ",
     "This is %0.3f times larger.\n"
   ),
   se, se_sample_split, se_sample_split / se
 ))
	#' This code shows the problem of post-selection inference following the
	#' review article [Post-Selection Inference](https://www.annualreviews.org/content/journals/10.1146/annurev-statistics-100421-044639)
	#'
	# %%
	library(tidyverse)
	library(fixest)

	#' Data generation process:
	#' - $X = (X_1, X_2, X_3)'$ is multivariate normal with a non-diagonal covariance
	#' matrix
	#' - $Y$ is generated independently of $X$ (true coefficients of 0)
	#'
	# %%
	# Simulate one dataset in which `y` is drawn independently of three
	# correlated regressors, so every true slope coefficient is zero.
	#
	# n: number of observations to draw (default 100).
	# Returns a tibble with `n` rows and columns `y`, `X1`, `X2`, `X3`.
	dgp <- function(n = 100) {
	  # Covariance of (X1, X2, X3): variance 3 on the diagonal and correlations
	  # of 0.2, 0.4 and 0.01 off the diagonal.
	  X_vcov <- matrix(
	    3 * c(
	      1, 0.2, 0.4,
	      0.2, 1, 0.01,
	      0.4, 0.01, 1
	    ),
	    nrow = 3, ncol = 3, byrow = TRUE
	  )
	  # X is drawn before y, so a given seed consumes the random-number stream
	  # in a fixed order.
	  X <- mvtnorm::rmvnorm(n = n, mean = c(0, 0, 0), sigma = X_vcov)
	  colnames(X) <- c("X1", "X2", "X3")
	  y <- rnorm(n = n, mean = 1, sd = 3)
	  # Name every column before converting: as_tibble() on a matrix with blank
	  # column names goes through a deprecated name-repair path and warns.
	  tibble::as_tibble(cbind(y = y, X))
	}

	# %%
	#' Estimation strategy:
	#' - Do forward stepwise selection (starting from the intercept-only model), adding covariates according to the AIC criterion
	#' - Then estimate linear model selected by forward-step selection
	#'
	# Choose a linear model for `y` by forward stepwise selection.
	#
	# df: data frame with columns `y`, `X1`, `X2` and `X3`.
	# Returns the fitted `lm` object that `step()` settles on.
	forward_selection <- function(df) {
	  # Start from the intercept-only fit; the scope formula is the largest
	  # model that step() is allowed to grow towards.
	  null_fit <- lm(y ~ 1, data = df)
	  upper_scope <- y ~ X1 + X2 + X3

	  step(null_fit, scope = upper_scope, direction = "forward", trace = 0)
	}

	#' Run this simulation B times
	# %%
	# One replication of the naive procedure: select a model and report its
	# coefficient table, both on the same simulated sample.
	#
	# i: replication index, stored in the `iter` column of the result.
	# Returns a tibble with columns `iter`, `term`, `est`, `se`, `tvalue`, `pvalue`.
	sim <- function(i) {
	  df <- dgp(n = 100)

	  selection_model <- forward_selection(df)

	  # Coefficient matrix of the selected model; row names become `term`.
	  selection_model |>
	    summary() |>
	    _$coefficients |>
	    as_tibble(rownames = "term") |>
	    setNames(c("term", "est", "se", "tvalue", "pvalue")) |>
	    mutate(iter = .env$i, .before = 1)
	}

	# Stack the coefficient tables from 2500 replications of the naive procedure.
	res <- map(seq_len(2500), sim) |>
	  list_rbind()

	#' Conditional on $X_1$ being selected as one of the covariates, what is the
	#' distribution of $\hat{\beta}_1$?
	#' Note that this means we throw out some simulated draws because they do not
	#' satisfy the model constraint ($X_1$ being selected)
	# %%
	# Histogram of the X1 estimates, keeping only replications that selected X1.
	res |>
	  filter(term == "X1") |>
	  ggplot() +
	  geom_histogram(aes(x = est)) +
	  theme_bw(base_size = 14)

	#' Now we are going to do sample-splitting to address this issue:
	#' - Using half of the data, we will select the model using forward-selection
	#' - Then, we will estimate the selected linear model using the other half of
	#' the data
	# %%
	# One replication of the sample-splitting procedure: select the model on a
	# random half of the rows and re-estimate it on the remaining half.
	#
	# i: replication index, stored in the `iter` column of the result.
	# Returns a tibble with columns `iter`, `term`, `est`, `se`, `tvalue`, `pvalue`.
	sample_splitting_sim <- function(i) {
	  df <- dgp(n = 100)

	  # Random half for selection; every other row is held out for estimation.
	  train_idx <- seq_len(nrow(df)) |>
	    sample(size = nrow(df) / 2, replace = FALSE) |>
	    sort()
	  test_idx <- setdiff(seq_len(nrow(df)), train_idx)

	  # Select covariates
	  selection_model <- forward_selection(df[train_idx, ])

	  # Fit model on test data
	  selection_model |>
	    update(data = df[test_idx, ]) |>
	    summary() |>
	    _$coefficients |>
	    as_tibble(rownames = "term") |>
	    setNames(c("term", "est", "se", "tvalue", "pvalue")) |>
	    mutate(iter = .env$i, .before = 1)
	}

	# Stack the coefficient tables from 2500 sample-splitting replications.
	res_sample_split <- map(seq_len(2500), sample_splitting_sim) |>
	  list_rbind()

	#' Conditional on $X_1$ being selected as one of the covariates, what is the
	#' distribution of $\hat{\beta}_1$ using this sample-splitting procedure?
	# %%
	# Histogram of the sample-split X1 estimates, keeping only replications
	# whose selection step picked X1.
	res_sample_split |>
	  filter(term == "X1") |>
	  ggplot() +
	  geom_histogram(aes(x = est)) +
	  theme_bw(base_size = 14)


	#' While this helps restore normality of our estimated coefficient,
	#' the main cost is wider confidence intervals since we are fitting the model
	#' with $n / 2$ observations (in this case, the standard errors should be
	#' about $\sqrt{2} \approx 1.414$ times larger, i.e. the sampling variance doubles)
	# %%
	# Average reported standard error of the X1 coefficient across the
	# replications in which X1 was selected, for the naive procedure.
	se <- res |>
	  filter(term == "X1") |>
	  pull(se) |>
	  mean()

	# Same average for the sample-splitting procedure.
	se_sample_split <- res_sample_split |>
	  filter(term == "X1") |>
	  pull(se) |>
	  mean()

	# Report both averages and their ratio. The message ends with a newline so
	# that later console output starts on its own line.
	cat(sprintf(
	  paste0(
	    "The average standard error in the original simulation is %0.3f. ",
	    "The average standard error in our sample-split estimate is %0.3f. ",
	    "This is %0.3f times larger.\n"
	  ),
	  se, se_sample_split, se_sample_split / se
	))