alexhallam · August 12, 2024 13:03
diff --git a/optimal_subsample.R b/optimal_subsample.R
 # Simulate a full data set of N observations: two Gaussian features on
 # different scales and a response equal to their sum plus unit-variance noise.
 N <- 1000
 x1 <- 5 * rnorm(n = N)
 x2 <- 2 * rnorm(n = N)
 y <- x1 + x2 + rnorm(n = N)
 # full data
 df <- tibble(group = "a", y, x1, x2)

 # Fit a no-intercept quantile regression at tau = 0.5 on a 10% random
 # sample of the full data to get "pilot estimators".
 # NOTE(review): rq() is presumably quantreg::rq -- confirm it is attached.
 tau <- 0.5
 df_sample <- df |> slice_sample(prop = 0.10)
 fit <- rq(y ~ x1 + x2 - 1, tau = tau, data = df_sample)

 # Use the pilot model to predict the outcome for every row of the full data,
 # then derive a per-row weight from the residual sign and feature magnitude.
 df_example <- df |>
   mutate(y_hat = predict(fit, newdata = df)) |>
   mutate(
     # residual of the full-data outcome under the pilot fit
     e = y - y_hat,
     # 1 where the residual is negative, 0 otherwise
     indicator = ifelse(e < 0, 1, 0),
     # square the features, sum, then root (row-wise Euclidean norm)
     magnitude_x = sqrt(rowSums(across(c(x1, x2))^2)),
     weight = abs(tau - indicator) * magnitude_x,
     # rescale so the weights sum to 1 over the full data
     normalized_weight_pi_lopt_beta = weight / sum(weight)
   )

 # Persist the worked example, including all intermediate columns.
 df_example |>
   write_csv("df_subsample_calculation_example.csv")
	# Simulate a full data set of N observations: two Gaussian features on
	# different scales and a response equal to their sum plus unit-variance noise.
	N <- 1000
	x1 <- 5 * rnorm(n = N)
	x2 <- 2 * rnorm(n = N)
	y <- x1 + x2 + rnorm(n = N)
	# full data
	df <- tibble(group = "a", y, x1, x2)

	# Fit a no-intercept quantile regression at tau = 0.5 on a 10% random
	# sample of the full data to get "pilot estimators".
	# NOTE(review): rq() is presumably quantreg::rq -- confirm it is attached.
	tau <- 0.5
	df_sample <- df |> slice_sample(prop = 0.10)
	fit <- rq(y ~ x1 + x2 - 1, tau = tau, data = df_sample)

	# Use the pilot model to predict the outcome for every row of the full data,
	# then derive a per-row weight from the residual sign and feature magnitude.
	df_example <- df |>
	  mutate(y_hat = predict(fit, newdata = df)) |>
	  mutate(
	    # residual of the full-data outcome under the pilot fit
	    e = y - y_hat,
	    # 1 where the residual is negative, 0 otherwise
	    indicator = ifelse(e < 0, 1, 0),
	    # square the features, sum, then root (row-wise Euclidean norm)
	    magnitude_x = sqrt(rowSums(across(c(x1, x2))^2)),
	    weight = abs(tau - indicator) * magnitude_x,
	    # rescale so the weights sum to 1 over the full data
	    normalized_weight_pi_lopt_beta = weight / sum(weight)
	  )

	# Persist the worked example, including all intermediate columns.
	df_example |>
	  write_csv("df_subsample_calculation_example.csv")