Skip to content

Instantly share code, notes, and snippets.

@h-a-graham
Last active January 8, 2025 15:29
Show Gist options
  • Save h-a-graham/f761168b849dc2c55976395d82c29838 to your computer and use it in GitHub Desktop.
Save h-a-graham/f761168b849dc2c55976395d82c29838 to your computer and use it in GitHub Desktop.
Spatial block bootstrap with clustered folds to estimate a global statistic (and CIs) for a raster.
suppressPackageStartupMessages({
  library(terra)
  library(ggplot2)
  library(spatialsample)
  library(rsample)
  library(sf)
  library(dplyr)
})


f <- system.file("ex/elev.tif", package = "terra")
r <- rast(f)

plot(r)

elev_df <- as.data.frame(r, xy = TRUE)
head(elev_df)
#>            x        y elevation
#> 127 6.004167 50.17917       529
#> 128 6.012500 50.17917       542
#> 129 6.020833 50.17917       547
#> 130 6.029167 50.17917       535
#> 218 5.970833 50.17083       485
#> 219 5.979167 50.17083       497
nrow(elev_df)
#> [1] 4608

elev_sf <- st_as_sf(elev_df, coords = c("x", "y")) |>
  sf::st_set_crs(terra::crs(r))

spcv <- spatial_clustering_cv(
  elev_sf,
  v = 50 # how many splits?
)

# look at the spatial clusers
autoplot(spcv) +
  theme_light() +
  theme(legend.position = "none")

bcv <- mutate(spcv, splits = purrr::map(splits, assessment))

# Function to perform spatial block bootstrapping
spatial_block_bootstrap <- function(blocks, times) {
  replicate(times,
    {
      sampled_blocks <- sample(blocks, replace = TRUE)
      do.call(rbind, sampled_blocks)
    },
    simplify = FALSE
  )
}

# Extract the blocks from the cross-validation object
blocks <- bcv$splits

# Perform spatial block bootstrapping
bdf_sp <- spatial_block_bootstrap(blocks, times = 100) |>
  bind_rows(.id = "replicate") |>
  mutate(
    bs_type = "spatial_block_bootstrap",
    id = replicate
  )

# Function to perform ordinary bootstrapping
bdf_ordinary <- bootstraps(elev_sf, times = 100) |>
  mutate(splits = purrr::map(splits, analysis)) |>
  tidyr::unnest(cols = splits) |>
  mutate(bs_type = "ordinary_bootstrap")

# Combine the two bootstrapped datasets and plot the results
bind_rows(bdf_sp, bdf_ordinary) |>
  ggplot() +
  aes(x = elevation, group = id) +
  stat_density(
    geom = "line", color = "#0f9e97", alpha = 0.1, position = "identity"
  ) +
  theme_light() +
  facet_wrap(~bs_type)

# Function to calculate the bootstrap estimate and confidence interval
bs_stat_ci <- function(x, f = sum) {
  bs_sum <- sf::st_drop_geometry(x) |>
    group_by(id) |>
    summarise(stat = f(elevation))

  tibble(
    stat = mean(bs_sum$stat),
    qi_low = quantile(bs_sum$stat, 0.025),
    qi_high = quantile(bs_sum$stat, 0.975),
    p_error = (qi_high - qi_low) / stat
  )
}

bs_stat_ci(bdf_sp)
#> # A tibble: 1 × 4
#>       stat   qi_low  qi_high p_error
#>      <dbl>    <dbl>    <dbl>   <dbl>
#> 1 1601492. 1441811. 1818061.   0.235
bs_stat_ci(bdf_ordinary)
#> # A tibble: 1 × 4
#>       stat   qi_low  qi_high p_error
#>      <dbl>    <dbl>    <dbl>   <dbl>
#> 1 1604672. 1595111. 1614591.  0.0121

Created on 2025-01-07 with reprex v2.1.1

suppressPackageStartupMessages({
  library(terra)
  library(ggplot2)
  library(sf)
  library(dplyr)
  library(rsample)
  library(mlr3spatiotempcv)
  library(mlr3)
  library(data.table)
})

# -- core functions ---

internal function to perform spatial block bootstrap param dt data.table param n_b number of bootstrap samples return data.table with bootstrap samples imports data.table

spatial_block_bootstrap <- function(dt, n_b, idc) {
  dt <- as.data.table(dt)
  unique_folds <- unique(dt$fold)
  # Sample folds with replacement
  bs_samp <- replicate(n_b,
    {
      sampled_folds <- sample(unique_folds, replace = TRUE)
      sampled_folds_dt <- data.table(fold = sampled_folds)
      bootstrap_sample <- dt[fold %in% sampled_folds]
      bootstrap_sample <- dt[sampled_folds_dt, on = .(fold), allow.cartesian = TRUE]
    },
    simplify = FALSE
  )
  data.table::rbindlist(bs_samp, idcol = idc)
}

generate spatial bootstrap sets param x sf object param n_folds number of folds param n_bootstrap number of bootstrap samples param idcol column name for bootstrap id param plot_folds logical, whether to plot the folds (useful for checks) return data.table with bootstrap samples imports mlr3spatiotempcv mlr3 data.table

sp_boot_sets <- function(x, n_folds = 50, n_bootstrap = 100,
                         idcol = "boot_id", plot_folds = TRUE) {
  x$row_id <- seq_len(nrow(x))

  x_tsk <- mlr3spatiotempcv::as_task_regr_st(
    x,
    target = "row_id"
  )
  # Instantiate Resampling
  rcv <- mlr3::rsmp("spcv_coords", folds = n_folds)
  rcv$instantiate(x_tsk)


  if (nrow(x) > 1e4) {
    sample_fold_n <- as.integer(ceiling(nrow(x) / n_folds * 0.1))
  } else {
    sample_fold_n <- NULL
  }

  p <- suppressMessages(
    autoplot(rcv, x_tsk, sample_fold_n = sample_fold_n) +
      scale_colour_viridis_d() +
      theme_light() +
      theme(legend.position = "none")
  )

  if (plot_folds) {
    print(p)
  }

  # left join with data.table between elev_sf and rsmp_idx by row_id
  x_sf_folds <- merge(as.data.table(x), rcv$instance, by = "row_id")
  spatial_block_bootstrap(x_sf_folds, n_bootstrap, idcol)
}


#--- main code ---

f <- system.file("ex/elev.tif", package = "terra")
r <- rast(f)
cent_coor <- terra::geom(
  terra::centroids(terra::as.polygons(terra::ext(r)))
)[1, ][c("x", "y")]

r <- terra::project(r, glue::glue(
  "+proj=laea +lon_0={cent_coor[1]} +lat_0={cent_coor[2]}",
  "+ellps=WGS84 +no_defs",
  .sep = " "
))

plot(r)

elev_sf <- as.data.frame(r, xy = TRUE) |>
  st_as_sf(coords = c("x", "y")) |>
  sf::st_set_crs(terra::crs(r))


bootstrap_sample_dt <- sp_boot_sets(elev_sf)

bootstrap_sample_dt[, bs_type := "spatial_block_bootstrap"]


# Function to perform ordinary bootstrapping
bdf_ordinary_dt <- bootstraps(elev_sf, times = 100) |>
  mutate(splits = purrr::map(splits, analysis)) |>
  tidyr::unnest(cols = splits) |>
  mutate(
    bs_type = "ordinary_bootstrap",
    boot_id = as.integer(stringr::str_extract(id, "\\d+"))
  ) |>
  as.data.table()

# Combine the two bootstrapped datasets and plot the results
rbind(bootstrap_sample_dt, bdf_ordinary_dt, fill = TRUE) |>
  ggplot() +
  aes(x = elevation, group = boot_id) +
  stat_density(
    geom = "line", color = "#0f9e97", alpha = 0.1, position = "identity"
  ) +
  theme_light() +
  facet_wrap(~bs_type)

bs_stat_ci <- function(x, f = mean) {
  bs_sum <- sf::st_drop_geometry(x) |>
    group_by(boot_id) |>
    summarise(stat = f(elevation))

  tibble(
    stat = mean(bs_sum$stat),
    qi_low = quantile(bs_sum$stat, 0.025),
    qi_high = quantile(bs_sum$stat, 0.975),
    p_error = (qi_high - qi_low) / stat
  )
}

bs_stat_ci(bootstrap_sample_dt)
#> # A tibble: 1 × 4
#>    stat qi_low qi_high p_error
#>   <dbl>  <dbl>   <dbl>   <dbl>
#> 1  348.   333.    363.  0.0849
bs_stat_ci(bdf_ordinary_dt)
#> # A tibble: 1 × 4
#>    stat qi_low qi_high p_error
#>   <dbl>  <dbl>   <dbl>   <dbl>
#> 1  348.   346.    350.  0.0109

Created on 2025-01-07 with reprex v2.1.1

#' internal function to perform spatial block bootstrap
#' @param dt data.table
#' @param n_b number of bootstrap samples
#' @return data.table with bootstrap samples
#' @import data.table
#' @keywords internal
#' @noRd
spatial_block_bootstrap <- function(dt, tc, f, n_b, idc, progress) {
dt <- as.data.table(dt)
unique_folds <- unique(dt$fold)
# Sample folds with replacement
bs_samp <- purrr::map_dbl(
seq_len(n_b), function(x) {
# browser()
sampled_folds <- sample(unique_folds, replace = TRUE)
sampled_folds_dt <- data.table(fold = sampled_folds)
bootstrap_sample <- dt[
sampled_folds_dt,
on = .(fold),
allow.cartesian = TRUE
]
f(bootstrap_sample[[tc]])
},
.progress = progress
)
}
#' convert SpatRaster to sf
#' @param r SpatRaster
#' @return sf object
#' @keywords internal
#' @noRd
rast_to_sf <- function(r) {
checkmate::assert_class(r, "SpatRaster")
x <- as.data.frame(r, xy = TRUE) |>
sf::st_as_sf(coords = c("x", "y")) |>
sf::st_set_crs(terra::crs(r))
x$row_id <- seq_len(nrow(x))
return(x)
}
#' Generate spatial bootstrap statistics for a SpatRaster object.
#' @param x a SpatRaster object
#' @param n_folds number of folds
#' @param n_bootstrap number of bootstrap samples
#' @param idcol column name for bootstrap id
#' @param progress logical should progress be shown
#' @param sample_fold_n minimum number of points to plot in each for plotting
#' @return spatboot object with bootstrap results
#' @import mlr3spatiotempcv mlr3 data.table
#' @export
#' @details
#' This function calculates bootstrap statistics for a single band of a SpatRaster
#' object. It does this using spatially clustered bootstrap samples. The function
#' first converts the SpatRaster to an sf object, then uses the mlr3spatiotempcv
#' package to generate spatially clustered folds. The function then samples these
#' folds with replacement to generate bootstrap samples. The function returns a
#' `spatboot` object which contains the bootstrap samples, the bootstrap statistic,
#' the bootstrap confidence interval, the percentage confidence interval, and a ggplot
#' object of the folds.
spatial_cluster_bootstrap <- function(
x, target_col, f, n_folds = 100, n_bootstrap = 100, alpha = 0.05,
idcol = "boot_id", progress = TRUE, sample_fold_n = 1e3L) {
checkmate::assert_class(x, "SpatRaster")
checkmate::assert_character(target_col, max.len = 1)
checkmate::assert_function(f)
checkmate::assert_numeric(n_folds, lower = 3, max.len = 1)
checkmate::assert_numeric(n_bootstrap, lower = 1, max.len = 1)
checkmate::assert_character(idcol, max.len = 1)
checkmate::assert_logical(progress, max.len = 1)
checkmate::assert_integer(sample_fold_n, lower = 1, max.len = 1)
x <- rast_to_sf(x)
x_tsk <- mlr3spatiotempcv::as_task_regr_st(
x,
target = "row_id"
)
# Instantiate Resampling
rcv <- mlr3::rsmp("spcv_coords", folds = n_folds)
rcv$instantiate(x_tsk)
if (nrow(x) / n_folds < sample_fold_n) {
sample_fold_n <- NULL
}
p <- suppressMessages(
autoplot(rcv, x_tsk, sample_fold_n = sample_fold_n) +
scale_colour_viridis_d() +
theme_light() +
theme(legend.position = "none")
)
# left join with data.table between elev_sf and rsmp_idx by row_id
x_sf_folds <- merge(as.data.table(x), rcv$instance, by = "row_id")
boot_vec <- spatial_block_bootstrap(
x_sf_folds, target_col, f, n_bootstrap, idcol, progress
)
boot_stat <- mean(boot_vec)
boot_stat_ci_low <- quantile(boot_vec, alpha / 2)[[1]]
boot_stat_ci_high <- quantile(boot_vec, 1 - alpha / 2)[[1]]
sp_boot_obj <- list(
boot_results = boot_vec,
boot_stat = boot_stat,
boot_stat_ci_low = boot_stat_ci_low,
boot_stat_ci_high = boot_stat_ci_high,
boot_stat_ci_perc = ((boot_stat_ci_high - boot_stat_ci_low) / boot_stat) * 100,
fold_plot = p
)
class(sp_boot_obj) <- "spatboot"
return(sp_boot_obj)
}
@h-a-graham
Copy link
Author

h-a-graham commented Jan 7, 2025

Thought... Using kmeans for the blocks does not ensure equally sized replacement. Maybe this is okay for some stats like mean but maybe this could be a problem for sum. Can we create clusters of equally size?

@h-a-graham
Copy link
Author

An optimized version now added that uses mlr3 and data.tbale... should be a bit faster. I think if we stick to means and scale to get sums we should be grand here, right?

@h-a-graham
Copy link
Author

The final R script is the most efficient option - it doesn't return the full sets but rather the statistic requested. This is somewhat limited - the function e.g. mean must accept a single function which is the raster band name. so this works fine for simple statistics like mean and sum - sum still comes with some caveats as the replacement is unlikley to be equally sized.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment