Skip to content

Instantly share code, notes, and snippets.

@h-a-graham
Last active February 16, 2025 07:21
Show Gist options
  • Save h-a-graham/11977fc9a934e20e7b466fe83cb28383 to your computer and use it in GitHub Desktop.
Save h-a-graham/11977fc9a934e20e7b466fe83cb28383 to your computer and use it in GitHub Desktop.
create a composite raster using gdal's VRT and warp via {gdalraster}
#' Create a composite raster with a VRT pixel function
#' @param src_files a character vector with the paths to the source rasters.
#' @param outfile a character path to the output raster.
#' @param fun a character with the pixel function to apply.
#' @param t_srs a character with the target SRS. If empty string "", the spatial
#' reference of src_files[1] will be used.
#' @param vrt_options a character vector with additional options to pass to the
#' gdalbuildVRT command.
#' @param warp_options a character vector with additional options to pass to the
#' gdalwarp command.
#' @param config_options a named character vector with additional GDAL configuration
#' options.
#' @param quiet a logical indicating whether to suppress progress bar.
#' @return a character path to the output raster.
vrt_composite <- function(
src_files,
outfile,
fun = c("median", "mean"),
t_srs = "",
vrt_options = NULL,
warp_options = c(
"-multi",
"-overwrite",
"-co", "COMPRESS=DEFLATE",
"-co", "PREDICTOR=2",
"-co", "NUM_THREADS=ALL_CPUS"
),
config_options = c(
GDAL_VRT_ENABLE_PYTHON = "YES",
VSI_CACHE = "TRUE",
GDAL_CACHEMAX = "30%",
VSI_CACHE_SIZE = "10000000",
GDAL_HTTP_MULTIPLEX = "YES",
GDAL_INGESTED_BYTES_AT_OPEN = "32000",
GDAL_DISABLE_READDIR_ON_OPEN = "EMPTY_DIR",
GDAL_HTTP_VERSION = "2",
GDAL_HTTP_MERGE_CONSECUTIVE_RANGES = "YES",
GDAL_NUM_THREADS = "ALL_CPUS"
),
quiet = FALSE) {
fun <- rlang::arg_match(fun)
purrr::iwalk(
config_options,
~ gdalraster::set_config_option(.y, .x)
)
init_vrt_path <- tempfile(fileext = ".vrt")
init_vrt <- gdalraster::buildVRT(
init_vrt_path,
src_files,
cl_arg = vrt_options,
quiet = TRUE
)
tvrt <- xml2::read_xml(init_vrt_path)
bands <- xml2::xml_find_all(tvrt, "//VRTRasterBand")
purrr::walk(bands, function(x) {
xml2::xml_set_attr(x, "subClass", "VRTDerivedRasterBand")
# Add pixel function elements
xml2::xml_add_child(x, "PixelFunctionType", "median")
xml2::xml_add_child(x, "PixelFunctionLanguage", "Python")
xml2::xml_add_child(x, "PixelFunctionCode", glue::glue("
import numpy as np
def median(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize, raster_ysize, buf_radius, gt, **kwargs):
out_ar[:] = np.nan{fun}(in_ar, axis=0)
"))
})
# Write modified VRT
med_vrt <- tempfile(fileext = ".vrt")
xml2::write_xml(tvrt, med_vrt)
gdalraster::warp(
med_vrt,
outfile,
t_srs = t_srs,
cl_arg = warp_options,
quiet = quiet
)
return(outfile)
}
@h-a-graham
Copy link
Author

e.g.

vrt_composite(raster_sources, "median_raster.tif",
    vrt_options = c("-b", "2", "-b", "3", "-b", "4")
  )

@mdsumner
Copy link

mdsumner commented Feb 15, 2025

I have done a re-work of what I did in hypertidy/tacky, I'm getting a table of geomedian values in memory for each output tile, and plotting that with terra. This would be better done via gdalraster, having an open dataset of final .tif that gets written to during the loop. This is probably done better in tacky (with targets) but it's easier to understand and experiment with like this. I'm shamelessly using packages of mine off github just because I'm used to the simplicity (you can tile with stars or terra, they give a slightly differnet output and grout does the same logically but closer to what GDAL does with blocks conceptually).

Cheers! happy we can do so much with gdalraster now, keen to do more

## we aren't doing random numers, and we aren't scared of crashing in rstudio
options(parallelly.fork.enable = TRUE, future.rng.onMisuse = "ignore")
library(furrr); plan(multicore)

library(terra)
library(reproj)

library(vaster) ## remotes::install_github(c("hypertidy/vaster", "hypertidy/grout")
library(grout)
library(sds)

loc_crs <- "+proj=laea +lon_0=-63.8 +lat_0=-8.8"
loc_ex <- c(-1, 1, -1, 1) * 5000
ex <- reproj::reproj_extent(loc_ex, "EPSG:4326", source = loc_crs)
srcurl <- sds::stacit(ex, date = c("2024-02-01", "2024-02-29"))
src <- jsonlite::fromJSON(srcurl)
## there are 51 of these
src$features$assets$red$href

epsg <- src$features$properties[["proj:epsg"]]

href <- src$features$assets$visual$href

## output tiling (this what we parallel over)
##outdim <- c(2048, 2048)  ## remember we need to have the dim match the output so we could use res here 
## but rast() can snap for us
template <- rast(ext(loc_ex), res = 20)
outdim <- dim(template)[2:1]

tiles <- grout::tile_index(grout::grout(outdim, loc_ex, blocksize = c(512, 512)))
to_rgb_table <- function(x) {
  colnames(x) <- c("red", "green", "blue")
  tibble::as_tibble(x)
}
wfun <- function(.x, .y, .z, te, t_srs, ts) {
  gdalraster::warp(.x, .y, t_srs = t_srs, cl_arg = c("-te", te, "-ts", ts))
 # browser()
  terra::values(terra::rast(.y), dataframe = TRUE) |> tibble::as_tibble() |> setNames(c("red", "green", "blue")) |> 
    dplyr::mutate(cell = dplyr::row_number()) |> arrow::write_parquet(.z)
  .z
}

vaster::plot_extent(loc_ex, asp = 1)
for (i in seq_len(nrow(tiles))) {
  dst_files <- replicate(length(href), tempfile(fileext = ".tif", tmpdir = "./scratch"))
  
  parquet_files <- replicate(length(href), tempfile(fileext = ".parquet", tmpdir = "./scratch"))
  
   tile <- tiles[i, ]
   te <- unlist(tile[c("xmin", "ymin", "xmax", "ymax")])

   future_pmap(list(.x = dsn::vsicurl(href), .y = dst_files, .z = parquet_files), wfun,  
            te = te, t_srs = loc_crs, ts = unlist(tile[c("ncol", "nrow")]))

   ## this is the fastest way to group by on pixel cells, from fragmented parquet on disk
   med <- duckdbfs::open_dataset(parquet_files) |> dplyr::group_by(cell) |> dplyr::summarise(red = stats::median(red, na.rm = TRUE),
                                                                                          green = stats::median(green, na.rm = TRUE), 
                                                                                          blue = stats::median(blue, na.rm = TRUE)) |> 
  dplyr::arrange(cell) |>   dplyr::select(red, green, blue) |>
  dplyr::collect()

    r <- setValues(rast(dst_files[i]), med)
   plotRGB(r, add = TRUE) 
   print(i)
} 

plan(sequential)

image

@mdsumner
Copy link

in my way we also need to get the original red/green/blue not just the visual summary, and apply our own stretch to those 16 bit integers, but that's fairly straightforward too with gdalraster

I'm not sure the pixel function way can work, but maybe what you mean by median is different to what I'm thinking

@mdsumner
Copy link

I'm doing something entirely wrong now, but I did fix a lot of things mentioned above, so just code for the record

## we aren't doing random numers, and we aren't scared of crashing in rstudio
options(parallelly.fork.enable = TRUE, future.rng.onMisuse = "ignore")
library(furrr); plan(multicore)

library(terra)
library(reproj)

library(vaster) ## remotes::install_github(c("hypertidy/vaster", "hypertidy/grout")
library(grout)
library(sds)

# loc_crs <- "+proj=laea +lon_0=147 +lat_0=-42"
# loc_ex <- c(-1, 1, -1, 1) * 20000
loc_crs <- "+proj=laea +lon_0=-63.8 +lat_0=-8.8"
loc_ex <- c(-1, 1, -1, 1) * 5000

ex <- reproj::reproj_extent(loc_ex, "EPSG:4326", source = loc_crs)
srcurl <- sds::stacit(ex, date = c("2024-02-01", "2024-02-29"))
src <- jsonlite::fromJSON(srcurl)
## there are 10s of thes over 2 months, so think of blocksize below * this length
#src$features$assets$red$href


rgb <- cbind(src$features$assets$red$href, 
             src$features$assets$green$href, 
             src$features$assets$blue$href)

in_dsn <- replicate(nrow(rgb), tempfile(fileext = ".vrt", tmpdir = "./scratch"))
listof <- vector("list", nrow(rgb))
for (j in seq_along(in_dsn)) {
  listof[[j]] <- sprintf("/vsicurl/%s", rgb[j, , drop = TRUE])
}



future_walk2( in_dsn, listof, \(.x, .y) gdalraster::buildVRT(.x, .y, cl_arg = c("-separate")))


## output tiling (this what we loop over, use parallel for the multiple inputs at each tile)
##outdim <- c(2048, 2048)  ## remember we need to have the dim match the output so we could use res here 
## but rast() can snap for us
template <- rast(ext(loc_ex), res = 20)
outdim <- dim(template)[2:1]
print(outdim)

tiles <- grout::tile_index(grout::grout(outdim, loc_ex, blocksize = c(512, 512)))
to_rgb_table <- function(x) {
  colnames(x) <- c("red", "green", "blue")
  tibble::as_tibble(x)
}
wfun <- function(.x, .y, .z, te, t_srs, ts) {
  gdalraster::warp(.x, .y, t_srs = t_srs, cl_arg = c("-te", te, "-ts", ts, "-ot", "Int16"))
 # browser()
  terra::values(terra::rast(.y, raw = TRUE), dataframe = TRUE) |> tibble::as_tibble() |> setNames(c("red", "green", "blue")) |> 
    dplyr::mutate(cell = dplyr::row_number()) |> arrow::write_parquet(.z)
  .z  
}

vaster::plot_extent(loc_ex, asp = 1)
for (i in seq_len(nrow(tiles))) {
  dst_files <- replicate(length(in_dsn), tempfile(fileext = ".tif", tmpdir = "./scratch"))
  
  parquet_files <- replicate(length(in_dsn), tempfile(fileext = ".parquet", tmpdir = "./scratch"))
  
  tile <- tiles[i, ]
  te <- unlist(tile[c("xmin", "ymin", "xmax", "ymax")])

  future_pmap(list(.x = in_dsn, .y = dst_files, .z = parquet_files), wfun,  
            te = te, t_srs = loc_crs, ts = unlist(tile[c("ncol", "nrow")]))
  ## this is the fastest way to group by on pixel cells, from fragmented parquet on disk
  med <- duckdbfs::open_dataset(parquet_files) |> dplyr::group_by(cell) |> dplyr::summarise(red = stats::median(red, na.rm = TRUE),
                                                                                          green = stats::median(green, na.rm = TRUE), 
                                                                                          blue = stats::median(blue, na.rm = TRUE)) |> 
  dplyr::arrange(cell) |>   dplyr::select(red, green, blue) |>
  dplyr::collect()

  ## these come out in 0,1 (and close to 0.45)
  scaled <- tibble::as_tibble(lapply(med[c("red", "green", "blue")], scales::rescale, from = c(4500, 8000), to = c(0, 255)))
  clamp <- function(x) {
    x[x < 0] <- 0
    x[x > 255] <- 255
    x
  }
  for (ii in 1:3) scaled[[ii]] <- clamp(scaled[[ii]])
  r <- setValues(rast(dst_files[i], raw = TRUE), scaled)
  plotRGB(r, add = TRUE) 
  print(i)
} 


@mdsumner
Copy link

ps don't run this code without care, clear out ./scratch each time. I was way too blithe and having problems with a very long list of files in a directory ...

@h-a-graham
Copy link
Author

h-a-graham commented Feb 16, 2025

Thanks so much for all of this will dig into this more asap!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment