-
-
Save h-a-graham/11977fc9a934e20e7b466fe83cb28383 to your computer and use it in GitHub Desktop.
#' Create a composite raster with a VRT pixel function | |
#' @param src_files a character vector with the paths to the source rasters. | |
#' @param outfile a character path to the output raster. | |
#' @param fun a character with the pixel function to apply. | |
#' @param t_srs a character with the target SRS. If empty string "", the spatial | |
#' reference of src_files[1] will be used. | |
#' @param vrt_options a character vector with additional options to pass to the | |
#' gdalbuildVRT command. | |
#' @param warp_options a character vector with additional options to pass to the | |
#' gdalwarp command. | |
#' @param config_options a named character vector with additional GDAL configuration | |
#' options. | |
#' @param quiet a logical indicating whether to suppress progress bar. | |
#' @return a character path to the output raster. | |
vrt_composite <- function( | |
src_files, | |
outfile, | |
fun = c("median", "mean"), | |
t_srs = "", | |
vrt_options = NULL, | |
warp_options = c( | |
"-multi", | |
"-overwrite", | |
"-co", "COMPRESS=DEFLATE", | |
"-co", "PREDICTOR=2", | |
"-co", "NUM_THREADS=ALL_CPUS" | |
), | |
config_options = c( | |
GDAL_VRT_ENABLE_PYTHON = "YES", | |
VSI_CACHE = "TRUE", | |
GDAL_CACHEMAX = "30%", | |
VSI_CACHE_SIZE = "10000000", | |
GDAL_HTTP_MULTIPLEX = "YES", | |
GDAL_INGESTED_BYTES_AT_OPEN = "32000", | |
GDAL_DISABLE_READDIR_ON_OPEN = "EMPTY_DIR", | |
GDAL_HTTP_VERSION = "2", | |
GDAL_HTTP_MERGE_CONSECUTIVE_RANGES = "YES", | |
GDAL_NUM_THREADS = "ALL_CPUS" | |
), | |
quiet = FALSE) { | |
fun <- rlang::arg_match(fun) | |
purrr::iwalk( | |
config_options, | |
~ gdalraster::set_config_option(.y, .x) | |
) | |
init_vrt_path <- tempfile(fileext = ".vrt") | |
init_vrt <- gdalraster::buildVRT( | |
init_vrt_path, | |
src_files, | |
cl_arg = vrt_options, | |
quiet = TRUE | |
) | |
tvrt <- xml2::read_xml(init_vrt_path) | |
bands <- xml2::xml_find_all(tvrt, "//VRTRasterBand") | |
purrr::walk(bands, function(x) { | |
xml2::xml_set_attr(x, "subClass", "VRTDerivedRasterBand") | |
# Add pixel function elements | |
xml2::xml_add_child(x, "PixelFunctionType", "median") | |
xml2::xml_add_child(x, "PixelFunctionLanguage", "Python") | |
xml2::xml_add_child(x, "PixelFunctionCode", glue::glue(" | |
import numpy as np | |
def median(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize, raster_ysize, buf_radius, gt, **kwargs): | |
out_ar[:] = np.nan{fun}(in_ar, axis=0) | |
")) | |
}) | |
# Write modified VRT | |
med_vrt <- tempfile(fileext = ".vrt") | |
xml2::write_xml(tvrt, med_vrt) | |
gdalraster::warp( | |
med_vrt, | |
outfile, | |
t_srs = t_srs, | |
cl_arg = warp_options, | |
quiet = quiet | |
) | |
return(outfile) | |
} |
I have done a re-work of what I did in hypertidy/tacky, I'm getting a table of geomedian values in memory for each output tile, and plotting that with terra. This would be better done via gdalraster, having an open dataset of final .tif that gets written to during the loop. This is probably done better in tacky (with targets) but it's easier to understand and experiment with like this. I'm shamelessly using packages of mine off github just because I'm used to the simplicity (you can tile with stars or terra, they give a slightly differnet output and grout does the same logically but closer to what GDAL does with blocks conceptually).
Cheers! happy we can do so much with gdalraster now, keen to do more
## we aren't doing random numers, and we aren't scared of crashing in rstudio
options(parallelly.fork.enable = TRUE, future.rng.onMisuse = "ignore")
library(furrr); plan(multicore)
library(terra)
library(reproj)
library(vaster) ## remotes::install_github(c("hypertidy/vaster", "hypertidy/grout")
library(grout)
library(sds)
loc_crs <- "+proj=laea +lon_0=-63.8 +lat_0=-8.8"
loc_ex <- c(-1, 1, -1, 1) * 5000
ex <- reproj::reproj_extent(loc_ex, "EPSG:4326", source = loc_crs)
srcurl <- sds::stacit(ex, date = c("2024-02-01", "2024-02-29"))
src <- jsonlite::fromJSON(srcurl)
## there are 51 of these
src$features$assets$red$href
epsg <- src$features$properties[["proj:epsg"]]
href <- src$features$assets$visual$href
## output tiling (this what we parallel over)
##outdim <- c(2048, 2048) ## remember we need to have the dim match the output so we could use res here
## but rast() can snap for us
template <- rast(ext(loc_ex), res = 20)
outdim <- dim(template)[2:1]
tiles <- grout::tile_index(grout::grout(outdim, loc_ex, blocksize = c(512, 512)))
to_rgb_table <- function(x) {
colnames(x) <- c("red", "green", "blue")
tibble::as_tibble(x)
}
wfun <- function(.x, .y, .z, te, t_srs, ts) {
gdalraster::warp(.x, .y, t_srs = t_srs, cl_arg = c("-te", te, "-ts", ts))
# browser()
terra::values(terra::rast(.y), dataframe = TRUE) |> tibble::as_tibble() |> setNames(c("red", "green", "blue")) |>
dplyr::mutate(cell = dplyr::row_number()) |> arrow::write_parquet(.z)
.z
}
vaster::plot_extent(loc_ex, asp = 1)
for (i in seq_len(nrow(tiles))) {
dst_files <- replicate(length(href), tempfile(fileext = ".tif", tmpdir = "./scratch"))
parquet_files <- replicate(length(href), tempfile(fileext = ".parquet", tmpdir = "./scratch"))
tile <- tiles[i, ]
te <- unlist(tile[c("xmin", "ymin", "xmax", "ymax")])
future_pmap(list(.x = dsn::vsicurl(href), .y = dst_files, .z = parquet_files), wfun,
te = te, t_srs = loc_crs, ts = unlist(tile[c("ncol", "nrow")]))
## this is the fastest way to group by on pixel cells, from fragmented parquet on disk
med <- duckdbfs::open_dataset(parquet_files) |> dplyr::group_by(cell) |> dplyr::summarise(red = stats::median(red, na.rm = TRUE),
green = stats::median(green, na.rm = TRUE),
blue = stats::median(blue, na.rm = TRUE)) |>
dplyr::arrange(cell) |> dplyr::select(red, green, blue) |>
dplyr::collect()
r <- setValues(rast(dst_files[i]), med)
plotRGB(r, add = TRUE)
print(i)
}
plan(sequential)
in my way we also need to get the original red/green/blue not just the visual summary, and apply our own stretch to those 16 bit integers, but that's fairly straightforward too with gdalraster
I'm not sure the pixel function way can work, but maybe what you mean by median is different to what I'm thinking
I'm doing something entirely wrong now, but I did fix a lot of things mentioned above, so just code for the record
## we aren't doing random numers, and we aren't scared of crashing in rstudio
options(parallelly.fork.enable = TRUE, future.rng.onMisuse = "ignore")
library(furrr); plan(multicore)
library(terra)
library(reproj)
library(vaster) ## remotes::install_github(c("hypertidy/vaster", "hypertidy/grout")
library(grout)
library(sds)
# loc_crs <- "+proj=laea +lon_0=147 +lat_0=-42"
# loc_ex <- c(-1, 1, -1, 1) * 20000
loc_crs <- "+proj=laea +lon_0=-63.8 +lat_0=-8.8"
loc_ex <- c(-1, 1, -1, 1) * 5000
ex <- reproj::reproj_extent(loc_ex, "EPSG:4326", source = loc_crs)
srcurl <- sds::stacit(ex, date = c("2024-02-01", "2024-02-29"))
src <- jsonlite::fromJSON(srcurl)
## there are 10s of thes over 2 months, so think of blocksize below * this length
#src$features$assets$red$href
rgb <- cbind(src$features$assets$red$href,
src$features$assets$green$href,
src$features$assets$blue$href)
in_dsn <- replicate(nrow(rgb), tempfile(fileext = ".vrt", tmpdir = "./scratch"))
listof <- vector("list", nrow(rgb))
for (j in seq_along(in_dsn)) {
listof[[j]] <- sprintf("/vsicurl/%s", rgb[j, , drop = TRUE])
}
future_walk2( in_dsn, listof, \(.x, .y) gdalraster::buildVRT(.x, .y, cl_arg = c("-separate")))
## output tiling (this what we loop over, use parallel for the multiple inputs at each tile)
##outdim <- c(2048, 2048) ## remember we need to have the dim match the output so we could use res here
## but rast() can snap for us
template <- rast(ext(loc_ex), res = 20)
outdim <- dim(template)[2:1]
print(outdim)
tiles <- grout::tile_index(grout::grout(outdim, loc_ex, blocksize = c(512, 512)))
to_rgb_table <- function(x) {
colnames(x) <- c("red", "green", "blue")
tibble::as_tibble(x)
}
wfun <- function(.x, .y, .z, te, t_srs, ts) {
gdalraster::warp(.x, .y, t_srs = t_srs, cl_arg = c("-te", te, "-ts", ts, "-ot", "Int16"))
# browser()
terra::values(terra::rast(.y, raw = TRUE), dataframe = TRUE) |> tibble::as_tibble() |> setNames(c("red", "green", "blue")) |>
dplyr::mutate(cell = dplyr::row_number()) |> arrow::write_parquet(.z)
.z
}
vaster::plot_extent(loc_ex, asp = 1)
for (i in seq_len(nrow(tiles))) {
dst_files <- replicate(length(in_dsn), tempfile(fileext = ".tif", tmpdir = "./scratch"))
parquet_files <- replicate(length(in_dsn), tempfile(fileext = ".parquet", tmpdir = "./scratch"))
tile <- tiles[i, ]
te <- unlist(tile[c("xmin", "ymin", "xmax", "ymax")])
future_pmap(list(.x = in_dsn, .y = dst_files, .z = parquet_files), wfun,
te = te, t_srs = loc_crs, ts = unlist(tile[c("ncol", "nrow")]))
## this is the fastest way to group by on pixel cells, from fragmented parquet on disk
med <- duckdbfs::open_dataset(parquet_files) |> dplyr::group_by(cell) |> dplyr::summarise(red = stats::median(red, na.rm = TRUE),
green = stats::median(green, na.rm = TRUE),
blue = stats::median(blue, na.rm = TRUE)) |>
dplyr::arrange(cell) |> dplyr::select(red, green, blue) |>
dplyr::collect()
## these come out in 0,1 (and close to 0.45)
scaled <- tibble::as_tibble(lapply(med[c("red", "green", "blue")], scales::rescale, from = c(4500, 8000), to = c(0, 255)))
clamp <- function(x) {
x[x < 0] <- 0
x[x > 255] <- 255
x
}
for (ii in 1:3) scaled[[ii]] <- clamp(scaled[[ii]])
r <- setValues(rast(dst_files[i], raw = TRUE), scaled)
plotRGB(r, add = TRUE)
print(i)
}
ps don't run this code without care, clear out ./scratch each time. I was way too blithe and having problems with a very long list of files in a directory ...
Thanks so much for all of this will dig into this more asap!
e.g.