dblodgett-usgs · May 14, 2022 19:32 · dblodgett-usgs · May 14, 2022
diff --git a/pull_gagesii_dv.R b/pull_gagesii_dv.R
 library(sf)
 library(dplyr)
 library(dataRetrieval)
 library(RNetCDF)

 # https://water.usgs.gov/GIS/metadata/usgswrd/XML/gagesII_Sept2011.xml
 gagesii <- sf::read_sf("R/nexus_locations/gagesII_9322_point_shapefile/gagesII_9322_sept30_2011.shp")

 # Just look at reference gages with recent active status and more than 80 years of record.
 ref <- dplyr::filter(gagesii, CLASS == "Ref" & ACTIVE09 == "yes" & FLYRS1900 > 80)
 i<-1
 for(site in ref$STAID) {
  file <- paste0("out/", site, ".rds")
  if(!file.exists(file)) {
    try(
      # just builds a cache of data for later.
      saveRDS(dataRetrieval::readNWISdv(site, "00060", startDate = "1900-01-01"),
              file = file), silent = FALSE)
    print(paste(site, i))
  }
  i <- i + 1
 }

 outfiles <- list.files("out/")

 out_times <- seq(from = as.Date("1900-01-01"), 
                 to = as.Date("2022-05-14 00:00"), by = "day")

 out_data <- data.frame(matrix(numeric(0), 
                              nrow = length(out_times), 
                              ncol = length(outfiles)))

 out_meta <- data.frame(matrix(character(0), 
                              nrow = length(out_times), 
                              ncol = length(outfiles)), stringsAsFactors = FALSE)

 names(out_data) <- names(out_meta) <- gsub(".rds", "", outfiles)

 max_gap <- 5

 # now we can step through the data and populate our big array.
 i <- 0
 for(f in outfiles) {
  dat <- readRDS(file.path("out/", f))
  
  if(nrow(dat) > 0) {
    site <- dat$site_no[1]
    
    out_dat <- approx(dat$Date, dat$X_00060_00003, out_times, rule = 1)
    
    out_met <- data.frame(dateTime = out_times, seq = seq(1, length(out_times)))
    out_met_temp <- approx(out_met$dateTime, out_met$seq, dat$Date, method = "constant") %>%
      as.data.frame() %>%
      left_join(select(dat, dt = Date, cd = X_00060_00003_cd), by = c("x" = "dt")) %>%
      group_by(y) %>%
      filter(row_number() == 1) %>%
      ungroup() %>% select(-x)
    
    out_met <- left_join(out_met, out_met_temp, by = c("seq" = "y")) %>%
      select(-seq)
    
    # the method here fills missing record but we need to remove the stuff
    # that was interpolated in a large gap.
    diff_time <- diff(as.numeric(dat$Date))
    too_big_ind <- which(diff_time > max_gap)
    too_big_start <- dat$Date[too_big_ind]
    too_big_end <- dat$Date[too_big_ind + 1]
    
    for(gap in 1:length(too_big_start)) {
      test <- out_dat$x > too_big_start[gap] & out_dat$x < too_big_end[gap]
      out_dat$y[test] <- NA
      out_met$cd[test] <- ""
    }
    
    out_data[site] <- out_dat$y
    out_meta[site] <- out_met$cd
  }
  i<-i+1
  print(paste(i, length(outfiles)))
 }

 attributes <- data.frame(staid = names(out_data)) %>%
  left_join(st_set_geometry(ref, NULL), by = c("staid" = "STAID"))

 lats <- attributes$LAT_GAGE
 lons <- attributes$LNG_GAGE

 attributes <- select(attributes, 
                     station_name = STANAME, 
                     da_sqkm = DRAIN_SQKM,
                     hcdn_2009 = HCDN_2009)

 attributes$hcdn_2009[is.na(attributes$hcdn_2009)] <- "no"

 ncdfgeom::write_timeseries_dsg("top_nwis_dv.nc", 
                               instance_names = names(out_data), 
                               lats = lats, lons = lons, 
                               times = out_times, 
                               data = out_data, 
                               data_unit = "ft^3/s", 
                               data_prec = "float", 
                               data_metadata = list(name = "Discharge", 
                                                    long_name = "Observed daily river discharge in CFS"), 
                               overwrite = TRUE)

 ncdfgeom::write_timeseries_dsg("top_nwis_dv.nc", 
                               instance_names = names(out_meta), 
                               lats = lats, 
                               lons = lons,
                               times = out_times, 
                               data = out_meta, 
                               data_unit = "none", 
                               data_prec = "char", 
                               data_metadata = list(name = "quality_flags", 
                                    long_name = "NWIS quality flags"), 
                               add_to_existing = TRUE)

 ncdfgeom::write_attribute_data("top_nwis_dv.nc", attributes, 
                               units = c("none", "km^2", "none"))

 system("nccopy -k nc4 -d 1 -c instance/1,time/,char/,instance_name_char/,quality_flags_char/ top_nwis_dv.nc top_nwis_dv_2.nc")
 file.rename("top_nwis_dv_2.nc", "top_nwis_dv.nc")

 saveRDS(out_data, "out_data.rds")
 saveRDS(out_meta, "out_meta.rds")
 saveRDS(out_times, "out_times.rds")

 ##### Testing and exploration #####

 nwis_sites <- var.get.nc(nc_nwis, "instance_name")
 nwm_lookup <- var.get.nc(nc_nwis, "COMID")

 nwis_time <- utcal.nc(att.get.nc(nc_nwis, "time", "units"), 
                      var.get.nc(nc_nwis, "time"), "c")

 site <- nwis_sites[50]

 nwis_index <- which(nwis_sites == site)

 nwis_data <- data.frame(time = nwis_time, 
                        q = var.get.nc(nc_nwis, "Discharge", 
                                       start = c(1, nwis_index), 
                                       count = c(NA, 1), unpack = TRUE))

 has_data <- which(!is.na(nwis_data$q))

 start <- nwis_data$time[has_data[1]]
 end <- nwis_data$time[tail(has_data, 1)]

 p_nwis <- filter(nwis_data, time > start & time < end)

 plot(p_nwis$time, p_nwis$q)
	library(sf)
	library(dplyr)
	library(dataRetrieval)
	library(RNetCDF)

	# https://water.usgs.gov/GIS/metadata/usgswrd/XML/gagesII_Sept2011.xml
	gagesii <- sf::read_sf("R/nexus_locations/gagesII_9322_point_shapefile/gagesII_9322_sept30_2011.shp")

	# Just look at reference gages with recent active status and more than 80 years of record.
	ref <- dplyr::filter(gagesii, CLASS == "Ref" & ACTIVE09 == "yes" & FLYRS1900 > 80)
	i<-1
	for(site in ref$STAID) {
	file <- paste0("out/", site, ".rds")
	if(!file.exists(file)) {
	try(
	# just builds a cache of data for later.
	saveRDS(dataRetrieval::readNWISdv(site, "00060", startDate = "1900-01-01"),
	file = file), silent = FALSE)
	print(paste(site, i))
	}
	i <- i + 1
	}

	outfiles <- list.files("out/")

	out_times <- seq(from = as.Date("1900-01-01"),
	to = as.Date("2022-05-14 00:00"), by = "day")

	out_data <- data.frame(matrix(numeric(0),
	nrow = length(out_times),
	ncol = length(outfiles)))

	out_meta <- data.frame(matrix(character(0),
	nrow = length(out_times),
	ncol = length(outfiles)), stringsAsFactors = FALSE)

	names(out_data) <- names(out_meta) <- gsub(".rds", "", outfiles)

	max_gap <- 5

	# now we can step through the data and populate our big array.
	i <- 0
	for(f in outfiles) {
	dat <- readRDS(file.path("out/", f))

	if(nrow(dat) > 0) {
	site <- dat$site_no[1]

	out_dat <- approx(dat$Date, dat$X_00060_00003, out_times, rule = 1)

	out_met <- data.frame(dateTime = out_times, seq = seq(1, length(out_times)))
	out_met_temp <- approx(out_met$dateTime, out_met$seq, dat$Date, method = "constant") %>%
	as.data.frame() %>%
	left_join(select(dat, dt = Date, cd = X_00060_00003_cd), by = c("x" = "dt")) %>%
	group_by(y) %>%
	filter(row_number() == 1) %>%
	ungroup() %>% select(-x)

	out_met <- left_join(out_met, out_met_temp, by = c("seq" = "y")) %>%
	select(-seq)

	# the method here fills missing record but we need to remove the stuff
	# that was interpolated in a large gap.
	diff_time <- diff(as.numeric(dat$Date))
	too_big_ind <- which(diff_time > max_gap)
	too_big_start <- dat$Date[too_big_ind]
	too_big_end <- dat$Date[too_big_ind + 1]

	for(gap in 1:length(too_big_start)) {
	test <- out_dat$x > too_big_start[gap] & out_dat$x < too_big_end[gap]
	out_dat$y[test] <- NA
	out_met$cd[test] <- ""
	}

	out_data[site] <- out_dat$y
	out_meta[site] <- out_met$cd
	}
	i<-i+1
	print(paste(i, length(outfiles)))
	}

	attributes <- data.frame(staid = names(out_data)) %>%
	left_join(st_set_geometry(ref, NULL), by = c("staid" = "STAID"))

	lats <- attributes$LAT_GAGE
	lons <- attributes$LNG_GAGE

	attributes <- select(attributes,
	station_name = STANAME,
	da_sqkm = DRAIN_SQKM,
	hcdn_2009 = HCDN_2009)

	attributes$hcdn_2009[is.na(attributes$hcdn_2009)] <- "no"

	ncdfgeom::write_timeseries_dsg("top_nwis_dv.nc",
	instance_names = names(out_data),
	lats = lats, lons = lons,
	times = out_times,
	data = out_data,
	data_unit = "ft^3/s",
	data_prec = "float",
	data_metadata = list(name = "Discharge",
	long_name = "Observed daily river discharge in CFS"),
	overwrite = TRUE)

	ncdfgeom::write_timeseries_dsg("top_nwis_dv.nc",
	instance_names = names(out_meta),
	lats = lats,
	lons = lons,
	times = out_times,
	data = out_meta,
	data_unit = "none",
	data_prec = "char",
	data_metadata = list(name = "quality_flags",
	long_name = "NWIS quality flags"),
	add_to_existing = TRUE)

	ncdfgeom::write_attribute_data("top_nwis_dv.nc", attributes,
	units = c("none", "km^2", "none"))

	system("nccopy -k nc4 -d 1 -c instance/1,time/,char/,instance_name_char/,quality_flags_char/ top_nwis_dv.nc top_nwis_dv_2.nc")
	file.rename("top_nwis_dv_2.nc", "top_nwis_dv.nc")

	saveRDS(out_data, "out_data.rds")
	saveRDS(out_meta, "out_meta.rds")
	saveRDS(out_times, "out_times.rds")

	##### Testing and exploration #####

	nwis_sites <- var.get.nc(nc_nwis, "instance_name")
	nwm_lookup <- var.get.nc(nc_nwis, "COMID")

	nwis_time <- utcal.nc(att.get.nc(nc_nwis, "time", "units"),
	var.get.nc(nc_nwis, "time"), "c")

	site <- nwis_sites[50]

	nwis_index <- which(nwis_sites == site)

	nwis_data <- data.frame(time = nwis_time,
	q = var.get.nc(nc_nwis, "Discharge",
	start = c(1, nwis_index),
	count = c(NA, 1), unpack = TRUE))

	has_data <- which(!is.na(nwis_data$q))

	start <- nwis_data$time[has_data[1]]
	end <- nwis_data$time[tail(has_data, 1)]

	p_nwis <- filter(nwis_data, time > start & time < end)

	plot(p_nwis$time, p_nwis$q)