Last active
May 6, 2024 01:31
-
-
Save daltare/6185457d3d81ea59fe7c1730a7e40318 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This example shows one way to access subsets of CEDEN data via the California open data portal API (it's not | |
# necessarily the only or best way though) - if you find any problems or have questions, please contact: [email protected] | |
# This example applies to the following CEDEN datasets on the open data portal: | |
# Toxicity: https://data.ca.gov/dataset/surface-water-toxicity-results/resource/bd484e9b-426a-4ba6-ba4d-f5f8ce095836 | |
# Benthic Macroinvertebrates: https://data.ca.gov/dataset/surface-water-benthic-macroinvertebrate-results/resource/3dfee140-47d5-4e29-99ae-16b9b12a404f | |
# (NOTE: other CEDEN datasets are split by year, and cannont be queried across multiple years using this method -- for other CEDEN datasets, see: https://gist.github.com/daltare/2de1517ad1e315c4b1cad01278de96dd) | |
# This example assumes that you have your own API key saved in a system environment variable | |
# named: "data_portal_key" (to obtain an API key, create an account at data.ca.gov, then go to your | |
# account page and look for the "API Key" header in the lower left side of the page) | |
# NOTE: There appears to be a limit of 32,000 records that can be returned from the API in a single call -- if your API query | |
# is likely to return more records than that limit, you'll need to break it up into multiple calls to the API | |
# The function below returns a warning message when this occurs | |
# load packages | |
library(ckanr) | |
library(dplyr) | |
library(lubridate) | |
library(httr) | |
library(jsonlite) | |
library(glue) | |
library(readr) | |
library(purrr) | |
# set the option to use standard notation rather than scientific notation | |
options(scipen = 999) | |
# create a generic function to run an API query and return the formatted results, given a URL for the query | |
generic_api_query <- function(query_url) { | |
# encode URL | |
url_encoded <- URLencode(query_url) | |
# get the data | |
query_response <- httr::GET(url_encoded, add_headers(Authorization = Sys.getenv('data_portal_key'))) | |
query_char <- rawToChar(query_response$content) | |
query_content <- jsonlite::fromJSON(query_char) | |
resource_records <- query_content$result$records | |
# get information about the fields included in the dataset (allows for reording the fields to match the original dataset) | |
resource_fields <- query_content$result$fields %>% | |
filter(!id %in% c('_id', '_full_text')) %>% # drop these two fields, which are added by the portal and not actually part of the dataset | |
pull(id) | |
# format the data- convert to tibble, set the data types, and reorder the fields to match the order of the original dataset | |
resource_records <- tibble(resource_records) %>% | |
type_convert(na = c("", "NaN")) %>% | |
select(all_of(resource_fields)) | |
# print a warning if the API limit was reached by the query | |
if (nrow(resource_records) == 32000) { | |
warning('WARNING: the API record limit was reached - it\'s likely that some records matching your query were not returned') | |
} | |
# return the results | |
return(resource_records) | |
} | |
# create a function to download data from the API, given an input resource ID and optional filters for date, text, and numeric fields | |
get_api_data <- function(resource_id, filters_date_start = NA, filters_date_end = NA, filters_text = NA, filters_numeric = NA, filters_numeric_drop_na = NA) { | |
# NOTES: any of the filters_... variables can be left as NA to ignore them | |
# for date and text filters, enter inputs as named character vectors (multiple name-value pairs can be enterd) -- e.g: c('field_name' = 'query_value') | |
# for numeric filters, include the condition (>, <, >=, <=, <>) as part of the value (multiple name-condition pairs can be entered) -- e.g.: c('field_name' = '> query_value') | |
# for the filters_numeric_drop_na field, just enter the name of the field to drop missing numeric values from (multiple field names can be entered) -- e.g., c('field_name') | |
# format the filter for minimum/starting dates | |
if (!is.na(filters_date_start[1])) { | |
query_date_start <- map2_chr(filters_date_start, names(filters_date_start), ~glue("\"{.y}\" >= '{.x}'")) %>% | |
unname() %>% | |
glue_collapse(sep = ' AND ') | |
} else { | |
query_date_start <- c() | |
} | |
# format the filters for maximum/ending dates | |
if (!is.na(filters_date_end[1])) { | |
query_date_end <- map2_chr(filters_date_end, names(filters_date_end), ~glue("\"{.y}\" <= '{.x}'")) %>% | |
unname() %>% | |
glue_collapse(sep = ' AND ') | |
} else { | |
query_date_end <- c() | |
} | |
# format the filters for text values | |
if (!is.na(filters_text[1])) { | |
query_text_filter <- map2_chr(filters_text, names(filters_text), ~glue("\"{.y}\" = '{.x}'")) %>% | |
unname() %>% | |
glue_collapse(sep = ' AND ') | |
} else { | |
query_text_filter <- c() | |
} | |
# format the filters for numeric values | |
if (!is.na(filters_numeric[1])) { | |
query_numeric_filter <- map2_chr(filters_numeric, names(filters_numeric), ~glue("\"{.y}\" {.x}")) %>% | |
unname() %>% | |
glue_collapse(sep = ' AND ') | |
} else { | |
query_numeric_filter <- c() | |
} | |
# format the filters to drop missing/null numeric values | |
if (!is.na(filters_numeric_drop_na[1])) { | |
query_numeric_drop_na <- map_chr(filters_numeric_drop_na, ~glue("\"{.x}\" <> '{NaN}'")) %>% | |
# unname() %>% | |
glue_collapse(sep = ' AND ') | |
} else { | |
query_numeric_drop_na <- c() | |
} | |
# combine all of the different filter types into one statement | |
filters_combined <- glue_collapse(c(query_date_start, query_date_end, query_text_filter, query_numeric_filter, query_numeric_drop_na), sep = ' AND ') | |
# create URL for the API call | |
api_url <- glue("https://data.ca.gov/api/3/action/datastore_search_sql?sql=SELECT * from \"{resource_id}\"") | |
if (length(filters_combined) > 0) { # if there are additional filters, add them | |
api_url <- glue("{api_url} WHERE {filters_combined}") | |
} | |
# submit the API query using the constructed url, and return the resulting data frame | |
return(generic_api_query(api_url)) | |
} | |
#### EXAMPLE QUERY - GET DATA (using the toxicity dataset) #### | |
# make an API call to get some data (in this case, get samples between 2016-07-01 and 2018-06-30, | |
# where the analyte name is "Oxygen, Dissolved, Total", the units are "mg/L", and the result value is | |
# greater than 10 and is not missing/null) | |
df_toxicity <- get_api_data(resource_id = 'bd484e9b-426a-4ba6-ba4d-f5f8ce095836', | |
filters_date_start = c('SampleDate' = '2016-07-01'), | |
filters_date_end = c('SampleDate' = '2018-06-30'), | |
filters_text = c('Analyte' = 'Oxygen, Dissolved, Total', | |
'Unit' = 'mg/L'), | |
filters_numeric = c('Result' = '> 10'), | |
filters_numeric_drop_na = c('Result')) | |
#### EXAMPLE QUERY - VERIFY DATA #### | |
# verify that records with the correct analyte names and units were returned, and get the number of records | |
df_toxicity %>% count(Analyte, Unit) | |
# # A tibble: 1 x 3 | |
# Analyte Unit n | |
# <chr> <chr> <int> | |
# 1 Oxygen, Dissolved, Total mg/L 1463 | |
# Verify that the sample date range is correct | |
range(df_toxicity$SampleDate) | |
# [1] "2016-07-11 UTC" "2018-06-21 UTC" | |
# Verify that the result range is correct | |
range(df_toxicity$Result) | |
# [1] 10.07 88.00 | |
# view a summary of the entire dataset | |
glimpse(df_toxicity) | |
# Rows: 1,463 | |
# Columns: 88 | |
# $ Program <chr> "BASMAA Regional Monitoring Coalition", "BASMAA Regional M... | |
# $ ParentProject <chr> "BASMAA RMC Monitoring in WY2016", "BASMAA RMC Monitoring ... | |
# $ Project <chr> "ACCWP Creek Status Monitoring in WY2016", "SCVURPPP Pesti... | |
# $ StationName <chr> "Sausal at E.22nd", "San Tomas Aquino 250m u/s Mission Col... | |
# $ StationCode <chr> "204SAU030", "205STQ010", "205STQ010", "205STE021", "205ST... | |
# $ SampleDate <dttm> 2016-07-11, 2016-07-11, 2016-07-11, 2016-07-11, 2016-07-1... | |
# $ CollectionTime <dttm> 1899-12-30 12:40:00, 1899-12-30 14:30:00, 1899-12-30 14:3... | |
# $ LocationCode <chr> "Midchannel", "Bank", "Bank", "Bank", "Bank", "Midchannel"... | |
# $ CollectionDepth <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1... | |
# $ UnitCollectionDepth <chr> "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m"... | |
# $ SampleTypeCode <chr> "Grab", "Grab", "Grab", "FieldBLDup_Grab", "Grab", "Grab",... | |
# $ CollectionReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... | |
# $ LabReplicate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... | |
# $ ToxBatch <chr> "AMS_071216PP_C1_W_TOX", "KLI_071216_PP_C2_W_TOX", "KLI_07... | |
# $ LabSampleID <chr> "204SAU030-W-06", "205STQ010-W-01", "205STQ010-W-01", "250... | |
# $ MatrixName <chr> "samplewater", "samplewater", "samplewater", "samplewater"... | |
# $ MethodName <chr> "EPA 821/R-02-013", "EPA 821/R-02-013", "EPA 821/R-02-013"... | |
# $ ToxTestDurCode <chr> "7 days", "7 days", "4 days", "4 days", "4 days", "4 days"... | |
# $ OrganismName <chr> "Pimephales promelas", "Pimephales promelas", "Selenastrum... | |
# $ Analyte <chr> "Oxygen, Dissolved, Total", "Oxygen, Dissolved, Total", "O... | |
# $ Unit <chr> "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "m... | |
# $ WQSource <chr> "overlyingwater", "overlyingwater", "overlyingwater", "ove... | |
# $ TimePointName <chr> "initial", "initial", "final", "initial", "initial", "init... | |
# $ Result <dbl> 10.2, 12.0, 11.4, 10.1, 10.1, 11.2, 11.0, 11.5, 16.0, 15.0... | |
# $ ResultQualCode <chr> "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "="... | |
# $ ToxResultQACode <chr> "None", "None", "None", "None", "None", "None", "None", "N... | |
# $ QACode <chr> "None", "None", "None", "None", "None", "None", "None", "N... | |
# $ BatchVerificationCode <chr> "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR"... | |
# $ ComplianceCode <chr> "NR", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "NR... | |
# $ SampleComments <chr> NA, "Same as 205R01411, sampled 2015", "Same as 205R01411,... | |
# $ CollectionComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ ToxTestComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ TimePointComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ ToxResultComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ ToxBatchComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ EventCode <chr> "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ"... | |
# $ ProtocolCode <chr> "BASMAA_RMC_SOP2016v3", "BASMAA_RMC_SOP2016v3", "BASMAA_RM... | |
# $ SampleAgency <chr> "Applied Marine Sciences, Inc. California", "Santa Clara V... | |
# $ GroupSamples <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ CollectionMethodName <chr> "Water_Grab", "Water_Grab", "Water_Grab", "Water_Grab", "W... | |
# $ Latitude <dbl> 37.78566, 37.38654, 37.38654, 37.41056, 37.41056, 37.61729... | |
# $ Longitude <dbl> -122.2242, -121.9688, -121.9688, -122.0689, -122.0689, -12... | |
# $ CollectionDeviceDescription <chr> "Not Recorded", "Not Recorded", "Not Recorded", "Not Recor... | |
# $ PositionWaterColumn <chr> "Not Recorded", "Not Recorded", "Not Recorded", "Not Recor... | |
# $ TIENarrative <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ Dilution <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100... | |
# $ ToxPointMethod <chr> "ToxWQMeasurement", "ToxWQMeasurement", "ToxWQMeasurement"... | |
# $ Treatment <chr> "None", "None", "None", "None", "None", "None", "None", "N... | |
# $ UnitTreatment <chr> "none", "none", "none", "none", "none", "none", "none", "n... | |
# $ TreatmentConcentration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0... | |
# $ LabAgency <chr> "PER", "PER", "PER", "PER", "PER", "PER", "PER", "PER", "P... | |
# $ SubmittingAgency <chr> "AMS-CA", "SCVURPPP", "SCVURPPP", "SCVURPPP", "SCVURPPP", ... | |
# $ ToxBatchStartDate <chr> "Jul 12 2016 2:30PM", "Jul 12 2016 12:00AM", "Jul 12 2016... | |
# $ RefToxBatch <chr> "25990PPRTPER", "25990PPRTPER", "26019SCRTPER", "26021CHDR... | |
# $ OrganismAgeAtTestStart <chr> "<48 hrs", "<48 hrs", "6 days", "7 days", "7 days", "7 day... | |
# $ LabSubmissionCode <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"... | |
# $ OccupationMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Walk ... | |
# $ StartingBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "LB", ... | |
# $ DistanceFromBank <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -88, N... | |
# $ UnitDistanceFromBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "m", N... | |
# $ StreamWidth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -88, N... | |
# $ UnitStreamWidth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "m", N... | |
# $ StationWaterDepth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -88, N... | |
# $ UnitStationWaterDepth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "m", N... | |
# $ HydroMod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Pipes... | |
# $ HydroModLoc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "US", ... | |
# $ LocationDetailWQComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "upstr... | |
# $ PctControl <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ RepCount <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ Mean <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ StdDev <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ StatMethod <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ Probability <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ Alphalevel <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ EvalThreshold <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ MSD <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ SigEffectCode <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ ChannelWidth <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ UpstreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ DownstreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ TotalReach <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA... | |
# $ CalculatedValue <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ PercentEffect <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN... | |
# $ SampleID <chr> "204SAU030-W-06", "205STQ010-W-01", "205STQ010-W-01", "250... | |
# $ DW_AnalyteName <chr> "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, Dissolv... | |
# $ DataQuality <chr> "Unknown data quality", "Unknown data quality", "Unknown d... | |
# $ DataQualityIndicator <chr> "BatchVerificationCode:NR", "BatchVerificationCode:NR", "B... | |
# $ Datum <chr> "WGS84", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment