Minimal example of a Bayesian finite sample approach for wildfires using brms
library(tidyverse)
library(sf)
library(here)
library(brms)
library(lubridate)
library(reshape2)
# Get ecoregion data ------------------------------------------------------
download.file("ftp://newftp.epa.gov/EPADataCommons/ORD/Ecoregions/us/us_eco_l4.zip",
              destfile = "ecoregions.zip")
unzip("ecoregions.zip")

ecoregions <- st_read("us_eco_l4_no_st.shp") %>%
  st_make_valid() %>%
  # summarize to level 3 regions (assuming we don't want to go down to level 4)
  group_by(L3_KEY, L2_KEY, L1_KEY) %>%
  summarize()

ecoregions <- ecoregions %>%
  ungroup %>%
  mutate(eco_area_sq_m = st_area(ecoregions),
         eco_area_sq_km = as.numeric(eco_area_sq_m) / 1000000)

ecoregions %>%
  ggplot() +
  geom_sf()
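
# Optional sanity check (not in the original gist): the summed ecoregion area
# should be roughly the area of the conterminous US, on the order of 8
# million square kilometers.
sum(ecoregions$eco_area_sq_km)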
# Get MTBS data -----------------------------------------------------------
download.file("https://edcintl.cr.usgs.gov/downloads/sciweb1/shared/MTBS_Fire/data/composite_data/fod_pt_shapefile/mtbs_fod_pts_data.zip",
              destfile = "mtbs.zip")
unzip("mtbs.zip")

mtbs <- st_read("mtbs_FODpoints_DD.shp") %>%
  st_transform(st_crs(ecoregions)) %>%
  # keep wildfire records (excluding prescribed burns etc.) larger than
  # 1000 acres
  filter(Incid_Type == "Wildfire",
         BurnBndAc > 1e3) %>%
  mutate(year = year(Ig_Date),
         month = month(Ig_Date))

mtbs %>%
  ggplot() +
  geom_sf()
# Find the ecoregions for each MTBS event ---------------------------------
mtbs_eco <- st_intersection(mtbs, ecoregions)

mtbs_eco %>%
  filter(grepl("GREAT PLAINS", L1_KEY)) %>%
  ggplot() +
  geom_sf(aes(color = L2_KEY))
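
# Optional check (not in the original gist): st_intersection silently drops
# points outside every ecoregion polygon and can duplicate points on
# boundaries, so this difference is an approximate count of lost events.
nrow(mtbs) - nrow(mtbs_eco)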
# Generate clean count and size data sets ----------------------------------
nonzero_counts <- mtbs_eco %>%
  as_tibble %>%
  count(year, month, L3_KEY)

ecoregion_keys <- ecoregions %>%
  as_tibble %>%
  select(ends_with("KEY"), eco_area_sq_km)

# start with a small subset of the data to reduce model run times
focal_l1_region <- "GREAT PLAINS"
max_year_train_set <- 2010
all_counts <- expand.grid(year = 1984:2018,
                          month = 1:12,
                          # unique() rather than levels() so this works whether
                          # L3_KEY was read as a factor or as character
                          L3_KEY = unique(ecoregions$L3_KEY)) %>%
  as_tibble %>%
  filter(!(year == 1984 & month == 1)) %>% # Jan 1984 is not in MTBS
  left_join(nonzero_counts) %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  left_join(ecoregion_keys)
train_counts <- all_counts %>%
  filter(
    grepl(focal_l1_region, L1_KEY), # start by focusing on one ecoregion
    year < max_year_train_set       # start with a subset of years
  )

train_sizes <- mtbs_eco %>%
  as_tibble %>%
  filter(
    grepl(focal_l1_region, L1_KEY), # start by focusing on one ecoregion
    year < max_year_train_set       # start with a subset of years
  )

test_sizes <- mtbs_eco %>%
  as_tibble %>%
  filter(grepl(focal_l1_region, L1_KEY)) %>%
  anti_join(train_sizes)
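
# Quick sanity check (not in the original gist): confirm the train/test split
# leaves events in both sets before fitting anything.
c(train_events = nrow(train_sizes), test_events = nrow(test_sizes))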
# Fit a count model --------------------------------------------------------
# The smoother with bs = "cc" is a cyclic spline (essentially an easy way to
# account for seasonality, though in a real model it would be better to
# include climate data instead). The log-area offset makes expected counts
# proportional to ecoregion area, i.e., a model of fires per unit area.
count_model <- brm(n ~ s(month, bs = "cc") + (1 | L2_KEY) + (1 | L3_KEY) +
                     offset(log(eco_area_sq_km)),
                   data = train_counts,
                   family = zero_inflated_negbinomial(),
                   cores = 4)
summary(count_model)
plot(conditional_effects(count_model), ask = FALSE)
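
# Optional posterior predictive check (a sketch, not in the original gist):
# a rootogram compares observed monthly fire counts against draws from the
# fitted model and is a standard diagnostic for count models.
pp_check(count_model, type = "rootogram")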
# Fit a fire size model ----------------------------------------------------
size_model <- brm(BurnBndAc ~ s(month, bs = "cc") + (1 | L2_KEY) + (1 | L3_KEY),
                  data = train_sizes,
                  family = lognormal(),
                  cores = 4)
summary(size_model)
plot(conditional_effects(size_model), ask = FALSE)
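
# Optional posterior predictive check for the size model (a sketch, not in
# the original gist); burned areas span orders of magnitude, so the default
# density overlay is easier to read on the log scale.
pp_check(size_model) + scale_x_log10()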
# Combine outputs from count and size model to infer total burn area --------
test_counts <- anti_join(all_counts, train_counts) %>%
  filter(L3_KEY %in% train_counts$L3_KEY) %>%
  mutate(test_row_index = 1:n())

# generate a set of count predictions for each row in the test data
count_predictions <- predict(count_model,
                             summary = FALSE,
                             newdata = test_counts,
                             allow_new_levels = TRUE) %>%
  reshape2::melt(varnames = c("iter", "test_row_index")) %>%
  as_tibble %>%
  filter(value > 0) # we only need to predict sizes for predicted events
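
# After melting, each row is one posterior draw for one test month x
# ecoregion, with `value` holding the predicted number of fires (not in the
# original gist; just a peek at the structure):
glimpse(count_predictions)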
## For each iteration, simulate total burned area values --------------------
# (note that this could be parallelized to speed things up, but a for loop
# may be easier to debug; a parallel sketch follows the loop below)
max_iter <- 1000
size_predictions <- list()
pb <- txtProgressBar(max = max_iter, style = 3)
for (i in 1:max_iter) {
  size_pred_df <- count_predictions %>%
    filter(iter == i) %>%
    left_join(test_counts, by = "test_row_index") %>%
    # expand to one row per predicted fire event, so a month with n predicted
    # fires gets n simulated sizes (without this, every month would get the
    # size of a single fire regardless of the predicted count)
    uncount(value)
  # simulate size values
  size_matrix <- predict(size_model,
                         newdata = size_pred_df,
                         summary = FALSE,
                         allow_new_levels = TRUE)
  # choose one posterior draw at random
  random_draw <- sample(1:nrow(size_matrix), size = 1)
  # compute a prediction for total burned area
  size_pred_df$predicted_size <- size_matrix[random_draw, ]
  size_predictions[[i]] <- size_pred_df %>%
    select(iter, test_row_index, predicted_size)
  setTxtProgressBar(pb, i)
}
close(pb)
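
# A hedged sketch of the parallel alternative mentioned above: the iterations
# are independent, so future.apply can distribute them across workers. This
# assumes the future and future.apply packages are installed (a hypothetical
# addition, not part of the original gist); run either this or the for loop,
# not both.
if (FALSE) {
  library(future.apply)
  future::plan(future::multisession)
  size_predictions <- future_lapply(seq_len(max_iter), function(i) {
    size_pred_df <- count_predictions %>%
      filter(iter == i) %>%
      left_join(test_counts, by = "test_row_index") %>%
      uncount(value)
    size_matrix <- predict(size_model,
                           newdata = size_pred_df,
                           summary = FALSE,
                           allow_new_levels = TRUE)
    random_draw <- sample(1:nrow(size_matrix), size = 1)
    size_pred_df$predicted_size <- size_matrix[random_draw, ]
    size_pred_df %>%
      select(iter, test_row_index, predicted_size)
  }, future.seed = TRUE)
}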
# Generate a clean dataset of predictions -----------------------------------
posterior_preds <- bind_rows(size_predictions) %>%
  left_join(test_counts) %>%
  group_by(iter, L3_KEY, year, month) %>%
  summarize(n_events_pred = length(predicted_size),
            total_burned_area_pred = sum(predicted_size),
            maximum_fire_size_pred = max(predicted_size)) %>%
  ungroup %>%
  # fill implicit zeros (occurs when no events are predicted)
  complete(iter, L3_KEY, year, month,
           fill = list(
             n_events_pred = 0,
             total_burned_area_pred = 0,
             maximum_fire_size_pred = 0
           )) %>%
  filter(!(year == 1984 & month == 1), # Jan 1984 is not in MTBS
         L3_KEY %in% train_counts$L3_KEY) %>%
  mutate(first_day_of_month = as.Date(paste(year,
                                            sprintf("%02d", month),
                                            "01",
                                            sep = "-")))
# Visualize predictions ------------------------------------------------------
# show one line per posterior iteration
posterior_preds %>%
  ggplot(aes(first_day_of_month, total_burned_area_pred)) +
  geom_line(aes(group = iter), alpha = .2) +
  facet_wrap(~L3_KEY, ncol = 2) +
  xlab("Date") +
  ylab("Total burned area (acres)")
# show the posterior median & 95% credible interval + actual withheld data
test_summary <- test_sizes %>%
  mutate(first_day_of_month = as.Date(paste(year,
                                            sprintf("%02d", month),
                                            "01",
                                            sep = "-"))) %>%
  group_by(L3_KEY, first_day_of_month) %>%
  summarize(total_burned_area = sum(BurnBndAc)) %>%
  ungroup
# predictions shown as ribbons, withheld data shown as points
posterior_preds %>%
  group_by(first_day_of_month, L3_KEY) %>%
  summarize(
    med = median(total_burned_area_pred),
    lo = quantile(total_burned_area_pred, .025),
    hi = quantile(total_burned_area_pred, .975)
  ) %>%
  ungroup %>%
  ggplot(aes(first_day_of_month)) +
  geom_ribbon(aes(ymin = lo, ymax = hi),
              color = NA, alpha = .4, fill = "darkred") +
  geom_line(aes(y = med)) +
  facet_wrap(~L3_KEY, ncol = 2) +
  xlab("Date") +
  ylab("Total burned area (acres)") +
  geom_point(data = test_summary,
             aes(y = total_burned_area)) +
  scale_y_log10()
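
# Optional follow-up (a sketch, not in the original gist): estimate the
# empirical coverage of the 95% intervals on the withheld months. Withheld
# months with no observed fires are treated as zero burned area.
posterior_preds %>%
  group_by(L3_KEY, first_day_of_month) %>%
  summarize(lo = quantile(total_burned_area_pred, .025),
            hi = quantile(total_burned_area_pred, .975)) %>%
  ungroup %>%
  left_join(test_summary) %>%
  mutate(total_burned_area = ifelse(is.na(total_burned_area),
                                    0, total_burned_area)) %>%
  summarize(coverage_95 = mean(total_burned_area >= lo &
                                 total_burned_area <= hi))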