MarinaGolivets · July 13, 2022 06:37
diff --git a/GP_Oekologie_2022.R b/GP_Oekologie_2022.R
 # GP Oekologie 2022
 # Ingolf Kühn & Marina Golivets


 # install & load R packages ------------------------------------------------------------------------

 # Start from an empty workspace so that objects left over from earlier work
 # cannot leak into the analysis.
 # NOTE(review): this wipes the user's global environment; restarting the R
 # session before running the script is the cleaner alternative.
 rm(list = ls())

 # packages that we need for analyses
 pkgs <- c(
  "here", # for handling working directory
  "googlesheets4", # for loading data from Google spreadsheets
  "tidyverse", # for data manipulation
  "magrittr", # for piping
  "skimr", # for summarizing data
  "gtsummary", # for summarizing data
  "taxize", # for checking taxonomic names
  "TR8", # for retrieving trait data from databases
  "performance", # for inspecting regression models
  "rstatix", # for statistical analyses, e.g. ANOVA
  "DescTools", # for miscellaneous basic stats
  "ggpubr", # for plotting
  "ggrepel", # for plotting
  "viridis", # for plotting
  "cowplot", # for plotting
  "vegan", # for multivariate analyses
  "FactoMineR", # for PCA
  "ks" # for kernel smoothers
 )

 # install (if not installed) and load all the packages

 # # run only once!
 # install.packages("devtools")
 # devtools::install_github("ddsjoberg/gtsummary")
 # devtools::install_github("ropensci/skimr")

 # requireNamespace() only tests whether a package is installed (it does not
 # attach it); library() then attaches the package and stops with an error
 # if the installation did not succeed
 for (p in pkgs) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
  library(p, character.only = TRUE)
 }

 # print warnings as soon as they occur (they are not turned into errors)
 options(warn = 1)

 # check what packages are being used on your machine
 sessionInfo()

 # check your working (home) directory
 here()

 # load data ----------------------------------------------------------------------------------------

 # Use the local copy of the data if it exists, otherwise download it once.
 # file.exists() tests for exactly this file; list.files() would interpret the
 # file name as a regular expression and also match similarly named files
 # (e.g. backups), in which case the local read below would fail.
 if (!file.exists(here("veg_data_2022.csv"))) {
  # load the data directly from the Google sheet
  # (19 columns, A to S, with one col_types letter per column)
  veg <- googlesheets4::read_sheet(
    "https://docs.google.com/spreadsheets/d/1X9DM-uaaQ9MP5KnSNUTonx67BygXYH_7xd1G_vXOIqE",
    sheet = "Daten", range = "A1:S142", na = c(NULL, "-", ""), col_types = "iicccidddddddcdccdc"
  )
  # save the data to your home directory
  readr::write_csv2(veg, here("veg_data_2022.csv"))
 } else {
  # load the data from your home directory
  veg <- readr::read_csv2(here("veg_data_2022.csv"))
 }

 # take a quick look at the data
 glimpse(veg)

 # rename columns -----------------------------------------------------------------------------------

 # give the columns short names without spaces or special characters
 # (syntax: new name = old name)
 veg <- rename(
  veg,
  Fortlaufendenr = `...1`,
  Aufnahmenr = Aufnahmenr.,
  WissName = `Wissenschaftlicher Name`,
  Hoehe.avg = `Höhe Ø (cm)`,
  Hoehe.var = `Höhe σ²`,
  Blattflaeche = `Blatt-Fläche (mm²)`,
  Frischmasse = `Frischmasse (g)`,
  Trockenmasse = `Trockenmasse (mg)`,
  Germinulenmasse = `Germinulenmasse (mg)`,
  Feuchte = `Ellenberg-Feuchte`,
  Bodenfeuchte = `Bodenfeuchte (%vol)`
 )

 # summarise every column
 skimr::skim(veg)
 # summary table of all columns except the identifiers and the free-text notes
 veg %>%
  select(-Fortlaufendenr, -Aufnahmenr, -WissName, -Anmerkungen) %>%
  gtsummary::tbl_summary()

 # check data for errors ----------------------------------------------------------------------------

 # overwrite the expert-assessed water availability (WasserHH) of plots 1-8
 veg <- rows_update(veg, tibble(Aufnahmenr = 1:8, WasserHH = c(1, 2, 3, 5, 6, 7, 8, 4)))

 # plot-level columns must be constant within a plot:
 # the number of distinct combinations should equal the number of plots (n = 8)
 veg %>%
  select(Aufnahmenr, Ort, Vegetation, WasserHH, Bodenfeuchte) %>%
  n_distinct()

 # every species may occur only once per plot:
 # the number of distinct plot x species pairs should equal the number of rows
 nrow(veg) # 141
 veg %>%
  select(Aufnahmenr, WissName) %>%
  n_distinct() # should equal the number of rows (n = 141)

 # species-level data (from databases) must be identical wherever a species occurs
 veg %>%
  select(WissName) %>%
  n_distinct() # 79
 veg %>%
  select(WissName, Lebensform, Germinulenmasse, Bestaeubung, Feuchte) %>%
  n_distinct() # should equal the number of species (n = 79)

 # life form: tabulate the categories and list the species coded as "G"
 table(veg$Lebensform)
 veg %>%
  filter(Lebensform == "G") %>%
  pull(WissName) %>%
  unique()
 # recode Phalaris arundinacea as "mL"; all other species keep their life form
 veg <- mutate(
  veg,
  Lebensform = case_when(
    WissName == "Phalaris arundinacea" ~ "mL",
    TRUE ~ Lebensform
  )
 )

 # distribution of % cover per species
 hist(veg$Deckung)

 # total % cover per plot
 veg %>%
  group_by(Aufnahmenr) %>%
  summarise(Deckung.tot = sum(Deckung))

 # vegetation types recorded at each site
 table(veg$Ort, veg$Vegetation)

 # Ellenberg humidity indicator values: inspect the raw entries and what is
 # left of them after a direct conversion to integer
 unique(veg$Feuchte)
 unique(as.integer(veg$Feuchte))
 # strip the non-numeric symbols (~, x, =) and then convert to integer
 veg <- mutate(
  veg,
  Feuchte = as.integer(stringr::str_remove_all(Feuchte, pattern = "~|x|="))
 )
 # record no. 41 gets no indicator value
 veg[veg$Fortlaufendenr == 41, "Feuchte"] <- NA

 # taxon names: resolve them against an external name source and inspect the matches
 tax <- taxize::gnr_resolve(veg$WissName, data_source_id = 1, fields = "all", best_match_only = TRUE)
 View(tax)
 # correct the misspelled names
 veg <- mutate(
  veg,
  WissName = recode(
    WissName,
    `Taraxacum officinalis` = "Taraxacum officinale",
    `Viola arvense` = "Viola arvensis"
  )
 )
 # record no. 41 is assigned to Agrostis capillaris
 veg[veg$Fortlaufendenr == 41, "WissName"] <- "Agrostis capillaris"

 # check leaf area; a leaf area of 0 is treated as a missing value
 summary(veg$Blattflaeche)
 veg %<>% mutate(Blattflaeche = na_if(Blattflaeche, 0))

 # check seed mass
 summary(veg$Germinulenmasse)
 # list the species with a seed mass of 0.001;
 # near() compares with a numerical tolerance, because testing floating-point
 # numbers with `==` can miss values that differ only by rounding error
 veg %>%
  filter(near(Germinulenmasse, .001)) %>%
  pull(WissName) %>%
  unique()
 # look up seed mass for two species in the trait databases accessible via TR8
 View(available_tr8)
 gmasse_tr8 <- TR8::tr8(
  species_list = c("Agrostis stolonifera", "Cerastium semidecandrum"),
  download_list = c("seed_wght", "SeedMass", "seed_mass"),
  allow_persistent = TRUE
 )
 gmasse_tr8
 # replace the seed mass of the two species and of record no. 41;
 # all other records keep their value
 veg %<>%
  mutate(Germinulenmasse = case_when(
    WissName == "Cerastium semidecandrum" ~ .04,
    WissName == "Agrostis stolonifera" ~ .07,
    Fortlaufendenr == 41 ~ .1,
    TRUE ~ Germinulenmasse
  ))

 # take a look at the data again
 skim(veg)

 # impute missing trait data ------------------------------------------------------------------------

 # life form (data from FloraWeb): list the species without a value, then fill them in
 veg %>%
  distinct(WissName, Lebensform) %>%
  filter(is.na(Lebensform))
 veg <- mutate(
  veg,
  Lebensform = case_when(
    WissName %in% c("Cerastium glomeratum", "Erophila verna") ~ "mL",
    TRUE ~ Lebensform
  )
 )

 # pollination type (data from FloraWeb): same procedure
 veg %>%
  distinct(WissName, Bestaeubung) %>%
  filter(is.na(Bestaeubung))
 veg <- mutate(
  veg,
  Bestaeubung = case_when(
    WissName %in% c(
      "Cerastium glomeratum", "Erophila verna", "Valerianella spec."
    ) ~ "se",
    TRUE ~ Bestaeubung
  )
 )

 # seed mass (data from Seed Information Database): same procedure
 veg %>%
  distinct(WissName, Germinulenmasse) %>%
  filter(is.na(Germinulenmasse))
 veg <- mutate(
  veg,
  Germinulenmasse = case_when(
    WissName == "Allium vineale" ~ 7,
    WissName == "Carex ovalis" ~ .52,
    WissName == "Cerastium glomeratum" ~ .05,
    WissName == "Cerastium glutinosum" ~ .0654,
    WissName == "Cruciata laevipes" ~ 3.5856,
    WissName == "Erodium cicutarium" ~ 2,
    WissName == "Erophila verna" ~ .024,
    WissName == "Senecio erucifolius" ~ .3935,
    WissName == "Saxifraga granulata" ~ .03,
    WissName == "Prunus mahaleb" ~ 79.9,
    WissName == "Polygonum persicaria" ~ 2.1,
    TRUE ~ Germinulenmasse
  )
 )
 summary(veg$Germinulenmasse)

 # store life form and pollination type as factors
 veg <- mutate(veg, across(c(Lebensform, Bestaeubung), as.factor))

 # calculate SLA & LDMC -----------------------------------------------------------------------------

 # first pass: SLA = leaf area / dry mass, LDMC = dry mass / fresh mass
 veg <- mutate(
  veg,
  SLA = Blattflaeche / Trockenmasse,
  LDMC = Trockenmasse / Frischmasse
 )

 # inspect the distributions to spot implausible values
 hist(veg$SLA)
 hist(veg$LDMC)
 # show the records with LDMC above 500
 veg %>%
  filter(LDMC > 500) %>%
  select(Fortlaufendenr, WissName, Deckung, Blattflaeche, Frischmasse, Trockenmasse, LDMC)

 # correct the raw masses of two records and recalculate SLA & LDMC;
 # mutate() evaluates its arguments in order, so the ratios below already use
 # the corrected masses
 veg <- mutate(
  veg,
  Frischmasse = if_else(Fortlaufendenr == 127, .2063, Frischmasse),
  Trockenmasse = if_else(Fortlaufendenr == 79, 85, Trockenmasse), # it's a guess!
  SLA = Blattflaeche / Trockenmasse,
  LDMC = Trockenmasse / Frischmasse
 )

 # analyse species numbers --------------------------------------------------------------------------

 # number of records (= species) per plot
 artz1 <- table(veg[["Aufnahmenr"]])
 artz1

 # number of records per water availability level
 artz2 <- table(veg[["WasserHH"]])
 artz2
 # chi-squared test of the observed counts against equal expected counts
 chisq.test(artz2)
 chisq.test(artz2)$expected
 barplot(
  artz2,
  ylab = "Artenzahl",
  xlab = "geschätzte Wasserverfügbarkeit (ordinal)"
 )

 # analyse water availability -----------------------------------------------------------------------

 # check the correspondence between the Ellenberg values and estimated water availability
 boxplot(veg$Feuchte ~ veg$WasserHH)

 # calculate the mean plot-level Ellenberg humidity indicator value;
 # .groups = "drop" returns an ungrouped table, so that later operations on it
 # are not silently carried out per group
 WasserHH.Feuchte <- group_by(veg, Aufnahmenr, WasserHH, Bodenfeuchte) %>%
  summarise(
    # community mean
    Feuchte_CM = mean(Feuchte, na.rm = TRUE) %>%
      round(., 1),
    # community weighted mean (weighted by % cover)
    Feuchte_CWM = weighted.mean(x = Feuchte, w = Deckung, na.rm = TRUE) %>%
      round(., 1),
    .groups = "drop"
  )

 # plot the plot-level Ellenberg humidity (red = CWM, green = CM)
 # left panel: against the estimated water availability
 op <- par(mfcol = c(1, 2), mar = c(4, 4, 2, 2))
 plot(Feuchte_CWM ~ WasserHH,
  data = WasserHH.Feuchte, col = "red",
  pch = 20, ylab = "mittlere Feuchte nach Ellenberg"
 )
 points(WasserHH.Feuchte$WasserHH, WasserHH.Feuchte$Feuchte_CM, col = "green", pch = 20)
 legend(2, 7, c("CWM", "CM"), col = c("red", "green"), pch = 20)

 # right panel: against the measured soil moisture
 plot(Feuchte_CWM ~ Bodenfeuchte, data = WasserHH.Feuchte, col = "red", pch = 20, ylab = "")
 points(WasserHH.Feuchte$Bodenfeuchte, WasserHH.Feuchte$Feuchte_CM, col = "green", pch = 20)
 legend(2, 7, c("CWM", "CM"), col = c("red", "green"), pch = 20)
 par(op)

 # calculate correlation between the water availability and plot-level Ellenberg humidity
 cor.test(WasserHH.Feuchte$WasserHH, WasserHH.Feuchte$Feuchte_CM, method = "kendall", exact = FALSE)
 cor.test(WasserHH.Feuchte$WasserHH, WasserHH.Feuchte$Feuchte_CWM, method = "kendall", exact = FALSE)
 cor.test(WasserHH.Feuchte$Bodenfeuchte, WasserHH.Feuchte$Feuchte_CWM,
  method = "kendall", exact = FALSE
 )

 # add the calculated mean Ellenberg humidity to the main data table;
 # the join keys are spelled out so that the match does not depend on which
 # column names the two tables happen to share
 veg %<>% left_join(WasserHH.Feuchte, by = c("Aufnahmenr", "WasserHH", "Bodenfeuchte"))
 glimpse(veg)

 # analyse LDMC -------------------------------------------------------------------------------------

 # LDMC = leaf dry matter content

 # inspect the distribution of LDMC visually
 hist(veg$LDMC)

 # plot LDMC across water availability levels (log-scaled y axis);
 # arguments are spelled out in full (`data`, not `dat`) instead of relying on
 # partial argument matching
 boxplot(
  LDMC ~ WasserHH,
  data = veg, ylab = "LDMC", xlab = "geschaetzte Wasserverfuegbarkeit (ordinal)",
  col = rgb(r = 0, g = 0, b = 1, alpha = seq(.1, .9, length.out = 7)), log = "y"
 )
 # `x.axis` is not an argument of boxplot() nor a graphical parameter, so it was dropped
 boxplot(LDMC ~ Feuchte_CWM,
  data = veg, ylab = "LDMC", xlab = "mittlere Feuchte nach Ellenberg",
  col = rgb(r = 0, g = 0, b = 1, alpha = seq(.1, .9, length.out = 7)), log = "y"
 )
 ggplot(veg, aes(x = Bodenfeuchte, y = LDMC)) +
  geom_point(na.rm = TRUE) +
  stat_smooth(method = "lm", col = "#E69F00", se = FALSE, na.rm = TRUE) +
  labs(x = "Bodenfeuchte [%]") +
  theme_minimal()

 # perform regression analyses
 # using WasserHH as explanatory variable
 fm0.LDMC <- lm(LDMC ~ WasserHH, data = veg)
 summary(fm0.LDMC)
 performance::check_model(fm0.LDMC)

 # the same model weighted by % cover (the argument of lm() is `weights`)
 fm1.LDMC <- lm(LDMC ~ WasserHH, weights = Deckung, data = veg)
 summary(fm1.LDMC)
 check_model(fm1.LDMC)

 # compare the two models side by side
 tbl_merge(
  tbls = list(tbl_regression(fm0.LDMC), tbl_regression(fm1.LDMC)),
  tab_spanner = c("**Non-weighted**", "**Weighted**")
 )

 # using soil moisture (Bodenfeuchte) as explanatory variable
 fm0.LDMC.b <- lm(LDMC ~ Bodenfeuchte, data = veg)
 summary(fm0.LDMC.b)
 check_model(fm0.LDMC.b)

 fm1.LDMC.b <- lm(LDMC ~ Bodenfeuchte, weights = Deckung, data = veg)
 summary(fm1.LDMC.b)
 check_model(fm1.LDMC.b)

 # compare the two models side by side
 tbl_merge(
  tbls = list(tbl_regression(fm0.LDMC.b), tbl_regression(fm1.LDMC.b)),
  tab_spanner = c("**Non-weighted**", "**Weighted**")
 )

 # scatter plot with both fitted lines (orange = non-weighted, blue = weighted)
 plot1 <- ggplot(veg, aes(x = Bodenfeuchte, y = LDMC)) +
  geom_point(na.rm = TRUE) +
  geom_abline(
    intercept = fm0.LDMC.b$coefficients[1], slope = fm0.LDMC.b$coefficients[2],
    col = "#E69F00", size = 1, na.rm = TRUE
  ) +
  geom_abline(
    intercept = fm1.LDMC.b$coefficients[1], slope = fm1.LDMC.b$coefficients[2],
    col = "#56B4E9", size = 1, na.rm = TRUE
  ) +
  labs(x = "Bodenfeuchte [%]") +
  theme_minimal() +
  annotate("text", x = 20, y = 270, label = "Non-weighted", col = "#E69F00") +
  annotate("text", x = 25, y = 320, label = "Weighted", col = "#56B4E9")
 plot1

 # analyse SLA --------------------------------------------------------------------------------------

 # SLA = specific leaf area

 # inspect the distribution of SLA visually (raw and log-transformed)
 hist(veg$SLA)
 hist(log(veg$SLA))

 # test the normality of distribution
 shapiro_test(veg$SLA)
 shapiro_test(log(veg$SLA))

 # add log-transformed values to the main data table
 veg %<>% mutate(SLA.log = log(SLA))

 # plot the distribution of SLA across water availability levels;
 # arguments are spelled out in full (`data`, not `dat`) instead of relying on
 # partial argument matching
 op <- par(mar = c(4, 5, 2, 3))
 boxplot(SLA ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = "SLA", col = "blue", log = "y"
 )
 # look up the records behind conspicuous values
 filter(veg, WasserHH == 2 & SLA > 25) %>%
  select(WissName, Aufnahmenr, SLA)
 filter(veg, WissName == "Euphorbia cyparissias") %>%
  select(WissName, Aufnahmenr, SLA)

 boxplot(SLA.log ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = expression(log("SLA")), col = "blue"
 )
 par(op)
 filter(veg, WasserHH == 7 & SLA.log < 3) %>%
  select(WissName, Aufnahmenr, SLA)

 # perform regression analyses, weighted by % cover
 # (the argument of lm() is `weights`)
 fm1.SLA <- lm(SLA ~ WasserHH, weights = Deckung, data = veg)
 summary(fm1.SLA)
 check_model(fm1.SLA)

 fm1.SLA.log <- lm(SLA.log ~ WasserHH, weights = Deckung, data = veg)
 summary(fm1.SLA.log)
 check_model(fm1.SLA.log)

 fm1.SLA.log.b <- lm(SLA.log ~ Bodenfeuchte, weights = Deckung, data = veg)
 summary(fm1.SLA.log.b)
 check_model(fm1.SLA.log.b)

 # plot the regression line
 plot2 <- ggplot(veg, aes(x = Bodenfeuchte, y = SLA.log)) +
  geom_point(na.rm = TRUE) +
  geom_abline(
    intercept = fm1.SLA.log.b$coefficients[1], slope = fm1.SLA.log.b$coefficients[2],
    col = "#56B4E9", size = 1, na.rm = TRUE
  ) +
  labs(x = "Bodenfeuchte [%]", y = expression(log("SLA"))) +
  theme_minimal()
 plot2

 # look at the relationship between SLA and LDMC;
 # cor.test() has no `use` argument (it drops incomplete pairs on its own),
 # so the argument was removed
 plot(SLA ~ LDMC, data = veg)
 cor.test(veg$SLA, veg$LDMC)

 # analyse height -----------------------------------------------------------------------------------

 # inspect distributions visually (raw and log-transformed)
 hist(veg$Hoehe.avg)
 hist(log(veg$Hoehe.avg))

 # test the normality of distribution
 shapiro_test(veg$Hoehe.avg)
 shapiro_test(log(veg$Hoehe.avg))

 # add log-transformed values to the main data table
 veg %<>% mutate(Hoehe.avg.log = log(Hoehe.avg))

 # plot the distribution of height across water availability levels
 op <- par(mar = c(4, 5, 2, 3))
 boxplot(Hoehe.avg ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = "Wuchshöhe [cm]", col = rainbow(7, start = 1, end = 5 / 7),
  log = "y"
 )

 # look up the records behind conspicuous values
 filter(veg, WasserHH == 6 & Hoehe.avg > 50) %>%
  select(WissName, Aufnahmenr, Hoehe.avg)
 filter(veg, WasserHH == 8 & Hoehe.avg < 20) %>%
  select(WissName, Aufnahmenr, Hoehe.avg)

 boxplot(Hoehe.avg.log ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = expression(log("Wuchshöhe [cm]")), col = rainbow(7, start = 1, end = 5 / 7)
 )
 par(op)

 # perform regression analysis, weighted by % cover
 # (the argument of lm() is `weights`; it is spelled out instead of relying on
 # partial argument matching)
 fm1.hoehe.avg.log <- lm(Hoehe.avg.log ~ WasserHH, weights = Deckung, data = veg)
 summary(fm1.hoehe.avg.log)
 check_model(fm1.hoehe.avg.log)

 fm1.hoehe.avg.log.b <- lm(Hoehe.avg.log ~ Bodenfeuchte, weights = Deckung, data = veg)
 summary(fm1.hoehe.avg.log.b)
 check_model(fm1.hoehe.avg.log.b)

 # plot the regression line (blue) together with a smoother (red)
 plot3 <- ggplot(veg, aes(x = Bodenfeuchte, y = Hoehe.avg.log)) +
  geom_point(na.rm = TRUE) +
  geom_abline(
    intercept = fm1.hoehe.avg.log.b$coefficients[1],
    slope = fm1.hoehe.avg.log.b$coefficients[2],
    col = "#56B4E9", size = 1, na.rm = TRUE
  ) +
  geom_smooth(col = "red", se = FALSE, na.rm = TRUE) +
  labs(x = "Bodenfeuchte [%]", y = expression(log("Wuchshöhe [cm]"))) +
  theme_minimal()
 plot3

 # analyse height variance --------------------------------------------------------------------------

 # plot height variance (raw and log-transformed)
 hist(veg$Hoehe.var)
 hist(log(veg$Hoehe.var))

 # add log-transformed values to the main data table
 veg %<>% mutate(Hoehe.var.log = log(Hoehe.var))

 # check the relationship between height and height variance
 boxplot(Hoehe.var.log ~ WasserHH, data = veg)
 plot(Hoehe.var.log ~ Hoehe.avg, data = veg)
 cor.test(veg$Hoehe.var.log, veg$Hoehe.avg)

 # calculate the coefficient of variation in percent:
 # standard deviation (square root of the variance) relative to the mean
 veg %<>% mutate(Hoehe.cv = sqrt(Hoehe.var) / Hoehe.avg * 100)

 hist(veg$Hoehe.cv)
 hist(log(veg$Hoehe.cv))

 veg %<>% mutate(Hoehe.cv.log = log(Hoehe.cv))

 plot(Hoehe.cv.log ~ Hoehe.avg, data = veg)
 cor.test(veg$Hoehe.cv.log, veg$Hoehe.avg)

 boxplot(Hoehe.cv ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = "Variationskoeffizient der Wuchshöhe", log = "y", col = topo.colors(7)[7:1]
 )
 # look up the records behind conspicuous values
 filter(veg, WasserHH == 4 & Hoehe.cv > 100) %>%
  select(WissName, Aufnahmenr, Hoehe.avg, Hoehe.var, Hoehe.cv)

 # perform regression analysis, weighted by % cover
 # (the argument of lm() is `weights`; it is spelled out instead of relying on
 # partial argument matching)
 fm1.Hoehe.cv.log <- lm(Hoehe.cv.log ~ WasserHH, data = veg, weights = Deckung)
 summary(fm1.Hoehe.cv.log)
 check_model(fm1.Hoehe.cv.log)

 fm1.Hoehe.cv.log.b <- lm(Hoehe.cv.log ~ Bodenfeuchte, data = veg, weights = Deckung)
 summary(fm1.Hoehe.cv.log.b)
 check_model(fm1.Hoehe.cv.log.b)

 # analyse seed mass --------------------------------------------------------------------------------

 # inspect the distribution of seed mass (raw and log-transformed)
 hist(veg$Germinulenmasse)
 hist(log(veg$Germinulenmasse))

 # add log-transformed values to the main data table
 veg %<>% mutate(Germinulenmasse.log = log(Germinulenmasse))

 # plot seed mass across water availability levels
 op <- par(mar = c(4, 5, 2, 3))
 boxplot(Germinulenmasse ~ WasserHH, data = veg, log = "y")
 boxplot(Germinulenmasse.log ~ WasserHH, data = veg, ylab = expression(log("Germinulenmasse")))
 # look up the records with the largest seeds
 filter(veg, Germinulenmasse.log > 4) %>%
  select(WissName, Germinulenmasse)
 par(op)

 # perform regression analysis, weighted by % cover
 # (the argument of lm() is `weights`; it is spelled out instead of relying on
 # partial argument matching)
 fm1.gmasse.log <- lm(Germinulenmasse.log ~ WasserHH, data = veg, weights = Deckung)
 summary(fm1.gmasse.log)
 check_model(fm1.gmasse.log)

 fm1.gmasse.log.b <- lm(Germinulenmasse.log ~ Bodenfeuchte, data = veg, weights = Deckung)
 summary(fm1.gmasse.log.b)
 check_model(fm1.gmasse.log.b)

 # analyse life form --------------------------------------------------------------------------------

 # compare the distributions of life forms across vegetation types
 select(veg, Lebensform, Vegetation) %>%
  gtsummary::tbl_summary(by = Vegetation) %>%
  add_p()

 # calculate total % cover per life form across humidity levels
 lft.1 <- group_by(veg, Aufnahmenr, WasserHH, Lebensform) %>%
  summarise(Deckung = sum(Deckung))
 head(lft.1)
 tail(lft.1)

 # cross-tabulate cover by life form (rows) and water availability (columns)
 # and convert the table to a plain matrix; the matrix is stored as `lft.3`,
 # the name used by the test and the plots below
 lft.2 <- xtabs(Deckung ~ Lebensform + WasserHH, data = lft.1)
 lft.3 <- DescTools::as.matrix.xtabs(lft.2)
 lft.3
 chisq_test(lft.3, simulate.p.value = TRUE)

 # plot the distribution of life forms across humidity levels
 # using base R
 op <- par(mar = c(4, 5, 2, 3))
 barplot(lft.3,
  xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = expression(sum("Deckung")),
  main = "Lebensformenspektren", ylim = c(0, 160)
 )
 # barplot() colours the stacked segments with one grey per matrix ROW (life
 # form), so the legend needs the same number of greys
 legend(8.5, 150, legend = row.names(lft.3), pch = 15, col = grey.colors(nrow(lft.3)))
 par(op)

 # using ggplot2
 ggplot(data = lft.1, aes(fill = Lebensform, y = Deckung, x = ordered(WasserHH))) +
  geom_bar(position = "stack", stat = "identity") +
  viridis::scale_fill_viridis(discrete = TRUE) +
  labs(x = "geschätzte Wasserverfügbarkeit (ordinal)", y = expression(sum("Deckung"))) +
  ggtitle("Lebensformenspektren") +
  theme_minimal()

 # analyse pollination type -------------------------------------------------------------------------

 # compare the distributions of pollination syndromes across vegetation types
 select(veg, Bestaeubung, Vegetation) %>%
  gtsummary::tbl_summary(by = Vegetation) %>%
  add_p()

 # calculate total % cover per pollination type across humidity levels
 bst.1 <- group_by(veg, Aufnahmenr, WasserHH, Bestaeubung) %>%
  summarise(Deckung = sum(Deckung))
 head(bst.1)
 tail(bst.1)

 # cross-tabulate cover by pollination type (rows) and water availability
 # (columns) and convert the table to a plain matrix; the matrix is stored as
 # `bst.3`, the name used by the test and the plot below
 bst.2 <- xtabs(Deckung ~ Bestaeubung + WasserHH, data = bst.1)
 bst.3 <- DescTools::as.matrix.xtabs(bst.2)
 bst.3
 chisq_test(bst.3, simulate.p.value = TRUE)

 # plot the distribution of pollination types
 # using base R
 op <- par(mar = c(4, 5, 2, 3))
 cols <- c("yellow", "brown", "grey", "lightblue")
 barplot(bst.3,
  xlab = "geschätzte Wasserverfügbarkeit (ordinal)", ylab = expression(sum("Deckung")),
  main = "Bestäubungstypenspektren",
  sub = expression(Chi^2 * "= 243, p < 0.001"), cex.sub = 0.8, col = cols
 )
 # the legend lists the types in reverse order, hence the reversed colours
 legend(0, 140,
  legend = c("Wind", "Selbst", "multiple", "Insekten"),
  pch = 15, col = cols[4:1], cex = 0.7
 )
 # restore the graphical parameters changed above
 par(op)

 # using ggplot2
 plot4 <- ggplot(data = bst.1, aes(fill = Bestaeubung, y = Deckung, x = ordered(WasserHH))) +
  geom_bar(position = "stack", stat = "identity") +
  viridis::scale_fill_viridis(
    discrete = TRUE, labels = c("Insekten", "multiple", "Selbst", "Wind")
  ) +
  labs(x = "geschätzte Wasserverfügbarkeit (ordinal)", y = expression(sum("Deckung"))) +
  ggtitle("Bestäubungstypenspektren") +
  theme_minimal()
 plot4

 # analyse the relationships between traits ---------------------------------------------------------

 # does (log) height differ between life forms?
 # box plots for the life forms H, G, T and mL
 ggpubr::ggboxplot(
  veg,
  x = "Lebensform", y = "Hoehe.avg.log",
  select = c("H", "G", "T", "mL")
 )

 # ANOVA assumptions
 # 1) outliers within each life form
 veg %>%
  group_by(Lebensform) %>%
  identify_outliers(Hoehe.avg.log)
 # 2) normality of the model residuals
 lm.hoe.lf <- lm(Hoehe.avg.log ~ Lebensform, data = veg)
 ggpubr::ggqqplot(residuals(lm.hoe.lf))
 rstatix::shapiro_test(residuals(lm.hoe.lf))
 # 3) homogeneity of variances
 veg %>% levene_test(Hoehe.avg.log ~ Lebensform)

 # standard ANOVA ...
 veg %>% anova_test(Hoehe.avg.log ~ Lebensform)
 # ... and its non-parametric alternative, the Kruskal-Wallis test, which
 # should be used here because the normality of residuals assumption was violated
 veg %>% kruskal_test(Hoehe.avg.log ~ Lebensform)


 # test if the height varies across pollination types
 veg$Bestaeubung <- as.factor(veg$Bestaeubung)
 ggboxplot(veg, x = "Bestaeubung", y = "Hoehe.avg.log")
 # check the ANOVA assumptions: outliers, normality of residuals,
 # homogeneity of variances
 group_by(veg, Bestaeubung) %>%
  identify_outliers(Hoehe.avg.log) %>%
  select(Fortlaufendenr, WissName, Hoehe.avg, Hoehe.avg.log, is.outlier, is.extreme)
 lm.hoe.best <- lm(Hoehe.avg.log ~ Bestaeubung, data = veg)
 ggqqplot(residuals(lm.hoe.best))
 shapiro_test(residuals(lm.hoe.best))
 levene_test(veg, Hoehe.avg.log ~ Bestaeubung)

 # because the normality of residuals assumption was violated
 # we use the Kruskal Wallis test instead of a standard ANOVA
 # and the Dunn's post hoc test for pairwise comparisons
 hoe.best.kw <- kruskal_test(veg, Hoehe.avg.log ~ Bestaeubung)
 hoe.best.kw
 hoe.best.pwc <- dunn_test(veg, Hoehe.avg.log ~ Bestaeubung)
 hoe.best.pwc

 # plot the results;
 # the box plot must show the tested variable (log height): the test results,
 # the positions of the significance brackets and the axis label all refer to
 # height, not to seed mass
 hoe.best.pwc %<>% add_xy_position(x = "Bestaeubung")
 plot6 <- ggboxplot(veg, x = "Bestaeubung", y = "Hoehe.avg.log") +
  stat_pvalue_manual(hoe.best.pwc, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(hoe.best.kw, detailed = TRUE),
    caption = get_pwc_label(hoe.best.pwc),
    y = expression(log("Wuchshöhe")),
    x = "Bestäubungstyp"
  )
 plot6

 # does (log) SLA differ between life forms?
 # box plots for the life forms H, G, T and mL
 ggboxplot(
  veg,
  x = "Lebensform", y = "SLA.log",
  select = c("H", "G", "T", "mL")
 )
 # assumption checks: outliers, normality of residuals, homogeneity of variances
 veg %>%
  group_by(Lebensform) %>%
  identify_outliers(SLA.log)
 lm.sla.lf <- lm(SLA.log ~ Lebensform, data = veg)
 ggqqplot(residuals(lm.sla.lf))
 shapiro_test(residuals(lm.sla.lf))
 veg %>% levene_test(SLA.log ~ Lebensform)
 # one-way ANOVA
 veg %>% anova_test(SLA.log ~ Lebensform)


 # does (log) seed mass differ between life forms?
 # seed mass is a species-level trait, so work with one row per distinct
 # combination of species and trait values
 gmasse <- distinct(veg, WissName, Germinulenmasse.log, Lebensform, Bestaeubung)
 ggboxplot(gmasse, x = "Lebensform", y = "Germinulenmasse.log")
 # assumption checks: outliers, normality of residuals, homogeneity of variances
 gmasse %>%
  group_by(Lebensform) %>%
  identify_outliers(Germinulenmasse.log)
 lm.gmasse.lf <- lm(Germinulenmasse.log ~ Lebensform, data = gmasse)
 ggqqplot(residuals(lm.gmasse.lf))
 shapiro_test(residuals(lm.gmasse.lf))
 gmasse %>% levene_test(Germinulenmasse.log ~ Lebensform)
 # one-way ANOVA followed by Tukey's HSD pairwise comparisons
 gmasse.lf.aov <- gmasse %>% anova_test(Germinulenmasse.log ~ Lebensform)
 gmasse.lf.aov
 gmasse.lf.pwc <- gmasse %>% tukey_hsd(Germinulenmasse.log ~ Lebensform)
 gmasse.lf.pwc

 # box plot annotated with the significant pairwise comparisons
 gmasse.lf.pwc <- add_xy_position(gmasse.lf.pwc, x = "Lebensform")
 plot7 <- ggboxplot(gmasse, x = "Lebensform", y = "Germinulenmasse.log") +
  stat_pvalue_manual(gmasse.lf.pwc, hide.ns = TRUE) +
  labs(
    x = "Lebensform",
    y = expression(log("Germinulenmasse")),
    subtitle = get_test_label(gmasse.lf.aov, detailed = TRUE),
    caption = get_pwc_label(gmasse.lf.pwc)
  )
 plot7


 # does (log) seed mass differ between pollination types?
 ggboxplot(gmasse, x = "Bestaeubung", y = "Germinulenmasse.log")
 # assumption checks: outliers, normality (overall residuals and per group),
 # homogeneity of variances
 gmasse %>%
  group_by(Bestaeubung) %>%
  identify_outliers(Germinulenmasse.log)
 fm.gmasse.best <- lm(Germinulenmasse.log ~ Bestaeubung, data = gmasse)
 ggpubr::ggqqplot(residuals(fm.gmasse.best))
 rstatix::shapiro_test(residuals(fm.gmasse.best))
 ggqqplot(gmasse, "Germinulenmasse.log", facet.by = "Bestaeubung")
 gmasse %>% levene_test(Germinulenmasse.log ~ Bestaeubung)

 # the homogeneity of variance assumption was violated, therefore:
 # Welch's ANOVA instead of a standard ANOVA,
 # Games-Howell post hoc test for the pairwise comparisons
 gmasse.best.aov <- gmasse %>% welch_anova_test(Germinulenmasse.log ~ Bestaeubung)
 gmasse.best.aov
 gmasse.best.pwc <- gmasse %>% games_howell_test(Germinulenmasse.log ~ Bestaeubung)
 gmasse.best.pwc

 # box plot annotated with the significant pairwise comparisons
 gmasse.best.pwc <- add_xy_position(gmasse.best.pwc, x = "Bestaeubung")
 ggboxplot(gmasse, x = "Bestaeubung", y = "Germinulenmasse.log") +
  stat_pvalue_manual(gmasse.best.pwc, hide.ns = TRUE) +
  labs(
    x = "Bestäubungstyp",
    y = expression(log("Germinulenmasse")),
    subtitle = get_test_label(gmasse.best.aov, detailed = TRUE),
    caption = get_pwc_label(gmasse.best.pwc)
  )

 # save the edited data as a separate file
 # (the file name must be the one that is read back in below)
 write_csv2(veg, here("veg_data_2022_edited.csv"))

 # analyse species composition ----------------------------------------------------------------------

 # reload the edited data saved above
 veg <- read_csv2(here("veg_data_2022_edited.csv"))

 # plot x species matrix filled with % cover
 mat <- xtabs(Deckung ~ Aufnahmenr + WissName, data = veg)
 str(mat)

 # perform correspondence analysis
 decorana(mat)

 fm.ca <- cca(mat)
 fm.ca
 summary(fm.ca)
 plot(fm.ca, display = "sites")
 plot(fm.ca)

 # shorten taxon names: drop " cf." / " x", split the name at blanks, keep the
 # first three letters of each part and join the first two parts with a dot
 wname <- str_remove(veg$WissName, pattern = " cf\\.| x") %>%
  str_split(pattern = " ", simplify = TRUE) %>%
  substr(., 1, 3)
 wname <- paste(wname[, 1], wname[, 2], sep = ".")
 veg %<>% mutate(wname = wname)

 # repeat the same analysis with shortened taxon names provided
 mat <- xtabs(Deckung ~ Aufnahmenr + wname, data = veg)
 fm.ca <- cca(mat)
 summary(fm.ca)

 # plot the species scores on the first two axes, labelled with the short names
 summary(fm.ca)$species %>%
  as_tibble() %>%
  mutate(wname = rownames(summary(fm.ca)$species)) %>%
  ggplot(aes(x = CA1, y = CA2, label = wname)) +
  geom_point() +
  ggrepel::geom_text_repel(size = 1.5, force = 15, max.overlaps = 30) +
  theme_bw()

 # prepare data for trait ordination ----------------------------------------------------------------

 # share of each life form in the summed cover of a plot:
 # one row per plot, one "lf." column per life form (summed cover), then each
 # column is divided by the row total; the helper column `total` is dropped
 # afterwards (.keep = "unused")
 lf.kt <- veg %>%
  select(Aufnahmenr, Deckung, Lebensform) %>%
  pivot_wider(
    names_from = Lebensform, names_prefix = "lf.",
    values_from = Deckung, values_fn = sum, values_fill = 0
  ) %>%
  mutate(total = rowSums(.[, -1])) %>%
  mutate(across(lf.H:lf.M, ~ . / total), .keep = "unused")

 # share of each pollination type in the summed cover of a plot
 # (same procedure, "b." columns)
 best.kt <- veg %>%
  select(Aufnahmenr, Deckung, Bestaeubung) %>%
  pivot_wider(
    names_from = Bestaeubung, names_prefix = "b.",
    values_from = Deckung, values_fn = sum, values_fill = 0
  ) %>%
  mutate(total = rowSums(.[, -1])) %>%
  mutate(across(b.wi:b.mb, ~ . / total), .keep = "unused")

 # community weighted means (weights = % cover) of the numerical traits,
 # with shortened column names
 cwm <- veg %>%
  group_by(Aufnahmenr) %>%
  summarise(across(
    c(Hoehe.avg.log, Hoehe.cv.log, LDMC, SLA.log, Germinulenmasse.log),
    ~ weighted.mean(., w = Deckung, na.rm = TRUE)
  )) %>%
  rename(
    Ho.av = Hoehe.avg.log,
    Ho.cv = Hoehe.cv.log,
    SLA = SLA.log,
    GM = Germinulenmasse.log
  )

 # append the life form and pollination type shares to the CWM table
 cwm <- cwm %>%
  left_join(lf.kt) %>%
  left_join(best.kt)
 cwm

 # perform principal components analysis (PCA) of CWMs ----------------------------------------------

 # PCA of all traits
 # NOTE(review): dropping the first two columns removes Aufnahmenr AND Ho.av;
 # confirm that excluding Ho.av here is intended
 pca1 <- FactoMineR::PCA(cwm[, -(1:2)])
 summary(pca1)

 # components 1 vs 2 next to components 1 vs 3
 cowplot::plot_grid(
  plot.PCA(pca1),
  plot.PCA(pca1, axes = c(1, 3))
 )

 # PCA of the numerical traits only (columns Ho.av to GM)
 colnames(cwm)
 pca2 <- PCA(select(cwm, Ho.av:GM))
 cowplot::plot_grid(
  plot.PCA(pca2),
  plot.PCA(pca2, axes = c(1, 3))
 )

 # perform PCA of species traits --------------------------------------------------------------------

 # create a separate table containing numerical traits (with shortened column
 # names); records with any missing value are dropped
 dat <- select(
  veg, WissName, Vegetation, Blattflaeche, Frischmasse, Trockenmasse,
  SLA, LDMC, Hoehe.avg, Hoehe.var, Germinulenmasse
 ) %>%
  rename(
    BlF = Blattflaeche,
    FrM = Frischmasse,
    TrM = Trockenmasse,
    Ho.av = Hoehe.avg,
    Ho.var = Hoehe.var,
    GM = Germinulenmasse
  ) %>%
  na.omit()
 glimpse(dat)

 # perform PCA on the trait columns (all but WissName and Vegetation)
 pca3 <- FactoMineR::PCA(dat[, -c(1:2)])
 summary(pca3)
 plot.PCA(pca3)

 # add PC to the trait table
 dat$pc1 <- pca3$ind$coord[, 1]
 dat$pc2 <- pca3$ind$coord[, 2]
 pc12 <- pca3$ind$coord[, 1:2]

 # plot the PCA results;
 # the axis labels take the explained variance from the fitted PCA (second
 # column of pca3$eig = percentage of variance per component), so they stay
 # correct whenever the data change
 ggplot(data = dat, aes(x = pc1, y = pc2, color = Vegetation, shape = Vegetation)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point(alpha = .8) +
  stat_ellipse(
    geom = "polygon", aes(fill = Vegetation), alpha = .2,
    show.legend = FALSE, level = .95
  ) +
  guides(
    color = guide_legend(title = "Vegetation"),
    shape = guide_legend(title = "Vegetation")
  ) +
  labs(
    x = sprintf("PC 1 (%.2f%%)", pca3$eig[1, 2]),
    y = sprintf("PC 2 (%.2f%%)", pca3$eig[2, 2])
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_rect(fill = "transparent"),
    legend.position = "bottom"
  )

 # plot PCA results using kernels
 # Code from Björn Reu
 # Used to produce Fig. 2 in Diaz et al. 2016, Nature

 H <- Hpi(x = pc12) # optimal bandwidth estimation
 est <- kde(x = pc12, H = H, compute.cont = TRUE) # kernel density estimation

 # set contour probabilities for drawing contour levels
 cl <- contourLevels(est, prob = c(.5, .05, .001), approx = TRUE)

 # use envfit for drawing arrows, can be also done using trait loadings
 fit <- envfit(pc12, select(dat, BlF:GM))

 # create a plot; the previous graphical parameters are saved and restored
 # afterwards, as in the other base R plots of this script
 op <- par(mar = c(4, 4, 2, 2))
 cols <- ifelse(dat$Vegetation == "TR", "darkred", "darkblue")
 # filled density surface
 plot(est,
  cont = seq(1, 100, by = 1), display = "filled.contour2", add = FALSE, ylab = "", xlab = "",
  cex.axis = .75, ylim = c(-5, 4), xlim = c(-3, 7), las = 1
 )
 # contour lines at the three levels computed above
 plot(est, abs.cont = cl[1], labels = .5, labcex = .75, add = TRUE, lwd = .75, col = "grey30")
 plot(est, abs.cont = cl[2], labels = .95, labcex = .75, add = TRUE, lwd = .5, col = "grey60")
 plot(est, abs.cont = cl[3], labels = .99, labcex = .75, add = TRUE, lwd = .5, col = "grey60")
 # records (coloured by vegetation type) and trait arrows
 points(pc12, pch = 16, cex = .5, col = cols)
 plot(fit, cex = .90, col = 1)
 # axis labels with the explained variance taken from the fitted PCA (second
 # column of pca3$eig = percentage of variance per component)
 mtext(sprintf("PC 1 (%.2f%%)", pca3$eig[1, 2]), cex = .75, side = 1, line = 1.8)
 mtext(sprintf("PC 2 (%.2f%%)", pca3$eig[2, 2]), cex = .75, side = 2, line = 1.8)
 par(op)

 # look up the records with extreme scores
 dat[dat$pc1 > 4, ]
 dat[dat$pc2 < -3, ]