Rekyt · November 19, 2024 21:20 · Rekyt · Jul 23, 2021 · sunxi822 · Jul 23, 2021
diff --git a/arrow_pcoa.Rmd b/arrow_pcoa.Rmd
 ---
 title: "Adding arrows to PCoA"
 author: "Matthias Grenié"
 date: "26 septembre 2018"
 output: pdf_document
 ---

 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 ```

 The goal of this document is to show how to add arrows with variables on a PCoA.

 ## Load Data

 We'll use `ade4` data, `ggplot2` for plotting and `ape::pcoa()` to compute the actual pcoa. We'll use `woangers` dataset included in `ade4` because it mixes variable types.

 First let's load packages and data
 ```{r load_pkg_data}
 library("ade4")
 library("ggplot2")
 library("ape")

 data("woangers")
 ```

 Then compute functional dissimilarity using `ade4::dist.ktab()` (this chunk is copy-pasted from the example provided in the help of `dist.ktab()`). Beware this dataset contains `NA` to avoid further problems we select only species with no NA:
 ```{r compute_dist}
 traits = woangers$traits[complete.cases(woangers$traits),]


 # Nominal variables 'li', 'pr', 'lp' and 'le'
 # (see table 1 in the main text for the codes of the variables)
 tabN = traits[, c(1:2, 7, 8)]

 # Circular variable 'fo'
 tabC  = traits[3]
 tabCp = prep.circular(tabC, 1, 12)

 # The levels of the variable lie between 1 (January) and 12 (December).
 # Ordinal variables 'he', 'ae' and 'un'
 tabO = traits[, 4:6]

 # Fuzzy variables 'mp', 'pe' and 'di'
 # 'mp' has 3 levels, 'pe' has 3 levels and 'di' has 5 levels.
 tabF  = traits[, 9:19]
 tabFp = prep.fuzzy(tabF, c(3, 3, 5), labels = c("mp", "pe", "di"))

 # Quantitative variables 'lo' and 'lf'
 tabQ     = traits[, 20:21]

 # Compute dissimilarity
 ktab1    = ktab.list.df(list(tabN, tabCp, tabO, tabFp, tabQ))
 distrait = dist.ktab(ktab1, c("N", "C", "O", "F", "Q"))
 str(distrait)
 ```

 ## Compute PCoA

 Now that we have a distance we can compute the PCoA using `pcoa()` in the `ape` package, other functions exist (`ade4::dudi.pco()`, `vegan::cmdscale()`, etc.) but I use this one for the sake of simplicity:
 ```{r compute_pcoa}
 trait_pcoa = pcoa(distrait)
 str(trait_pcoa)
 ```

 ## Add arrows

 To add the arrows we use a function (considering only numeric traits and converting ordinal traits as numeric) inspired from `ape::biplot.pcoa()`, which formula is taken from Legendre & Legendre (1998):
 ```{r compute_arrows}
 compute_arrows = function(given_pcoa, trait_df) {
    
    # Keep only quantitative or ordinal variables
    # /!\ Change this line for different dataset
    #     or select only quantitative/ordinal var. /!\
    trait_df = trait_df[, c(4:6, 20, 21)]
    
    n <- nrow(trait_df)
    points.stand <- scale(given_pcoa$vectors)
    
    # Compute covariance of variables with all axes
    S <- cov(trait_df, points.stand)
    
    # Select only positive eigenvalues
    pos_eigen = given_pcoa$values$Eigenvalues[seq(ncol(S))]
    
    # Standardize value of covariance (see Legendre & Legendre 1998)
    U <- S %*% diag((pos_eigen/(n - 1))^(-0.5))
    colnames(U) <- colnames(given_pcoa$vectors)
    
    # Add values of covariances inside object
    given_pcoa$U <- U
    
    return(given_pcoa)
 }

 trait_pcoa_arrows = compute_arrows(trait_pcoa, traits)

 trait_pcoa_arrows$U[, 1:3]
 ```

 Now for each quantitative or ordinal variable we have a covariance value with the PCoA axis.

 ## Plotting the PCoA

 We can now plot the PCoA with arrows using `ggplot2`. For `ape::pcoa()` objects the values for the individuals are stored in `pcoa$vectors`, combined with our newly computed `pcoa$U` values we should be able to get a full graph:

 ```{r plotting_pcoa}
 # /!\ Be sure to change $vectors to a data.frame before putting it in ggplot2 /!\
 trait_pcoa_arrows$vectors = as.data.frame(trait_pcoa_arrows$vectors)

 # Basic plot with individuals
 plot_pcoa = ggplot(trait_pcoa_arrows$vectors, aes(Axis.1, Axis.2)) +
    geom_point()

 plot_pcoa

 # Now let's add the arrows
 # Each arrow begins at the origin of the plot (x = 0, y = 0) and ends at the
 # values of covariances of each variable
 plot_pcoa +
    geom_segment(data = as.data.frame(trait_pcoa_arrows$U),
                 x = 0, y = 0, alpha = 0.7,
                 mapping = aes(xend = Axis.1, yend = Axis.2),
                 # Add arrow head
                 arrow = arrow(length = unit(3, "mm")))
 ```
 Sometimes the scaling for arrows does not show well with the points. We can scale arbitrarily arrows just to show plot if we precise it in the legend:
 ```{r plot_pcoa_scaling}
 plot_pcoa +
    geom_segment(data = as.data.frame(trait_pcoa_arrows$U/20),
                 x = 0, y = 0, alpha = 0.7,
                 mapping = aes(xend = Axis.1, yend = Axis.2),
                 # Add arrow head
                 arrow = arrow(length = unit(3, "mm"))) +
    labs(subtitle = "Arrows scale arbitrarily")
 ```

 If we want to know the variable names we can slightly change the arrows data.frame:
 ```{r arrows_df}
 arrows_df = as.data.frame(trait_pcoa_arrows$U/20)
 arrows_df$variable = rownames(arrows_df)

 plot_pcoa +
    geom_segment(data = arrows_df,
                 x = 0, y = 0, alpha = 0.7,
                 mapping = aes(xend = Axis.1, yend = Axis.2),
                 # Add arrow head
                 arrow = arrow(length = unit(3, "mm"))) +
    geom_label(data = arrows_df, aes(label = variable)) +
    labs(subtitle = "Arrows scale arbitrarily")
 ```
 We have the name of the variables but we can even improve using the `ggrepel` package to avoid overlapping labels:
 ```{r plot_pcoa_label_repel}
 plot_pcoa +
    geom_segment(data = arrows_df,
                 x = 0, y = 0, alpha = 0.7,
                 mapping = aes(xend = Axis.1, yend = Axis.2),
                 # Add arrow head
                 arrow = arrow(length = unit(3, "mm"))) +
    ggrepel::geom_label_repel(data = arrows_df, aes(label = variable)) +
    labs(subtitle = "Arrows scale arbitrarily")
 ```

 From this we can improve the aspect of the PCoA:
 ```{r plot_pcoa_aspect}
 # Add better axes names
 arrows_df$var_name = c("Max. Height",
                       "Aerial Veg. Mult.",
                       "Underground Veg. Mult.",
                       "Seed Bank Long.",
                       "Flowering Period")

 plot_pcoa +
    geom_segment(data = as.data.frame(trait_pcoa_arrows$U/20),
                 x = 0, y = 0, alpha = 0.7,
                 mapping = aes(xend = Axis.1, yend = Axis.2),
                 # Add arrow head
                 arrow = arrow(length = unit(3, "mm"))) +
    # Add Axes Labels
    ggrepel::geom_label_repel(data = arrows_df, aes(label = var_name)) +
    # Add axes
    geom_hline(yintercept = 0, linetype = 2) +
    geom_vline(xintercept = 0, linetype = 2) +
    # Keep coord equal for each axis
    coord_equal() +
    # Labs
    labs(subtitle = "Arrows scale arbitrarily",
         # Add Explained Variance per axis
         x = paste0("Axis 1 (", round(trait_pcoa_arrows$values$Relative_eig[1] * 100, 2), "%)"),
         y = paste0("Axis 2 (", round(trait_pcoa_arrows$values$Relative_eig[2] * 100, 2), "%)")) +
    # Theme change
    theme_bw()
 ```


 ## Wrapping up in a single function

 Now we can try to automate as many task as possible using the a single function:

 ```{r final_function}
 # axis variables given the axes to show
 # the scalar helps for the visualization of the different axes
 show_final_pcoa = function(pcoa_object_with_arrows, axis1 = 1, axis2 = 2,
                           variable_scalar = 20) {
    
    given_pcoa = pcoa_object_with_arrows
    
    # Store axes names
    kept_axes = paste0("Axis.", c(axis1, axis2))
    
    # data.frame with values from species
    pcoa_df = as.data.frame(given_pcoa$vectors)
    
    # data.frame with variable covariances
    arrow_df = as.data.frame(given_pcoa$U/variable_scalar)
    arrow_df$variable = rownames(arrow_df) # Add variable names
    
    # Eigen values per axes
    eigen_values = given_pcoa$values$Relative_eig[c(axis1, axis2)]
    names(eigen_values) = kept_axes
    
    # Axes labels in plot
    axes_labs = lapply(kept_axes, function(x) {
        paste0(gsub(".", " ", x, fixed = TRUE),
               " (", round(eigen_values[x] * 100, 2), "%)")
    })
    
    # Compute plot limits to get symetrical limits on each axis (= square plot)
    limits = apply(given_pcoa$vectors[, kept_axes], 2, range)
    
    x_lim = c(limits[1, 1] - diff(limits[,1])/10,
              limits[2, 1] + diff(limits[,1])/10)
    
    y_lim = c(limits[1, 2] - diff(limits[,2])/10,
              limits[2, 2] + diff(limits[,2])/10)
    
    
    
    # Final plot
    ggplot(pcoa_df, aes_string(x = kept_axes[1],
                               y = kept_axes[2])) +
        # Add dashed axes
        geom_hline(yintercept = 0, linetype = 2) +
        geom_vline(xintercept = 0, linetype = 2) +
        # Add species points
        geom_point() +
        # Add arrows
        geom_segment(data = arrow_df, x = 0, y = 0, alpha = 0.7,
                     arrow = arrow(length = unit(3, "mm")),
                     aes_string(xend = kept_axes[1], yend = kept_axes[2])) +
        # Add arrow labels
        ggrepel::geom_label_repel(data = arrow_df, aes(label = variable),
                                  size = 5, fontface = "bold",
                                  lineheight = 0.7) +
        # Specified limits to get square plot
        # to see the effect of this you can replace this line by
        # coord_equal() +
        coord_fixed(xlim = x_lim, ylim = y_lim) +
        # Labels & Theme
        labs(x = axes_labs[[1]],
             y = axes_labs[[2]]) +
        theme_bw() +
        theme(panel.grid = element_blank())
 }
 ```

 We can now test our function:
 ```{r test_function}
 basic_arrow_pcoa = compute_arrows(trait_pcoa, traits)

 show_final_pcoa(basic_arrow_pcoa, 1, 2)
 show_final_pcoa(basic_arrow_pcoa, 2, 3)
 show_final_pcoa(basic_arrow_pcoa, 1, 3)
 ```

 It works!
 Feel free to reuse and change the process/function ;)

 ## References

 Legendre, P. and Legendre, L. (1998) Numerical Ecology, 2nd English edition. Amsterdam: Elsevier Science BV.
	---
	title: "Adding arrows to PCoA"
	author: "Matthias Grenié"
	date: "26 septembre 2018"
	output: pdf_document
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	```

	The goal of this document is to show how to add arrows with variables on a PCoA.

	## Load Data

	We'll use `ade4` data, `ggplot2` for plotting and `ape::pcoa()` to compute the actual pcoa. We'll use `woangers` dataset included in `ade4` because it mixes variable types.

	First let's load packages and data
	```{r load_pkg_data}
	library("ade4")
	library("ggplot2")
	library("ape")

	data("woangers")
	```

	Then compute functional dissimilarity using `ade4::dist.ktab()` (this chunk is copy-pasted from the example provided in the help of `dist.ktab()`). Beware this dataset contains `NA` to avoid further problems we select only species with no NA:
	```{r compute_dist}
	traits = woangers$traits[complete.cases(woangers$traits),]


	# Nominal variables 'li', 'pr', 'lp' and 'le'
	# (see table 1 in the main text for the codes of the variables)
	tabN = traits[, c(1:2, 7, 8)]

	# Circular variable 'fo'
	tabC = traits[3]
	tabCp = prep.circular(tabC, 1, 12)

	# The levels of the variable lie between 1 (January) and 12 (December).
	# Ordinal variables 'he', 'ae' and 'un'
	tabO = traits[, 4:6]

	# Fuzzy variables 'mp', 'pe' and 'di'
	# 'mp' has 3 levels, 'pe' has 3 levels and 'di' has 5 levels.
	tabF = traits[, 9:19]
	tabFp = prep.fuzzy(tabF, c(3, 3, 5), labels = c("mp", "pe", "di"))

	# Quantitative variables 'lo' and 'lf'
	tabQ = traits[, 20:21]

	# Compute dissimilarity
	ktab1 = ktab.list.df(list(tabN, tabCp, tabO, tabFp, tabQ))
	distrait = dist.ktab(ktab1, c("N", "C", "O", "F", "Q"))
	str(distrait)
	```

	## Compute PCoA

	Now that we have a distance we can compute the PCoA using `pcoa()` in the `ape` package, other functions exist (`ade4::dudi.pco()`, `vegan::cmdscale()`, etc.) but I use this one for the sake of simplicity:
	```{r compute_pcoa}
	trait_pcoa = pcoa(distrait)
	str(trait_pcoa)
	```

	## Add arrows

	To add the arrows we use a function (considering only numeric traits and converting ordinal traits as numeric) inspired from `ape::biplot.pcoa()`, which formula is taken from Legendre & Legendre (1998):
	```{r compute_arrows}
	compute_arrows = function(given_pcoa, trait_df) {

	# Keep only quantitative or ordinal variables
	# /!\ Change this line for different dataset
	# or select only quantitative/ordinal var. /!\
	trait_df = trait_df[, c(4:6, 20, 21)]

	n <- nrow(trait_df)
	points.stand <- scale(given_pcoa$vectors)

	# Compute covariance of variables with all axes
	S <- cov(trait_df, points.stand)

	# Select only positive eigenvalues
	pos_eigen = given_pcoa$values$Eigenvalues[seq(ncol(S))]

	# Standardize value of covariance (see Legendre & Legendre 1998)
	U <- S %*% diag((pos_eigen/(n - 1))^(-0.5))
	colnames(U) <- colnames(given_pcoa$vectors)

	# Add values of covariances inside object
	given_pcoa$U <- U

	return(given_pcoa)
	}

	trait_pcoa_arrows = compute_arrows(trait_pcoa, traits)

	trait_pcoa_arrows$U[, 1:3]
	```

	Now for each quantitative or ordinal variable we have a covariance value with the PCoA axis.

	## Plotting the PCoA

	We can now plot the PCoA with arrows using `ggplot2`. For `ape::pcoa()` objects the values for the individuals are stored in `pcoa$vectors`, combined with our newly computed `pcoa$U` values we should be able to get a full graph:

	```{r plotting_pcoa}
	# /!\ Be sure to change $vectors to a data.frame before putting it in ggplot2 /!\
	trait_pcoa_arrows$vectors = as.data.frame(trait_pcoa_arrows$vectors)

	# Basic plot with individuals
	plot_pcoa = ggplot(trait_pcoa_arrows$vectors, aes(Axis.1, Axis.2)) +
	geom_point()

	plot_pcoa

	# Now let's add the arrows
	# Each arrow begins at the origin of the plot (x = 0, y = 0) and ends at the
	# values of covariances of each variable
	plot_pcoa +
	geom_segment(data = as.data.frame(trait_pcoa_arrows$U),
	x = 0, y = 0, alpha = 0.7,
	mapping = aes(xend = Axis.1, yend = Axis.2),
	# Add arrow head
	arrow = arrow(length = unit(3, "mm")))
	```
	Sometimes the scaling for arrows does not show well with the points. We can scale arbitrarily arrows just to show plot if we precise it in the legend:
	```{r plot_pcoa_scaling}
	plot_pcoa +
	geom_segment(data = as.data.frame(trait_pcoa_arrows$U/20),
	x = 0, y = 0, alpha = 0.7,
	mapping = aes(xend = Axis.1, yend = Axis.2),
	# Add arrow head
	arrow = arrow(length = unit(3, "mm"))) +
	labs(subtitle = "Arrows scale arbitrarily")
	```

	If we want to know the variable names we can slightly change the arrows data.frame:
	```{r arrows_df}
	arrows_df = as.data.frame(trait_pcoa_arrows$U/20)
	arrows_df$variable = rownames(arrows_df)

	plot_pcoa +
	geom_segment(data = arrows_df,
	x = 0, y = 0, alpha = 0.7,
	mapping = aes(xend = Axis.1, yend = Axis.2),
	# Add arrow head
	arrow = arrow(length = unit(3, "mm"))) +
	geom_label(data = arrows_df, aes(label = variable)) +
	labs(subtitle = "Arrows scale arbitrarily")
	```
	We have the name of the variables but we can even improve using the `ggrepel` package to avoid overlapping labels:
	```{r plot_pcoa_label_repel}
	plot_pcoa +
	geom_segment(data = arrows_df,
	x = 0, y = 0, alpha = 0.7,
	mapping = aes(xend = Axis.1, yend = Axis.2),
	# Add arrow head
	arrow = arrow(length = unit(3, "mm"))) +
	ggrepel::geom_label_repel(data = arrows_df, aes(label = variable)) +
	labs(subtitle = "Arrows scale arbitrarily")
	```

	From this we can improve the aspect of the PCoA:
	```{r plot_pcoa_aspect}
	# Add better axes names
	arrows_df$var_name = c("Max. Height",
	"Aerial Veg. Mult.",
	"Underground Veg. Mult.",
	"Seed Bank Long.",
	"Flowering Period")

	plot_pcoa +
	geom_segment(data = as.data.frame(trait_pcoa_arrows$U/20),
	x = 0, y = 0, alpha = 0.7,
	mapping = aes(xend = Axis.1, yend = Axis.2),
	# Add arrow head
	arrow = arrow(length = unit(3, "mm"))) +
	# Add Axes Labels
	ggrepel::geom_label_repel(data = arrows_df, aes(label = var_name)) +
	# Add axes
	geom_hline(yintercept = 0, linetype = 2) +
	geom_vline(xintercept = 0, linetype = 2) +
	# Keep coord equal for each axis
	coord_equal() +
	# Labs
	labs(subtitle = "Arrows scale arbitrarily",
	# Add Explained Variance per axis
	x = paste0("Axis 1 (", round(trait_pcoa_arrows$values$Relative_eig[1] * 100, 2), "%)"),
	y = paste0("Axis 2 (", round(trait_pcoa_arrows$values$Relative_eig[2] * 100, 2), "%)")) +
	# Theme change
	theme_bw()
	```


	## Wrapping up in a single function

	Now we can try to automate as many task as possible using the a single function:

	```{r final_function}
	# axis variables given the axes to show
	# the scalar helps for the visualization of the different axes
	show_final_pcoa = function(pcoa_object_with_arrows, axis1 = 1, axis2 = 2,
	variable_scalar = 20) {

	given_pcoa = pcoa_object_with_arrows

	# Store axes names
	kept_axes = paste0("Axis.", c(axis1, axis2))

	# data.frame with values from species
	pcoa_df = as.data.frame(given_pcoa$vectors)

	# data.frame with variable covariances
	arrow_df = as.data.frame(given_pcoa$U/variable_scalar)
	arrow_df$variable = rownames(arrow_df) # Add variable names

	# Eigen values per axes
	eigen_values = given_pcoa$values$Relative_eig[c(axis1, axis2)]
	names(eigen_values) = kept_axes

	# Axes labels in plot
	axes_labs = lapply(kept_axes, function(x) {
	paste0(gsub(".", " ", x, fixed = TRUE),
	" (", round(eigen_values[x] * 100, 2), "%)")
	})

	# Compute plot limits to get symetrical limits on each axis (= square plot)
	limits = apply(given_pcoa$vectors[, kept_axes], 2, range)

	x_lim = c(limits[1, 1] - diff(limits[,1])/10,
	limits[2, 1] + diff(limits[,1])/10)

	y_lim = c(limits[1, 2] - diff(limits[,2])/10,
	limits[2, 2] + diff(limits[,2])/10)



	# Final plot
	ggplot(pcoa_df, aes_string(x = kept_axes[1],
	y = kept_axes[2])) +
	# Add dashed axes
	geom_hline(yintercept = 0, linetype = 2) +
	geom_vline(xintercept = 0, linetype = 2) +
	# Add species points
	geom_point() +
	# Add arrows
	geom_segment(data = arrow_df, x = 0, y = 0, alpha = 0.7,
	arrow = arrow(length = unit(3, "mm")),
	aes_string(xend = kept_axes[1], yend = kept_axes[2])) +
	# Add arrow labels
	ggrepel::geom_label_repel(data = arrow_df, aes(label = variable),
	size = 5, fontface = "bold",
	lineheight = 0.7) +
	# Specified limits to get square plot
	# to see the effect of this you can replace this line by
	# coord_equal() +
	coord_fixed(xlim = x_lim, ylim = y_lim) +
	# Labels & Theme
	labs(x = axes_labs[[1]],
	y = axes_labs[[2]]) +
	theme_bw() +
	theme(panel.grid = element_blank())
	}
	```

	We can now test our function:
	```{r test_function}
	basic_arrow_pcoa = compute_arrows(trait_pcoa, traits)

	show_final_pcoa(basic_arrow_pcoa, 1, 2)
	show_final_pcoa(basic_arrow_pcoa, 2, 3)
	show_final_pcoa(basic_arrow_pcoa, 1, 3)
	```

	It works!
	Feel free to reuse and change the process/function ;)

	## References

	Legendre, P. and Legendre, L. (1998) Numerical Ecology, 2nd English edition. Amsterdam: Elsevier Science BV.