Created
December 14, 2022 19:56
-
-
Save timedreamer/ba8e8243bb1c9e264a737ce1fcc53c98 to your computer and use it in GitHub Desktop.
Use topGO package for GO analysis in R.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 0. Prep ----------------------------------------------------------------- | |
library(tidyverse) | |
library(here) | |
# 1. Load and clean EggNOG output ----------------------------------------- | |
## The output was downloaded from the eggnog-mapper online run. | |
eggnog_go <- read_tsv(here("data", "misc", "out.emapper.annotations.gz"), | |
skip = 4) %>% | |
dplyr::rename(query = "#query") | |
eggnog_go_clean <- eggnog_go %>% dplyr::select(query, GOs) | |
eggnog_go_clean <- eggnog_go_clean %>% filter(GOs != "-") %>% | |
filter(grepl(pattern = "^LOC", .$query)) | |
eggnog_go_clean <- eggnog_go_clean %>% | |
separate(col = query, sep = "\\.", into = c("geneID", "rm1")) %>% | |
dplyr::select(-rm1) | |
write_tsv(eggnog_go_clean, | |
here("data", "misc", "EggNOG_GOv1.tsv"), col_names = FALSE) | |
# 2. Run GO enrichment analysis ------------------------------------------- | |
library(topGO) | |
## 2.1 Prep geneID to GO table | |
geneID2GO <- readMappings(file = here("data", "misc", "EggNOG_GOv1.tsv")) | |
str(head(geneID2GO)) | |
geneNames <- names(geneID2GO) | |
head(geneNames) | |
length(geneID2GO) # 14036 genes have at least one GO term | |
pdata <- tibble(GO_number_each_gene = lengths(geneID2GO), | |
geneid = names(geneID2GO), | |
group = "gene") | |
pdata %>% ggplot(., aes(x= group, y = GO_number_each_gene)) + | |
geom_boxplot() + | |
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), | |
labels = trans_format("log10", math_format(10^.x))) + | |
cowplot::theme_cowplot() + | |
theme(axis.text.x=element_blank(), #remove x axis labels | |
axis.ticks.x=element_blank(), #remove x axis ticks | |
) + | |
xlab("Gene") | |
## 2.2 Build the topGOdata object | |
myInterestingGenes <- sample(geneNames, length(geneNames) / 10) | |
geneList <- factor(as.integer(geneNames %in% myInterestingGenes)) | |
names(geneList) <- geneNames | |
sampleGOdata <- new("topGOdata", | |
description = "Simple test", ontology = "BP", | |
allGenes = geneList, | |
# geneSel = topDiffGenes, # use this for GSEA | |
nodeSize = 5, | |
annot = annFUN.gene2GO, | |
gene2GO = geneID2GO) | |
sampleGOdata | |
## 2.3 Run test | |
resultFisher <- runTest(sampleGOdata, algorithm = "classic", statistic = "fisher") | |
# resultKS <- runTest(sampleGOdata, algorithm = "classic", statistic = "ks") | |
# resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", statistic = "ks") | |
## 2.4 Get result table and plot | |
allRes <- GenTable(sampleGOdata, classicFisher = resultFisher, | |
# classicKS = resultKS, elimKS = resultKS.elim, | |
ranksOf = "classicFisher", topNodes = 10) | |
head(allRes) | |
showSigOfNodes(sampleGOdata, score(resultFisher), firstSigNodes = 2, | |
useInfo = "all") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Really helpful to organize any EggNOG results. I produced a plot with the top GO from a
multifasta.fa
file of CDS.