timedreamer · December 14, 2022 19:56 · azmigueldario · Jul 10, 2024
diff --git a/GO_anlaysis_using_topGO.R b/GO_anlaysis_using_topGO.R
 # 0. Prep -----------------------------------------------------------------

 library(tidyverse)
 library(here)

 # 1. Load and clean EggNOG output -----------------------------------------

 ## The output was downloaded from the eggnog-mapper online run.
 eggnog_go <- read_tsv(here("data", "misc", "out.emapper.annotations.gz"), 
                      skip = 4) %>% 
    dplyr::rename(query = "#query")

 eggnog_go_clean <- eggnog_go %>% dplyr::select(query, GOs)

 eggnog_go_clean <- eggnog_go_clean %>% filter(GOs != "-") %>% 
    filter(grepl(pattern = "^LOC", .$query))

 eggnog_go_clean <- eggnog_go_clean %>% 
    separate(col = query, sep = "\\.", into = c("geneID", "rm1")) %>% 
    dplyr::select(-rm1)

 write_tsv(eggnog_go_clean, 
          here("data", "misc", "EggNOG_GOv1.tsv"), col_names = FALSE)


 # 2. Run GO enrichment analysis -------------------------------------------

 library(topGO)

 ## 2.1 Prep geneID to GO table
 geneID2GO <- readMappings(file = here("data", "misc", "EggNOG_GOv1.tsv"))
 str(head(geneID2GO))

 geneNames <- names(geneID2GO)
 head(geneNames)


 length(geneID2GO) # 14036 genes have at least one GO term

 pdata <- tibble(GO_number_each_gene = lengths(geneID2GO),
                geneid = names(geneID2GO),
                group = "gene")

 pdata %>% ggplot(., aes(x= group, y = GO_number_each_gene)) +
    geom_boxplot() +
    scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x))) +
    cowplot::theme_cowplot() +
    theme(axis.text.x=element_blank(), #remove x axis labels
          axis.ticks.x=element_blank(), #remove x axis ticks
    ) +
    xlab("Gene")
    

 ## 2.2 Build the topGOdata object

 myInterestingGenes <- sample(geneNames, length(geneNames) / 10)
 geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
 names(geneList) <- geneNames


 sampleGOdata <- new("topGOdata",
                    description = "Simple test", ontology = "BP",
                    allGenes = geneList, 
                    # geneSel = topDiffGenes, # use this for GSEA
                    nodeSize = 5,
                    annot = annFUN.gene2GO, 
                    gene2GO = geneID2GO)

 sampleGOdata

 ## 2.3 Run test
 resultFisher <- runTest(sampleGOdata, algorithm = "classic", statistic = "fisher")

 # resultKS <- runTest(sampleGOdata, algorithm = "classic", statistic = "ks")
 # resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", statistic = "ks")

 ## 2.4 Get result table and plot

 allRes <- GenTable(sampleGOdata, classicFisher = resultFisher,
                   # classicKS = resultKS, elimKS = resultKS.elim,
                   ranksOf = "classicFisher", topNodes = 10)

 head(allRes)

 showSigOfNodes(sampleGOdata, score(resultFisher), firstSigNodes = 2, 
               useInfo = "all")
	# 0. Prep -----------------------------------------------------------------

	library(tidyverse)
	library(here)

	# 1. Load and clean EggNOG output -----------------------------------------

	## The output was downloaded from the eggnog-mapper online run.
	eggnog_go <- read_tsv(here("data", "misc", "out.emapper.annotations.gz"),
	skip = 4) %>%
	dplyr::rename(query = "#query")

	eggnog_go_clean <- eggnog_go %>% dplyr::select(query, GOs)

	eggnog_go_clean <- eggnog_go_clean %>% filter(GOs != "-") %>%
	filter(grepl(pattern = "^LOC", .$query))

	eggnog_go_clean <- eggnog_go_clean %>%
	separate(col = query, sep = "\\.", into = c("geneID", "rm1")) %>%
	dplyr::select(-rm1)

	write_tsv(eggnog_go_clean,
	here("data", "misc", "EggNOG_GOv1.tsv"), col_names = FALSE)


	# 2. Run GO enrichment analysis -------------------------------------------

	library(topGO)

	## 2.1 Prep geneID to GO table
	geneID2GO <- readMappings(file = here("data", "misc", "EggNOG_GOv1.tsv"))
	str(head(geneID2GO))

	geneNames <- names(geneID2GO)
	head(geneNames)


	length(geneID2GO) # 14036 genes have at least one GO term

	pdata <- tibble(GO_number_each_gene = lengths(geneID2GO),
	geneid = names(geneID2GO),
	group = "gene")

	pdata %>% ggplot(., aes(x= group, y = GO_number_each_gene)) +
	geom_boxplot() +
	scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
	labels = trans_format("log10", math_format(10^.x))) +
	cowplot::theme_cowplot() +
	theme(axis.text.x=element_blank(), #remove x axis labels
	axis.ticks.x=element_blank(), #remove x axis ticks
	) +
	xlab("Gene")


	## 2.2 Build the topGOdata object

	myInterestingGenes <- sample(geneNames, length(geneNames) / 10)
	geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
	names(geneList) <- geneNames


	sampleGOdata <- new("topGOdata",
	description = "Simple test", ontology = "BP",
	allGenes = geneList,
	# geneSel = topDiffGenes, # use this for GSEA
	nodeSize = 5,
	annot = annFUN.gene2GO,
	gene2GO = geneID2GO)

	sampleGOdata

	## 2.3 Run test
	resultFisher <- runTest(sampleGOdata, algorithm = "classic", statistic = "fisher")

	# resultKS <- runTest(sampleGOdata, algorithm = "classic", statistic = "ks")
	# resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", statistic = "ks")

	## 2.4 Get result table and plot

	allRes <- GenTable(sampleGOdata, classicFisher = resultFisher,
	# classicKS = resultKS, elimKS = resultKS.elim,
	ranksOf = "classicFisher", topNodes = 10)

	head(allRes)

	showSigOfNodes(sampleGOdata, score(resultFisher), firstSigNodes = 2,
	useInfo = "all")