DannyArends · June 29, 2024 09:31
diff --git a/Enrichment.R b/Enrichment.R
 #
 # Code for the lecture on "Gene Ontology from Scratch"
 # copyright (c) 2020-2024 Danny Arends
 #
 #     This program is free software; you can redistribute it and/or modify it under the terms of
 #     the GNU General Public License,version 3, as published by the Free Software Foundation.
 #     This program is distributed in the hope that it will be useful, but without any warranty;
 #     without even the implied warranty of merchantability or fitness for a particular purpose.
 #
 #     For more details see: www.gnu.org/licenses/gpl-3.0.en.html
 #

 # Required R packages
 #BiocManager::install("preprocessCore", configure.args = c(preprocessCore = "--disable-threading"), force=TRUE, update=TRUE, type = "source")
 #BiocManager::install(c("affy","limma"))
 setwd("C:/Users/Danny/Documents/GO ORA")

 # Libraries used for differentially expressed genes (DEGs)
 library(affy)
 library(limma)
 library(preprocessCore)

 # First we define our samples (www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE1676), Published under PMID: 15892871
 # 4-Hour treatment with neocarzinostatin (NCS)
 samples <- data.frame(cbind(
  Type = rep("control", 6),
  Time = c(rep("0", 3), rep("4", 3))
 ))
 rownames(samples) <- c("GSM28627", "GSM28628", "GSM28629", 
                       "GSM28630", "GSM28631", "GSM28632")

 # Download the CEL files from GEO at www.ncbi.nlm.nih.gov
 geo <- "https://www.ncbi.nlm.nih.gov/geo/download/"
 for (ID in rownames(samples)) {
  uri <- paste0(geo, "?acc=", ID, "&format=file&file=", ID, "%2ECEL%2Egz")
  fp <- paste0(ID, ".CEL.gz")
  if (!file.exists(fp)) {
    download.file(uri, fp)      # Downloads the file to the current folder
    Sys.sleep(5)                # Be nice to the server
  }
 }

 # Read the .CEL.gz files & Robust Multi-Array Average (RMA) expression
 cels <- ReadAffy(phenoData=AnnotatedDataFrame(data = samples))
 eset <- rma(cels)

 # Model the DEGs
 design <- model.matrix(~ Time, data = pData(eset))
 fit <- eBayes(lmFit(eset, design = design))

 # Get all p-values & create a Volcano plot
 degs <- topTable(fit, number = Inf)

 plot(x = degs[, "logFC"], y = -log10(degs[, "adj.P.Val"]), pch = 19, yaxs= "i", 
     xlab = "LogFC", ylab = "-log10(Padj)", main = "Volcano plot")
 abline(h = -log10(c(0.05, 0.01)), lty = 2, col = c(3, 5))

 # Download the array annotation file
 download.file("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?mode=raw&is_datatable=true&acc=GPL201&id=30390&db=GeoDb_blob144", "GPL201-30390.txt")

 # Read the annotation & annotate the probes
 GPL201 <- read.table("GPL201-30390.txt", header=TRUE, sep = "\t",
                     skip = 16, quote="", row.names = 1)
 degs <- cbind(symbol = GPL201[rownames(degs), c("Gene.Symbol")],
                       degs[, c("logFC", "P.Value", "adj.P.Val")])

 # Show the first 5 DEGs
 degs[1:5,]

 # Helper function to parse the Gene Ontology data given a certain probe
 getGO <- function (probe, ontology = "Gene.Ontology.Biological.Process") {
  split <- strsplit(GPL201[probe, ontology], "///")[[1]]
  gos <- trimws(unlist(lapply(strsplit(split, "//"), "[", 1)))
  des <- trimws(unlist(lapply(strsplit(split, "//"), "[", 2)))
  if (length(gos) > 0) {
    gos <- cbind(go = paste0("GO:", gos), descr = des)
    rownames(gos) <- rep(probe, nrow(gos))
    return(gos)
  }
  return() # Probe/Gene has no Gene Ontology terms
 }

 # Build the matrix of all GO terms for all probes / genes
 GOs <- c()
 for(probe in rownames(degs)){ GOs <- rbind(GOs, getGO(probe)) }

 # We can only use DEGs which have a Gene Ontology annotation
 degs <- degs[which(rownames(degs) %in% rownames(GOs)),]

 # Define our foreground (Significant DEG) and background (All)
 fg <- rownames(degs)[which(degs[, "adj.P.Val"] < 0.5)]
 fg.go <- GOs[fg, "go"]
 bg <- rownames(degs)
 bg.go <- na.omit(GOs[bg, "go"])

 ## Over / Under representation analysis of Gene Ontology
 ORA <- c()
 for (go in unique(fg.go)) {
  x <- length(unique(names(fg.go)[which(fg.go == go)]))    # FG probes with this GO
  m <- length(unique(names(bg.go)[which(bg.go == go)]))    # BG probes with this GO
  n <- length(unique(names(bg.go))) - m                    # BG probes without this GO
  k <- length(unique(names(fg.go)))                        # All FG probes

  ORA <- rbind(ORA, c(x, m, n + m, k, phyper(x - 1, m, n, k, lower.tail=FALSE)))
 }
 rownames(ORA) <- unique(fg.go)
 colnames(ORA) <- c("x", "m", "n+m", "k", "P")
 ORA <- cbind(ORA, "Padj" = p.adjust(ORA[, "P"], "BH"))

 # Significant OR based on the BH adjusted P-value
 isS <- rownames(ORA)[which(ORA[, "Padj"] < 0.05)]

 # Annotation of significant GO terms
 GOnames <- c()
 for(i in isS) { 
  GOnames <- c(GOnames, unique(GOs[which(GOs[,"go"] == i), "descr"]))
 }
 ORA <- cbind(Description = GOnames, ORA[isS,])

 # FG probes with this GO / All FG probes
 x1 <- as.numeric(ORA[,"x"]) / as.numeric(ORA[,"k"])

 # BG probes with this GO / All BG probes
 x2 <- as.numeric(ORA[,"m"]) / as.numeric(ORA[,"n+m"])

 # Expected based on the background
 exp <- round(x2 * as.numeric(ORA[,"k"]), 3)

 # Build the output matrix
 ORA <- data.frame(cbind(Description = ORA[,"Description"],
                        Expected = paste0(exp, "/", ORA[,"k"]),
                        Observed = paste0(ORA[,"x"], "/", ORA[,"k"]),
                        Background = paste0(ORA[,"m"], "/",ORA[,"n+m"]), 
                        FoldChange = round(x1/x2, 2), 
                        P = ORA[, "P"], 
                        Padj = ORA[,"Padj"]))
 ORA
	#
	# Code for the lecture on "Gene Ontology from Scratch"
	# copyright (c) 2020-2024 Danny Arends
	#
	# This program is free software; you can redistribute it and/or modify it under the terms of
	# the GNU General Public License,version 3, as published by the Free Software Foundation.
	# This program is distributed in the hope that it will be useful, but without any warranty;
	# without even the implied warranty of merchantability or fitness for a particular purpose.
	#
	# For more details see: www.gnu.org/licenses/gpl-3.0.en.html
	#

	# Required R packages
	#BiocManager::install("preprocessCore", configure.args = c(preprocessCore = "--disable-threading"), force=TRUE, update=TRUE, type = "source")
	#BiocManager::install(c("affy","limma"))
	setwd("C:/Users/Danny/Documents/GO ORA")

	# Libraries used for differentially expressed genes (DEGs)
	library(affy)
	library(limma)
	library(preprocessCore)

	# First we define our samples (www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE1676), Published under PMID: 15892871
	# 4-Hour treatment with neocarzinostatin (NCS)
	samples <- data.frame(cbind(
	Type = rep("control", 6),
	Time = c(rep("0", 3), rep("4", 3))
	))
	rownames(samples) <- c("GSM28627", "GSM28628", "GSM28629",
	"GSM28630", "GSM28631", "GSM28632")

	# Download the CEL files from GEO at www.ncbi.nlm.nih.gov
	geo <- "https://www.ncbi.nlm.nih.gov/geo/download/"
	for (ID in rownames(samples)) {
	uri <- paste0(geo, "?acc=", ID, "&format=file&file=", ID, "%2ECEL%2Egz")
	fp <- paste0(ID, ".CEL.gz")
	if (!file.exists(fp)) {
	download.file(uri, fp) # Downloads the file to the current folder
	Sys.sleep(5) # Be nice to the server
	}
	}

	# Read the .CEL.gz files & Robust Multi-Array Average (RMA) expression
	cels <- ReadAffy(phenoData=AnnotatedDataFrame(data = samples))
	eset <- rma(cels)

	# Model the DEGs
	design <- model.matrix(~ Time, data = pData(eset))
	fit <- eBayes(lmFit(eset, design = design))

	# Get all p-values & create a Volcano plot
	degs <- topTable(fit, number = Inf)

	plot(x = degs[, "logFC"], y = -log10(degs[, "adj.P.Val"]), pch = 19, yaxs= "i",
	xlab = "LogFC", ylab = "-log10(Padj)", main = "Volcano plot")
	abline(h = -log10(c(0.05, 0.01)), lty = 2, col = c(3, 5))

	# Download the array annotation file
	download.file("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?mode=raw&is_datatable=true&acc=GPL201&id=30390&db=GeoDb_blob144", "GPL201-30390.txt")

	# Read the annotation & annotate the probes
	GPL201 <- read.table("GPL201-30390.txt", header=TRUE, sep = "\t",
	skip = 16, quote="", row.names = 1)
	degs <- cbind(symbol = GPL201[rownames(degs), c("Gene.Symbol")],
	degs[, c("logFC", "P.Value", "adj.P.Val")])

	# Show the first 5 DEGs
	degs[1:5,]

	# Helper function to parse the Gene Ontology data given a certain probe
	getGO <- function (probe, ontology = "Gene.Ontology.Biological.Process") {
	split <- strsplit(GPL201[probe, ontology], "///")[[1]]
	gos <- trimws(unlist(lapply(strsplit(split, "//"), "[", 1)))
	des <- trimws(unlist(lapply(strsplit(split, "//"), "[", 2)))
	if (length(gos) > 0) {
	gos <- cbind(go = paste0("GO:", gos), descr = des)
	rownames(gos) <- rep(probe, nrow(gos))
	return(gos)
	}
	return() # Probe/Gene has no Gene Ontology terms
	}

	# Build the matrix of all GO terms for all probes / genes
	GOs <- c()
	for(probe in rownames(degs)){ GOs <- rbind(GOs, getGO(probe)) }

	# We can only use DEGs which have a Gene Ontology annotation
	degs <- degs[which(rownames(degs) %in% rownames(GOs)),]

	# Define our foreground (Significant DEG) and background (All)
	fg <- rownames(degs)[which(degs[, "adj.P.Val"] < 0.5)]
	fg.go <- GOs[fg, "go"]
	bg <- rownames(degs)
	bg.go <- na.omit(GOs[bg, "go"])

	## Over / Under representation analysis of Gene Ontology
	ORA <- c()
	for (go in unique(fg.go)) {
	x <- length(unique(names(fg.go)[which(fg.go == go)])) # FG probes with this GO
	m <- length(unique(names(bg.go)[which(bg.go == go)])) # BG probes with this GO
	n <- length(unique(names(bg.go))) - m # BG probes without this GO
	k <- length(unique(names(fg.go))) # All FG probes

	ORA <- rbind(ORA, c(x, m, n + m, k, phyper(x - 1, m, n, k, lower.tail=FALSE)))
	}
	rownames(ORA) <- unique(fg.go)
	colnames(ORA) <- c("x", "m", "n+m", "k", "P")
	ORA <- cbind(ORA, "Padj" = p.adjust(ORA[, "P"], "BH"))

	# Significant OR based on the BH adjusted P-value
	isS <- rownames(ORA)[which(ORA[, "Padj"] < 0.05)]

	# Annotation of significant GO terms
	GOnames <- c()
	for(i in isS) {
	GOnames <- c(GOnames, unique(GOs[which(GOs[,"go"] == i), "descr"]))
	}
	ORA <- cbind(Description = GOnames, ORA[isS,])

	# FG probes with this GO / All FG probes
	x1 <- as.numeric(ORA[,"x"]) / as.numeric(ORA[,"k"])

	# BG probes with this GO / All BG probes
	x2 <- as.numeric(ORA[,"m"]) / as.numeric(ORA[,"n+m"])

	# Expected based on the background
	exp <- round(x2 * as.numeric(ORA[,"k"]), 3)

	# Build the output matrix
	ORA <- data.frame(cbind(Description = ORA[,"Description"],
	Expected = paste0(exp, "/", ORA[,"k"]),
	Observed = paste0(ORA[,"x"], "/", ORA[,"k"]),
	Background = paste0(ORA[,"m"], "/",ORA[,"n+m"]),
	FoldChange = round(x1/x2, 2),
	P = ORA[, "P"],
	Padj = ORA[,"Padj"]))
	ORA