Skip to content

Instantly share code, notes, and snippets.

@DannyArends
Last active June 29, 2024 09:31
Show Gist options
  • Save DannyArends/75b8c64264fbe72ff194ce45f50d2637 to your computer and use it in GitHub Desktop.
Save DannyArends/75b8c64264fbe72ff194ce45f50d2637 to your computer and use it in GitHub Desktop.
Code for the YouTube lecture "Enrichment Analysis from Scratch" (https://www.youtube.com/live/MJ4A5fmgWhg)
#
# Code for the lecture on "Gene Ontology from Scratch"
# copyright (c) 2020-2024 Danny Arends
#
# This program is free software; you can redistribute it and/or modify it under the terms of
# the GNU General Public License,version 3, as published by the Free Software Foundation.
# This program is distributed in the hope that it will be useful, but without any warranty;
# without even the implied warranty of merchantability or fitness for a particular purpose.
#
# For more details see: www.gnu.org/licenses/gpl-3.0.en.html
#
# Required R packages
#BiocManager::install("preprocessCore", configure.args = c(preprocessCore = "--disable-threading"), force=TRUE, update=TRUE, type = "source")
#BiocManager::install(c("affy","limma"))
setwd("C:/Users/Danny/Documents/GO ORA")
# Libraries used for differentially expressed genes (DEGs)
library(affy)
library(limma)
library(preprocessCore)
# First we define our samples (www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE1676), Published under PMID: 15892871
# 4-Hour treatment with neocarzinostatin (NCS)
samples <- data.frame(cbind(
Type = rep("control", 6),
Time = c(rep("0", 3), rep("4", 3))
))
rownames(samples) <- c("GSM28627", "GSM28628", "GSM28629",
"GSM28630", "GSM28631", "GSM28632")
# Download the CEL files from GEO at www.ncbi.nlm.nih.gov
geo <- "https://www.ncbi.nlm.nih.gov/geo/download/"
for (ID in rownames(samples)) {
uri <- paste0(geo, "?acc=", ID, "&format=file&file=", ID, "%2ECEL%2Egz")
fp <- paste0(ID, ".CEL.gz")
if (!file.exists(fp)) {
download.file(uri, fp) # Downloads the file to the current folder
Sys.sleep(5) # Be nice to the server
}
}
# Read the .CEL.gz files & Robust Multi-Array Average (RMA) expression
cels <- ReadAffy(phenoData=AnnotatedDataFrame(data = samples))
eset <- rma(cels)
# Model the DEGs
design <- model.matrix(~ Time, data = pData(eset))
fit <- eBayes(lmFit(eset, design = design))
# Get all p-values & create a Volcano plot
degs <- topTable(fit, number = Inf)
plot(x = degs[, "logFC"], y = -log10(degs[, "adj.P.Val"]), pch = 19, yaxs= "i",
xlab = "LogFC", ylab = "-log10(Padj)", main = "Volcano plot")
abline(h = -log10(c(0.05, 0.01)), lty = 2, col = c(3, 5))
# Download the array annotation file
download.file("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?mode=raw&is_datatable=true&acc=GPL201&id=30390&db=GeoDb_blob144", "GPL201-30390.txt")
# Read the annotation & annotate the probes
GPL201 <- read.table("GPL201-30390.txt", header=TRUE, sep = "\t",
skip = 16, quote="", row.names = 1)
degs <- cbind(symbol = GPL201[rownames(degs), c("Gene.Symbol")],
degs[, c("logFC", "P.Value", "adj.P.Val")])
# Show the first 5 DEGs
degs[1:5,]
# Helper function to parse the Gene Ontology data given a certain probe
getGO <- function (probe, ontology = "Gene.Ontology.Biological.Process") {
split <- strsplit(GPL201[probe, ontology], "///")[[1]]
gos <- trimws(unlist(lapply(strsplit(split, "//"), "[", 1)))
des <- trimws(unlist(lapply(strsplit(split, "//"), "[", 2)))
if (length(gos) > 0) {
gos <- cbind(go = paste0("GO:", gos), descr = des)
rownames(gos) <- rep(probe, nrow(gos))
return(gos)
}
return() # Probe/Gene has no Gene Ontology terms
}
# Build the matrix of all GO terms for all probes / genes
GOs <- c()
for(probe in rownames(degs)){ GOs <- rbind(GOs, getGO(probe)) }
# We can only use DEGs which have a Gene Ontology annotation
degs <- degs[which(rownames(degs) %in% rownames(GOs)),]
# Define our foreground (Significant DEG) and background (All)
fg <- rownames(degs)[which(degs[, "adj.P.Val"] < 0.5)]
fg.go <- GOs[fg, "go"]
bg <- rownames(degs)
bg.go <- na.omit(GOs[bg, "go"])
## Over / Under representation analysis of Gene Ontology
ORA <- c()
for (go in unique(fg.go)) {
x <- length(unique(names(fg.go)[which(fg.go == go)])) # FG probes with this GO
m <- length(unique(names(bg.go)[which(bg.go == go)])) # BG probes with this GO
n <- length(unique(names(bg.go))) - m # BG probes without this GO
k <- length(unique(names(fg.go))) # All FG probes
ORA <- rbind(ORA, c(x, m, n + m, k, phyper(x - 1, m, n, k, lower.tail=FALSE)))
}
rownames(ORA) <- unique(fg.go)
colnames(ORA) <- c("x", "m", "n+m", "k", "P")
ORA <- cbind(ORA, "Padj" = p.adjust(ORA[, "P"], "BH"))
# Significant OR based on the BH adjusted P-value
isS <- rownames(ORA)[which(ORA[, "Padj"] < 0.05)]
# Annotation of significant GO terms
GOnames <- c()
for(i in isS) {
GOnames <- c(GOnames, unique(GOs[which(GOs[,"go"] == i), "descr"]))
}
ORA <- cbind(Description = GOnames, ORA[isS,])
# FG probes with this GO / All FG probes
x1 <- as.numeric(ORA[,"x"]) / as.numeric(ORA[,"k"])
# BG probes with this GO / All BG probes
x2 <- as.numeric(ORA[,"m"]) / as.numeric(ORA[,"n+m"])
# Expected based on the background
exp <- round(x2 * as.numeric(ORA[,"k"]), 3)
# Build the output matrix
ORA <- data.frame(cbind(Description = ORA[,"Description"],
Expected = paste0(exp, "/", ORA[,"k"]),
Observed = paste0(ORA[,"x"], "/", ORA[,"k"]),
Background = paste0(ORA[,"m"], "/",ORA[,"n+m"]),
FoldChange = round(x1/x2, 2),
P = ORA[, "P"],
Padj = ORA[,"Padj"]))
ORA
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment