Last active
June 29, 2024 09:31
-
-
Save DannyArends/75b8c64264fbe72ff194ce45f50d2637 to your computer and use it in GitHub Desktop.
Code for the YouTube lecture "Enrichment Analysis from Scratch" (https://www.youtube.com/live/MJ4A5fmgWhg)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Code for the lecture on "Gene Ontology from Scratch" | |
# copyright (c) 2020-2024 Danny Arends | |
# | |
# This program is free software; you can redistribute it and/or modify it under the terms of | |
# the GNU General Public License,version 3, as published by the Free Software Foundation. | |
# This program is distributed in the hope that it will be useful, but without any warranty; | |
# without even the implied warranty of merchantability or fitness for a particular purpose. | |
# | |
# For more details see: www.gnu.org/licenses/gpl-3.0.en.html | |
# | |
# Required R packages | |
#BiocManager::install("preprocessCore", configure.args = c(preprocessCore = "--disable-threading"), force=TRUE, update=TRUE, type = "source") | |
#BiocManager::install(c("affy","limma")) | |
setwd("C:/Users/Danny/Documents/GO ORA") | |
# Libraries used for differentially expressed genes (DEGs) | |
library(affy) | |
library(limma) | |
library(preprocessCore) | |
# First we define our samples (www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE1676), Published under PMID: 15892871 | |
# 4-Hour treatment with neocarzinostatin (NCS) | |
samples <- data.frame(cbind( | |
Type = rep("control", 6), | |
Time = c(rep("0", 3), rep("4", 3)) | |
)) | |
rownames(samples) <- c("GSM28627", "GSM28628", "GSM28629", | |
"GSM28630", "GSM28631", "GSM28632") | |
# Download the CEL files from GEO at www.ncbi.nlm.nih.gov | |
geo <- "https://www.ncbi.nlm.nih.gov/geo/download/" | |
for (ID in rownames(samples)) { | |
uri <- paste0(geo, "?acc=", ID, "&format=file&file=", ID, "%2ECEL%2Egz") | |
fp <- paste0(ID, ".CEL.gz") | |
if (!file.exists(fp)) { | |
download.file(uri, fp) # Downloads the file to the current folder | |
Sys.sleep(5) # Be nice to the server | |
} | |
} | |
# Read the .CEL.gz files & Robust Multi-Array Average (RMA) expression | |
cels <- ReadAffy(phenoData=AnnotatedDataFrame(data = samples)) | |
eset <- rma(cels) | |
# Model the DEGs | |
design <- model.matrix(~ Time, data = pData(eset)) | |
fit <- eBayes(lmFit(eset, design = design)) | |
# Get all p-values & create a Volcano plot | |
degs <- topTable(fit, number = Inf) | |
plot(x = degs[, "logFC"], y = -log10(degs[, "adj.P.Val"]), pch = 19, yaxs= "i", | |
xlab = "LogFC", ylab = "-log10(Padj)", main = "Volcano plot") | |
abline(h = -log10(c(0.05, 0.01)), lty = 2, col = c(3, 5)) | |
# Download the array annotation file | |
download.file("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?mode=raw&is_datatable=true&acc=GPL201&id=30390&db=GeoDb_blob144", "GPL201-30390.txt") | |
# Read the annotation & annotate the probes | |
GPL201 <- read.table("GPL201-30390.txt", header=TRUE, sep = "\t", | |
skip = 16, quote="", row.names = 1) | |
degs <- cbind(symbol = GPL201[rownames(degs), c("Gene.Symbol")], | |
degs[, c("logFC", "P.Value", "adj.P.Val")]) | |
# Show the first 5 DEGs | |
degs[1:5,] | |
# Helper function to parse the Gene Ontology data given a certain probe | |
getGO <- function (probe, ontology = "Gene.Ontology.Biological.Process") { | |
split <- strsplit(GPL201[probe, ontology], "///")[[1]] | |
gos <- trimws(unlist(lapply(strsplit(split, "//"), "[", 1))) | |
des <- trimws(unlist(lapply(strsplit(split, "//"), "[", 2))) | |
if (length(gos) > 0) { | |
gos <- cbind(go = paste0("GO:", gos), descr = des) | |
rownames(gos) <- rep(probe, nrow(gos)) | |
return(gos) | |
} | |
return() # Probe/Gene has no Gene Ontology terms | |
} | |
# Build the matrix of all GO terms for all probes / genes | |
GOs <- c() | |
for(probe in rownames(degs)){ GOs <- rbind(GOs, getGO(probe)) } | |
# We can only use DEGs which have a Gene Ontology annotation | |
degs <- degs[which(rownames(degs) %in% rownames(GOs)),] | |
# Define our foreground (Significant DEG) and background (All) | |
fg <- rownames(degs)[which(degs[, "adj.P.Val"] < 0.5)] | |
fg.go <- GOs[fg, "go"] | |
bg <- rownames(degs) | |
bg.go <- na.omit(GOs[bg, "go"]) | |
## Over / Under representation analysis of Gene Ontology | |
ORA <- c() | |
for (go in unique(fg.go)) { | |
x <- length(unique(names(fg.go)[which(fg.go == go)])) # FG probes with this GO | |
m <- length(unique(names(bg.go)[which(bg.go == go)])) # BG probes with this GO | |
n <- length(unique(names(bg.go))) - m # BG probes without this GO | |
k <- length(unique(names(fg.go))) # All FG probes | |
ORA <- rbind(ORA, c(x, m, n + m, k, phyper(x - 1, m, n, k, lower.tail=FALSE))) | |
} | |
rownames(ORA) <- unique(fg.go) | |
colnames(ORA) <- c("x", "m", "n+m", "k", "P") | |
ORA <- cbind(ORA, "Padj" = p.adjust(ORA[, "P"], "BH")) | |
# Significant OR based on the BH adjusted P-value | |
isS <- rownames(ORA)[which(ORA[, "Padj"] < 0.05)] | |
# Annotation of significant GO terms | |
GOnames <- c() | |
for(i in isS) { | |
GOnames <- c(GOnames, unique(GOs[which(GOs[,"go"] == i), "descr"])) | |
} | |
ORA <- cbind(Description = GOnames, ORA[isS,]) | |
# FG probes with this GO / All FG probes | |
x1 <- as.numeric(ORA[,"x"]) / as.numeric(ORA[,"k"]) | |
# BG probes with this GO / All BG probes | |
x2 <- as.numeric(ORA[,"m"]) / as.numeric(ORA[,"n+m"]) | |
# Expected based on the background | |
exp <- round(x2 * as.numeric(ORA[,"k"]), 3) | |
# Build the output matrix | |
ORA <- data.frame(cbind(Description = ORA[,"Description"], | |
Expected = paste0(exp, "/", ORA[,"k"]), | |
Observed = paste0(ORA[,"x"], "/", ORA[,"k"]), | |
Background = paste0(ORA[,"m"], "/",ORA[,"n+m"]), | |
FoldChange = round(x1/x2, 2), | |
P = ORA[, "P"], | |
Padj = ORA[,"Padj"])) | |
ORA |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment