Skip to content

Instantly share code, notes, and snippets.

@yulijia
Created February 9, 2021 13:11
Show Gist options
  • Save yulijia/0ffd031af93e2d3cbd2ed42bb6cf54b3 to your computer and use it in GitHub Desktop.
Save yulijia/0ffd031af93e2d3cbd2ed42bb6cf54b3 to your computer and use it in GitHub Desktop.
Bioinformatics Analysis on expression data
Category Term Count PValue
GO_MF GO:0030165~PDZ domain binding 5 0.003597602
GO_CC GO:0005576~extracellular region 23 0.003966517
GO_CC GO:0016021~integral component of membrane 53 0.007762346
GO_BP GO:2000820~negative regulation of transcription from RNA polymerase II promoter involved in smooth muscle cell differentiation 2 0.015069497
GO_BP GO:0036304~umbilical cord morphogenesis 2 0.015069497
GO_BP GO:0035987~endodermal cell differentiation 3 0.017605976
GO_BP GO:0060412~ventricular septum morphogenesis 3 0.020166114
GO_BP GO:0048008~platelet-derived growth factor receptor signaling pathway 3 0.020166114
GO_CC GO:0009986~cell surface 10 0.020464569
GO_MF GO:0035939~microsatellite binding 2 0.021701287
GO_BP GO:2001212~regulation of vasculogenesis 2 0.022519537
GO_BP GO:0071673~positive regulation of smooth muscle cell chemotaxis 2 0.022519537
GO_CC GO:0031012~extracellular matrix 7 0.0237569
GO_BP GO:0002027~regulation of heart rate 3 0.025717694
GO_CC GO:0042383~sarcolemma 4 0.025758908
GO_BP GO:0009968~negative regulation of signal transduction 3 0.027191909
GO_BP GO:0001525~angiogenesis 6 0.02727028
GO_BP GO:0035910~ascending aorta morphogenesis 2 0.029913665
GO_BP GO:0048619~embryonic hindgut morphogenesis 2 0.029913665
GO_BP GO:0060842~arterial endothelial cell differentiation 2 0.029913665
GO_BP GO:0030177~positive regulation of Wnt signaling pathway 3 0.030239789
GO_BP GO:0048813~dendrite morphogenesis 3 0.031812301
GO_CC GO:0005796~Golgi lumen 4 0.035143139
GO_MF GO:0000988~transcription factor activity, protein binding 2 0.035908636
GO_BP GO:0006167~AMP biosynthetic process 2 0.037252298
GO_CC GO:0005886~plasma membrane 41 0.038858292
GO_CC GO:0005615~extracellular space 17 0.043340256
GO_CC GO:0005783~endoplasmic reticulum 12 0.045756018
library(GEOquery)
library(limma)
library(pheatmap)
library(clusterProfiler)
library(org.Hs.eg.db)
library(tidyverse)
library(enrichplot)
# load series and platform data from GEO
gset <- getGEO("GSE122220", GSEMatrix =TRUE, AnnotGPL=TRUE)
if (length(gset) > 1) idx <- grep("GPL10558", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
gset.bk=gset
# # group membership for all samples
# gsms <- "0111110100"
# sml <- strsplit(gsms, split="")[[1]]
# group membership for all samples
gsms <- "011X110100"
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
# log2 transformation
ex <- exprs(gset)
meta <- fData(gset)
meta=meta[!is.na(meta$Gene.ID),]
exn=(ex[row.names(ex)%in%meta$ID,])
extab=data.frame(exn,ID=row.names(exn))
exprSet <- merge(meta[,c("ID","Gene.ID")],extab,by='ID')
exprSet.final <- extab %>%
inner_join(meta[,c("ID","Gene.ID")],by="ID") %>%
# select(-ID) %>%
select(Gene.ID, everything()) %>%
mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>%
arrange(desc(rowMean)) %>%
distinct(Gene.ID,.keep_all = T) %>%
select(-rowMean)
exn.table=as.matrix(exprSet.final[,grepl("GSM",names(exprSet.final))])
row.names(exn.table)=exprSet.final$ID
qx <- as.numeric(quantile(exn.table, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0)
## find NA value
if (LogC) { exn[which(exn.table <= 0)] <- NaN;
dat=log2(exn.table) }
## re-create eSet
gset=ExpressionSet(assayData=dat[meta[meta$ID%in%exprSet.final$ID,]$ID,],featureData=AnnotatedDataFrame(meta[meta$ID%in%exprSet.final$ID,]),phenoData=phenoData(gset))
# dat = exprs(gset)
dat %>% as.data.frame()%>% filter_all(any_vars(is.na(.)))
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("response","non-response"))
levels(gs) <- groups
gset$group <- gs
# Value 0 corresponds to the intercept (if any), and positive values to terms in the order given by the term
design <- model.matrix(~group + 0,gset)
colnames(design) <- levels(gs)
fit <- limma::lmFit(gset, design) # fit linear model
# set up contrasts of interest and recalculate model coefficients
cts <- c(paste(groups[1],"-",groups[2],sep=""))
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT=topTable(fit2,sort="none",n=Inf)
write.table(tT, file="bio_GSE122220.tsv", row.names=F, sep="\t")
# summarize test results as "up", "down" or "not expressed"
dT = decideTests(fit2, adjust.method="none", p.value=0.05,lfc=1.1)
df = cbind(tT,dT)
ggplot(df) +
geom_point(aes(x=logFC, y=-log10(P.Value), colour=as.factor(`response-non.response`))) +
ggtitle("Differential expressed gene") +
xlab("log2 fold change") +
ylab("-log10 p-value") +
theme(legend.position = "none",
plot.title = element_text(size = rel(1.5), hjust = 0.5),
axis.title = element_text(size = rel(1.25))) +
geom_hline(yintercept=-1*log10(0.05),color = "black")+
geom_vline(xintercept = -1.1,color = "black")+
geom_vline(xintercept = 1.1,color = "black")
degene=df[df$`response-non.response`!=0,]$ID
annotation_col=data.frame(groups = gs)
row.names(annotation_col)=gset$geo_accession
pheatmap(dat[row.names(dat)%in%df[df$`response-non.response`!=0,]$ID,],
cluster_cols = T,cluster_rows = T,show_rownames = F,annotation_col = annotation_col)
select=df[df$`response-non.response`!=0,]
write.table(df[df$`response-non.response`!=0,],"./DE_gene.tsv",sep="\t",row.names = F)
length(select$Gene.ID)
ego.bp <- enrichGO(gene = select[select$`response-non.response`==1,]$Gene.ID,
universe = as.character(df$Gene.ID),
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 1,
readable = TRUE)
ego.mf <- enrichGO(gene = select[select$`response-non.response`==1,]$Gene.ID,
universe = as.character(df$Gene.ID),
OrgDb = org.Hs.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 1,
readable = TRUE)
ego.cc <- enrichGO(gene = select[select$`response-non.response`==1,]$Gene.ID,
universe = as.character(df$Gene.ID),
OrgDb = org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 1,
readable = TRUE)
ego.bp <- enrichGO(gene = select[select$`response-non.response`==-1,]$Gene.ID,
universe = as.character(df$Gene.ID),
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 1,
readable = TRUE)
ego.mf <- enrichGO(gene = select[select$`response-non.response`==-1,]$Gene.ID,
universe = as.character(df$Gene.ID),
OrgDb = org.Hs.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 1,
readable = TRUE)
ego.cc <- enrichGO(gene = select[select$`response-non.response`==-1,]$Gene.ID,
universe = as.character(df$Gene.ID),
OrgDb = org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 1,
readable = TRUE)
ego.BP <- enrichGO(gene = select$Gene.ID,
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 0.05,
readable = TRUE)
dim(ego.BP@result[ego.BP@result$pvalue<=0.05,])
dim(ego.BP@result[ego.BP@result$qvalue<=0.05,])
# barplot(ego.BP, showCategory=50) + ggtitle("barplot of enriched BP terms")
# dotplot(ego.BP)
ego.MF <- enrichGO(gene = select$Gene.ID,
OrgDb = org.Hs.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 0.05,
readable = TRUE)
dim(ego.MF@result[ego.MF@result$pvalue<=0.05,])
dim(ego.MF@result[ego.MF@result$qvalue<=0.05,])
barplot(ego.MF, showCategory=50) + ggtitle("barplot of enriched MF terms")
ego.CC <- enrichGO(gene = select$Gene.ID,
OrgDb = org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 0.05,
readable = TRUE)
dim(ego.CC@result[ego.CC@result$pvalue<=0.05,])
dim(ego.CC@result[ego.CC@result$qvalue<=0.05,])
barplot(ego.CC, showCategory=50) + ggtitle("barplot of enriched CC terms")
genelist=select[, "P.Value"]
names(genelist)=select[, "Gene.ID"]
genelist = sort(genelist, decreasing = TRUE)
# gse.re=gseGO(geneList = genelist,ont = "ALL",OrgDb=org.Hs.eg.db,pvalueCutoff = 0.1)
# ridgeplot(gse.re)
gse.BP=gseGO(geneList = genelist,ont = "BP",OrgDb= org.Hs.eg.db,pvalueCutoff = 0.1)
ridgeplot(gse.BP)
# p1 <- gseaplot(gse.BP, geneSetID = 1, by = "runningScore", title = gse.BP$Description[1])
# p2 <- gseaplot(gse.BP, geneSetID = 1, by = "preranked", title = gse.BP$Description[1])
# p3 <- gseaplot(gse.BP, geneSetID = 1, title = gse.BP$Description[1])
gseaplot2(gse.BP, geneSetID = 1, title = gse.BP$Description[1])
gse.CC=gseGO(geneList = genelist,ont = "CC",OrgDb= org.Hs.eg.db,pvalueCutoff = 0.1)
# ridgeplot(gse.CC)
gse.MF=gseGO(geneList = genelist,ont = "MF",OrgDb= org.Hs.eg.db,pvalueCutoff = 0.1)
ridgeplot(gse.MF)
gseaplot2(gse.MF, geneSetID = 1, title = gse.MF$Description[1])
gseaplot2(gse.MF, geneSetID = 2, title = gse.MF$Description[2])
kk <- enrichKEGG(gene = select$Gene.ID,
organism = 'hsa',
pvalueCutoff = 0.1)
barplot(kk)+ ggtitle("barplot of enriched KEGG pathway")
kk2 <- gseKEGG(geneList = genelist,
organism = 'hsa',
pvalueCutoff = 0.1,
verbose = FALSE)
gseaplot2(kk2, geneSetID = 1, title = kk2$Description[1])
mkk <- enrichMKEGG(gene = select$Gene.ID,
organism = 'hsa',
pvalueCutoff = 0.1)
mkk2 <- gseMKEGG(geneList = genelist,
organism = 'hsa', pvalueCutoff = 0.1)
a=read.table("./function_annotation.txt",sep="\t",header = T)
ggplot(data=a, aes(x=Count, y=reorder(Term,-PValue),fill=PValue)) +
geom_bar(stat="identity") +
ggtitle("Function annotation") +
xlab("Gene count") +
ylab(NULL) +scale_fill_gradient(low="blue", high="red")
a=read.table("./up.txt",sep="\t",header = T)
ggplot(data=a, aes(x=Count, y=reorder(Term,-PValue),fill=PValue)) +
geom_bar(stat="identity") +
ggtitle("Function annotation") +
xlab("Gene count") +
ylab(NULL) +scale_fill_gradient(low="blue", high="red")+facet_grid(Category~.,scales="free",space="free")
a=read.table("./down.txt",sep="\t",header = T)
ggplot(data=a, aes(x=Count, y=reorder(Term,-PValue),fill=PValue)) +
geom_bar(stat="identity") +
ggtitle("Function annotation") +
xlab("Gene count") +
ylab(NULL) +scale_fill_gradient(low="blue", high="red")+
facet_grid(Category~.,scales="free",space="free")
Category Term Count PValue
KEGG hsa05221:Acute myeloid leukemia 4 0.002345308
GO_MF GO:0005524~ATP binding 13 0.013790123
GO_MF GO:0031782~type 4 melanocortin receptor binding 2 0.015782965
GO_MF GO:0031781~type 3 melanocortin receptor binding 2 0.015782965
GO_BP GO:0010626~negative regulation of Schwann cell proliferation 2 0.016336818
GO_BP GO:0009408~response to heat 3 0.016619629
GO_BP GO:0006091~generation of precursor metabolites and energy 3 0.020038205
GO_CC GO:0005829~cytosol 21 0.038827297
GO_BP GO:0090084~negative regulation of inclusion body assembly 2 0.040350065
GO_MF GO:0043225~anion transmembrane-transporting ATPase activity 2 0.042814823
GO_BP GO:0046718~viral entry into host cell 3 0.042820725
GO_MF GO:0016491~oxidoreductase activity 4 0.045166591
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment