jnpaulson · February 1, 2021 06:57
diff --git a/gtex_de.R b/gtex_de.R
 # Actually perform the differential expression analysis
 library(yarn)
 library(dplyr)
 library(readr)
 library(biomaRt)
 library(limma)
 source("voomweights.R")
 obj <- readRDS("~/Desktop/gtex_v6p_lung_blood_norm.rds")
 gender = pData(obj)$GENDER
 batch  = factor(as.character(pData(obj)$SMNABTCHT))
 tissue <- factor(pData(obj)$SMTSD,levels=c("Whole Blood","Lung"))
 design = model.matrix(~tissue+batch+gender)
 transformedCounts = assayData(obj)[["normalizedMatrix"]]

 voomOutput = voomWeightsCustomized(transformedCounts,design)
 fit = lmFit(voomOutput,design)
 fit = eBayes(fit)
 gl  = fData(obj)$hgnc_symbol
 tt  = topTable(fit,number=Inf,genelist=gl,coef="tissueLung")
 saveRDS(list(fit,tt),file="~/Desktop/results.rdata")
diff --git a/gtex_download.R b/gtex_download.R
 # Download and prepare the datafile
 library(yarn)
 library(dplyr)
 library(readr)
 library(biomaRt)
 library(limma)

 cnts <- suppressWarnings(read_delim("https://storage.googleapis.com/gtex_analysis_v6p/rna_seq_data/GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct.gz",delim='\t',skip=2))
 genes <- unlist(cnts[, 1])
 geneNames <- unlist(cnts[, 2])
 counts <- cnts[, -c(1:2)]
 counts <- as.matrix(counts)
 rownames(counts) <- genes
 throwAway <- which(rowSums(counts) == 0)
 counts <- counts[-throwAway, ]
 es <- ExpressionSet(as.matrix(counts))

 phenoFile <- "https://storage.googleapis.com/gtex_analysis_v6p/annotations/GTEx_Data_V6_Annotations_SampleAttributesDS.txt"
 pd <- suppressWarnings(read_tsv(phenoFile)) %>% as.data.frame
 rownames(pd) <- pd[, "SAMPID"]
 ids <- sapply(strsplit(pd[, "SAMPID"], "-"), function(i) paste(i[1:2], 
        collapse = "-"))
 pd <- cbind(pd,SUBJID = ids)
 pheno2File<- "https://storage.googleapis.com/gtex_analysis_v6p/annotations/GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt"
 pd2 <- suppressWarnings(read_tsv(pheno2File)) %>% as.data.frame

 pdfinal <- left_join(pd,pd2)
 pdfinal <- pdfinal[match(colnames(counts),pdfinal[["SAMPID"]]),]
 phenoData(es) <- AnnotatedDataFrame(pdfinal)


 genes <- sub("\\..*", "", rownames(counts))
 ensembl = useEnsembl(biomart="ensembl",GRCh=37, dataset="hsapiens_gene_ensembl")
 attributes <- c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", 
            "start_position", "end_position", "gene_biotype")
 esFinal <- annotateFromBiomart(obj = es, genes = genes,attributes=attributes)
 saveRDS(esFinal, file = "~/Desktop/gtex_v6p.rds")
diff --git a/gtex_normalize.R b/gtex_normalize.R
 # Filter to solely whole blood and lung samples 
 # We checked and this is the 'only' USEFRZ cohort
 library(yarn)
 library(dplyr)
 library(readr)
 library(biomaRt)
 library(limma)


 obj <- readRDS("~/Desktop/gtex_v6p.rds")
 obj <- filterSamples(esFinal,c("Whole Blood","Lung"),
 	groups='SMTSD',keepOnly=TRUE)
 obj = filterLowGenes(obj,"SMTSD")
 obj <- filterGenes(obj)
 ### Normalize using qsmooth
 obj = normalizeTissueAware(obj,"SMTSD")

 saveRDS(obj,file="~/Desktop/gtex_v6p_lung_blood_norm.rds")

diff --git a/voomweights.R b/voomweights.R
 # customized voom weights for already transformed values

 voomWeightsCustomized <- function(log2counts,design,lib.size = NULL,is.cpm = FALSE) {
  if (is.null(design)) {
    design <- model.matrix(~ 1 )
  }
  out<-list()

  if (is.cpm == TRUE) {
    fit <- lmFit(log2counts, design)
    xx <- fit$Amean + mean(log2(lib.size + 1)) - log2(1e+06)
    yy <- sqrt(fit$sigma)
    l <- lowess(xx, yy, f = .5)
    f <- approxfun(l, rule = 2)
    fitted.values <- fit$coef %*% t(fit$design)
    fitted.cpm <- 2^fitted.values
    fitted.count <- 1e-06 * t(t(fitted.cpm) * (lib.size + 1))
    fitted.logcount <- log2(fitted.count)
    w <- 1/f(fitted.logcount)^4
    dim(w) <- dim(fitted.logcount)
  }

  if (is.cpm == FALSE) {
    fit <- lmFit(log2counts, design)
    xx <- fit$Amean 
    yy <- sqrt(fit$sigma)
    l <- lowess(xx, yy, f = .5)
    f <- approxfun(l, rule = 2)
    fitted.values <- fit$coef %*% t(fit$design)
    fitted.logcount <- log2(fitted.values)
    w <- 1/f(fitted.logcount)^4
    dim(w) <- dim(fitted.logcount)
  }
  out$E = log2counts
  out$weights = w
  out$design = design
  new("EList",out)
 }
	# Actually perform the differential expression analysis
	library(yarn)
	library(dplyr)
	library(readr)
	library(biomaRt)
	library(limma)
	source("voomweights.R")
	obj <- readRDS("~/Desktop/gtex_v6p_lung_blood_norm.rds")
	gender = pData(obj)$GENDER
	batch = factor(as.character(pData(obj)$SMNABTCHT))
	tissue <- factor(pData(obj)$SMTSD,levels=c("Whole Blood","Lung"))
	design = model.matrix(~tissue+batch+gender)
	transformedCounts = assayData(obj)[["normalizedMatrix"]]

	voomOutput = voomWeightsCustomized(transformedCounts,design)
	fit = lmFit(voomOutput,design)
	fit = eBayes(fit)
	gl = fData(obj)$hgnc_symbol
	tt = topTable(fit,number=Inf,genelist=gl,coef="tissueLung")
	saveRDS(list(fit,tt),file="~/Desktop/results.rdata")
	# Download and prepare the datafile
	library(yarn)
	library(dplyr)
	library(readr)
	library(biomaRt)
	library(limma)

	cnts <- suppressWarnings(read_delim("https://storage.googleapis.com/gtex_analysis_v6p/rna_seq_data/GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct.gz",delim='\t',skip=2))
	genes <- unlist(cnts[, 1])
	geneNames <- unlist(cnts[, 2])
	counts <- cnts[, -c(1:2)]
	counts <- as.matrix(counts)
	rownames(counts) <- genes
	throwAway <- which(rowSums(counts) == 0)
	counts <- counts[-throwAway, ]
	es <- ExpressionSet(as.matrix(counts))

	phenoFile <- "https://storage.googleapis.com/gtex_analysis_v6p/annotations/GTEx_Data_V6_Annotations_SampleAttributesDS.txt"
	pd <- suppressWarnings(read_tsv(phenoFile)) %>% as.data.frame
	rownames(pd) <- pd[, "SAMPID"]
	ids <- sapply(strsplit(pd[, "SAMPID"], "-"), function(i) paste(i[1:2],
	collapse = "-"))
	pd <- cbind(pd,SUBJID = ids)
	pheno2File<- "https://storage.googleapis.com/gtex_analysis_v6p/annotations/GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt"
	pd2 <- suppressWarnings(read_tsv(pheno2File)) %>% as.data.frame

	pdfinal <- left_join(pd,pd2)
	pdfinal <- pdfinal[match(colnames(counts),pdfinal[["SAMPID"]]),]
	phenoData(es) <- AnnotatedDataFrame(pdfinal)


	genes <- sub("\\..*", "", rownames(counts))
	ensembl = useEnsembl(biomart="ensembl",GRCh=37, dataset="hsapiens_gene_ensembl")
	attributes <- c("ensembl_gene_id", "hgnc_symbol", "chromosome_name",
	"start_position", "end_position", "gene_biotype")
	esFinal <- annotateFromBiomart(obj = es, genes = genes,attributes=attributes)
	saveRDS(esFinal, file = "~/Desktop/gtex_v6p.rds")
	# Filter to solely whole blood and lung samples
	# We checked and this is the 'only' USEFRZ cohort
	library(yarn)
	library(dplyr)
	library(readr)
	library(biomaRt)
	library(limma)


	obj <- readRDS("~/Desktop/gtex_v6p.rds")
	obj <- filterSamples(esFinal,c("Whole Blood","Lung"),
	groups='SMTSD',keepOnly=TRUE)
	obj = filterLowGenes(obj,"SMTSD")
	obj <- filterGenes(obj)
	### Normalize using qsmooth
	obj = normalizeTissueAware(obj,"SMTSD")

	saveRDS(obj,file="~/Desktop/gtex_v6p_lung_blood_norm.rds")
	# customized voom weights for already transformed values

	voomWeightsCustomized <- function(log2counts,design,lib.size = NULL,is.cpm = FALSE) {
	if (is.null(design)) {
	design <- model.matrix(~ 1 )
	}
	out<-list()

	if (is.cpm == TRUE) {
	fit <- lmFit(log2counts, design)
	xx <- fit$Amean + mean(log2(lib.size + 1)) - log2(1e+06)
	yy <- sqrt(fit$sigma)
	l <- lowess(xx, yy, f = .5)
	f <- approxfun(l, rule = 2)
	fitted.values <- fit$coef %*% t(fit$design)
	fitted.cpm <- 2^fitted.values
	fitted.count <- 1e-06 * t(t(fitted.cpm) * (lib.size + 1))
	fitted.logcount <- log2(fitted.count)
	w <- 1/f(fitted.logcount)^4
	dim(w) <- dim(fitted.logcount)
	}

	if (is.cpm == FALSE) {
	fit <- lmFit(log2counts, design)
	xx <- fit$Amean
	yy <- sqrt(fit$sigma)
	l <- lowess(xx, yy, f = .5)
	f <- approxfun(l, rule = 2)
	fitted.values <- fit$coef %*% t(fit$design)
	fitted.logcount <- log2(fitted.values)
	w <- 1/f(fitted.logcount)^4
	dim(w) <- dim(fitted.logcount)
	}
	out$E = log2counts
	out$weights = w
	out$design = design
	new("EList",out)
	}