Skip to content

Instantly share code, notes, and snippets.

@alexllc
Last active November 13, 2020 10:58
Show Gist options
  • Save alexllc/b2d362ec3e2411baacb070c19df14130 to your computer and use it in GitHub Desktop.
Save alexllc/b2d362ec3e2411baacb070c19df14130 to your computer and use it in GitHub Desktop.
#' filter_replicate_samples
#'
#' Function to filter technical replicates in TCGA samples
#' Doc: https://gist.github.com/alexllc/8dcd229ed3ad7f069e92dc30d5eac83a
#'
#' @source \url{http://gdac.broadinstitute.org/runs/stddata__2014_01_15/samples_report/READ_Replicate_Samples.html}
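#' @param bcr character vector of TCGA aliquot barcodes; all must be RNA analytes (R/H/T) or all must be DNA analytes (D/G/W/X)
#' @param verbose if TRUE, print a table of which barcodes were chosen and which replicates were removed
#'
#' @return character vector with one chosen barcode per (TSS, patient, sample) group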
#'
#' @examples
#' download.file("http://gdac.broadinstitute.org/runs/stddata__2014_01_15/samples_report/filteredSamples.2014_01_15__00_00_11.txt", "samples_filter.txt")
#'
#' test = read.table("samples_filter.txt", sep = "\t", header = TRUE)
#' # NOTE(review): test_bcr was originally used before being defined; the
#' # removed-samples column name below is assumed -- confirm against the file.
#' test_bcr = as.character(test$Removed.Samples)
#' commas_bcr = unlist(strsplit(test_bcr[grepl(",", test_bcr)], ","))
#' test_bcr = c(test_bcr[!grepl(",", test_bcr)], commas_bcr, test$Chosen.Sample)
#' test_rna = test_bcr[grepl("^.{19}[RHT]", test_bcr)]
#' test_dna = test_bcr[grepl("^.{19}[DGWX]", test_bcr)]
#' filter_test_dna = filter_replicate_samples(test_dna, verbose = FALSE)
#' filter_test_rna = filter_replicate_samples(test_rna, verbose = FALSE)
#' if (all(filter_test_dna %in% test$Chosen.Sample) && all(filter_test_rna %in% test$Chosen.Sample)) {
#'   message("Filter passed.")
#' } else {
#'   message("Filter failed.")
#' }
#'
library(dplyr)
filter_replicate_samples <- function(bcr, verbose = TRUE) {
  # Keep one aliquot barcode per (TSS, patient, sample) group.
  #
  # Args:
  #   bcr:     character vector of aliquot barcodes whose fields are separated
  #            by "-" and whose 20th character is the analyte code. All
  #            barcodes must be RNA (R/H/T) or all must be DNA (D/G/W/X).
  #   verbose: if TRUE, print a table with the chosen barcode and the
  #            comma-separated removed barcodes for every group that had
  #            replicates.
  #
  # Returns:
  #   Character vector of the chosen barcodes, ordered by TSS, patient, sample.
  #
  # Raises:
  #   An error when the barcodes are not uniformly RNA or uniformly DNA.
  #
  # Implemented with base R only, so it does not rely on tidyr/dplyr being
  # attached by the caller.
  bcr <- as.character(bcr)
  if (length(bcr) == 0L) {
    return(character(0))
  }

  # Read the analyte code from its fixed position rather than with an
  # unanchored regex: letters inside the plate field (e.g. "A00D") must not
  # be mistaken for the analyte when checking for mixed input.
  analyte_code <- substr(bcr, 20L, 20L)
  all_rna <- all(analyte_code %in% c("R", "H", "T"))
  all_dna <- all(analyte_code %in% c("D", "G", "W", "X"))
  if (!all_rna && !all_dna) {
    stop("Mixing RNA and DNA samples not allowed.")
  }

  # Split every barcode into its "-" separated fields:
  # project, TSS, patient, sample+vial, portion+analyte, plate, center.
  parts <- strsplit(bcr, "-", fixed = TRUE)
  nth_field <- function(i) {
    vapply(parts, function(p) p[i], character(1))
  }
  tss <- nth_field(2L)
  patient <- nth_field(3L)
  sample_code <- substr(nth_field(4L), 1L, 2L) # first two chars; rest is vial
  analyte <- substring(nth_field(5L), 3L)      # chars after the 2-char portion
  plate <- nth_field(6L)

  # Preference order within a group: analyte ascending, then the highest
  # plate, then the highest barcode in lexicographic order.
  # NOTE(review): the published GDAC rule reportedly lets a G/W/X aliquot with
  # a higher plate number beat a D aliquot; this ordering always prefers D,
  # as the previous implementation did -- confirm that this is intended.
  by_preference <- order(
    analyte, plate, bcr,
    decreasing = c(FALSE, TRUE, TRUE),
    method = "radix"
  )
  group_key <- paste(tss, patient, sample_code, sep = "-")
  winners <- by_preference[!duplicated(group_key[by_preference])]

  # Report the winners sorted by their grouping fields.
  winners <- winners[
    order(tss[winners], patient[winners], sample_code[winners],
          method = "radix")
  ]
  chosen <- bcr[winners]

  # Build the chosen/removed report in one pass. With no replicates this is a
  # zero-row table instead of an error.
  dropped <- !(bcr %in% chosen)
  dropped_keys <- unique(group_key[dropped])
  removed <- vapply(
    split(bcr[dropped], factor(group_key[dropped], levels = dropped_keys)),
    paste,
    character(1),
    collapse = ","
  )
  out_tbl <- data.frame(
    chosen = chosen[match(dropped_keys, group_key[winners])],
    removed = unname(removed),
    stringsAsFactors = FALSE
  )

  if (verbose) {
    print("The following changes are made: ")
    print(out_tbl)
  }
  chosen
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment