Last active
November 13, 2020 10:58
-
-
Save alexllc/b2d362ec3e2411baacb070c19df14130 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' filter_replicate_samples
#'
#' Function to filter technical replicates in TCGA samples
#' Doc: https://gist.github.com/alexllc/8dcd229ed3ad7f069e92dc30d5eac83a
#'
#' @source \url{http://gdac.broadinstitute.org/runs/stddata__2014_01_15/samples_report/READ_Replicate_Samples.html}
#' @param bcr character vector of TCGA aliquot barcodes; must be all RNA or all DNA aliquots
#' @param verbose if TRUE, print out which barcodes are kept and which ones are filtered
#'
#' @return the subset of `bcr` that is kept (one barcode per TSS/patient/sample combination)
#'
#' @examples
#' download.file("http://gdac.broadinstitute.org/runs/stddata__2014_01_15/samples_report/filteredSamples.2014_01_15__00_00_11.txt", "samples_filter.txt")
#'
#' test = read.table("samples_filter.txt", sep = "\t", header = TRUE)
#' # NOTE(review): `test_bcr` was used before being defined in the original example.
#' # The removed-aliquot column is assumed to be read in as `Removed.Samples` -- confirm against the file header.
#' test_bcr = as.character(test$Removed.Samples)
#' commas_bcr = unlist(strsplit(test_bcr[grepl(",", test_bcr)], ","))
#' test_bcr = c(test_bcr[!grepl(",", test_bcr)], commas_bcr, test$Chosen.Sample)
#' # Analyte codes R/H/T are matched as RNA and D/G/W/X as DNA (20th character of the barcode).
#' test_rna = test_bcr[grepl("^.{19}[RHT]", test_bcr)]
#' test_dna = test_bcr[grepl("^.{19}[DGWX]", test_bcr)]
#' filter_test_rna = filter_replicate_samples(test_rna, verbose = FALSE)
#' filter_test_dna = filter_replicate_samples(test_dna, verbose = FALSE)
#' if (all(filter_test_dna %in% test$Chosen.Sample) && all(filter_test_rna %in% test$Chosen.Sample)) {
#'   message("Filter passed.")
#' } else {
#'   message("Filter failed.")
#' }
#'
library(dplyr) | |
filter_replicate_samples <- function(bcr, verbose = TRUE) { | |
if ( all(grepl(".{19}[RHT]", bcr)) ) { | |
type = "RNA" | |
} else if (all (grepl(".{19}[DGWX]", bcr))) { | |
type = "DNA" | |
} else { | |
stop("Mixing RNA and DNA samples not allowed.") | |
} | |
bcr_df = cbind(separate(as.data.frame(bcr), | |
"bcr", | |
c("project", "TSS", "patient", "sample,vial", "portion,analyte", "plate", "center"), # skip var with NAs | |
sep = "-"), | |
bcr) | |
bcr_df = separate(bcr_df, col = "sample,vial", into = c("sample", "vial"), sep = 2) | |
bcr_df = separate(bcr_df, col = "portion,analyte", into = c("portion", "analyte"), sep = 2) | |
# Selecting one aliquot based on GDC's Analyte Replicate Filter and Sort Replicate Filter rules | |
bcr_df = bcr_df %>% arrange(analyte, desc(plate), desc(bcr)) %>% group_by(TSS, patient, sample) %>% slice(1) | |
out_tbl = data.frame() | |
dup_samples = unique(substr(bcr[!(bcr %in% bcr_df$bcr)], 1, 15)) | |
for (dupbcr in dup_samples) { | |
kept = as.character(bcr_df[grep(dupbcr, bcr_df$bcr),10]) | |
removed = bcr[grep(dupbcr, bcr)] | |
removed = paste(removed[!(removed %in% kept)], collapse = ",") | |
out_tbl = rbind(out_tbl, c(kept, removed)) | |
} | |
colnames(out_tbl) = c("chosen", "removed") | |
if (verbose) { | |
print("The following changes are made: " ) | |
print(out_tbl) | |
} | |
return(bcr_df$bcr) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment