mdozmorov · September 19, 2022 23:39 · kcoutinh · Feb 16, 2024
diff --git a/gist_mm39_excluderanges.R b/gist_mm39_excluderanges.R
 # Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
 # mouse genome assembly. The regions (High Signal and Low Mappability) were
 # defined by the Boyle-Lab/Blacklist software.
 # See https://github.com/dozmorovlab/excluderanges for more information.

 suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
 suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
 # BEDbase identifier used to build the download URL below
 bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
 # Local file name the download is written to
 fileNameOut <- "mm39.excluderanges.bed.gz"
 # BEDbase API URL of the BED file (despite its name, `token2` holds a URL,
 # not an authentication token)
 token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
 # Download the file, overwriting any previous copy. write_disk() stores the
 # response body on disk whatever the HTTP status, so stop on an error status
 # instead of letting an error page be parsed as BED data further down.
 stop_for_status(
   GET(url = token2, write_disk(fileNameOut, overwrite = TRUE)),
   task = "download the mm39 excluderanges BED file"
 )
 # Read the downloaded file as tab-separated values. col_names = FALSE treats
 # the first line as data; the six columns are parsed as character, double,
 # double, character, double, character.
 mm39.excluderanges <- readr::read_tsv(fileNameOut,
                                       col_names = FALSE,
                                       col_types = "cddcdc")
 # Assign column names depending on the number of columns: take as many
 # leading names as there are columns. seq_len() (unlike 1:n) yields an empty
 # index for a zero-column table.
 all_columns <- c("chr", "start", "end", "name", "score", "strand",
                  "signalValue", "pValue", "qValue", "peak")
 colnames(mm39.excluderanges) <- all_columns[seq_len(ncol(mm39.excluderanges))]
 # Convert to GRanges object; columns that are not genomic coordinates are
 # kept as metadata columns.
 # NOTE(review): BED starts are conventionally 0-based and
 # starts.in.df.are.0based is left at its default (FALSE) - confirm the
 # coordinates in this file are meant to be used as-is.
 mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
                                                keep.extra.columns = TRUE)
 # Chromosome names, sizes and circularity flags for the mm39 genome from UCSC
 # (assembled molecules only)
 chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
                                                  assembled.molecules.only = TRUE)
 # Chromosomes present both in the UCSC table and in mm39.excluderanges
 chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))
 # Restrict mm39.excluderanges to the common chromosomes; ranges on any other
 # sequence are removed
 mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
                                     pruning.mode = "tidy")
 # Keep and reorder the UCSC rows so that row i describes seqlevel i
 chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]
 # Assign seqinfo data
 seqlengths(mm39.excluderanges) <- chrom_data$size
 # `circular` is already a logical flag, so use it as-is and only map a missing
 # value to FALSE. The former ifelse(is.na(x), FALSE, TRUE) turned every
 # non-missing value, including FALSE, into TRUE and flagged all chromosomes
 # as circular.
 isCircular(mm39.excluderanges) <- chrom_data$circular %in% TRUE
 genome(mm39.excluderanges)     <- "mm39"

 # Print the resulting GRanges object
 mm39.excluderanges
	# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
	# mouse genome assembly. The regions (High Signal and Low Mappability) were
	# defined by the Boyle-Lab/Blacklist software.
	# See https://github.com/dozmorovlab/excluderanges for more information.

	suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
	suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
	# BEDbase identifier used to build the download URL below
	bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
	# Local file name the download is written to
	fileNameOut <- "mm39.excluderanges.bed.gz"
	# BEDbase API URL of the BED file (despite its name, `token2` holds a URL,
	# not an authentication token)
	token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
	# Download the file, overwriting any previous copy. write_disk() stores the
	# response body on disk whatever the HTTP status, so stop on an error status
	# instead of letting an error page be parsed as BED data further down.
	stop_for_status(
	  GET(url = token2, write_disk(fileNameOut, overwrite = TRUE)),
	  task = "download the mm39 excluderanges BED file"
	)
	# Read the downloaded file as tab-separated values. col_names = FALSE treats
	# the first line as data; the six columns are parsed as character, double,
	# double, character, double, character.
	mm39.excluderanges <- readr::read_tsv(fileNameOut,
	  col_names = FALSE,
	  col_types = "cddcdc")
	# Assign column names depending on the number of columns: take as many
	# leading names as there are columns. seq_len() (unlike 1:n) yields an empty
	# index for a zero-column table.
	all_columns <- c("chr", "start", "end", "name", "score", "strand",
	  "signalValue", "pValue", "qValue", "peak")
	colnames(mm39.excluderanges) <- all_columns[seq_len(ncol(mm39.excluderanges))]
	# Convert to GRanges object; columns that are not genomic coordinates are
	# kept as metadata columns.
	# NOTE(review): BED starts are conventionally 0-based and
	# starts.in.df.are.0based is left at its default (FALSE) - confirm the
	# coordinates in this file are meant to be used as-is.
	mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
	  keep.extra.columns = TRUE)
	# Chromosome names, sizes and circularity flags for the mm39 genome from UCSC
	# (assembled molecules only)
	chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
	  assembled.molecules.only = TRUE)
	# Chromosomes present both in the UCSC table and in mm39.excluderanges
	chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))
	# Restrict mm39.excluderanges to the common chromosomes; ranges on any other
	# sequence are removed
	mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
	  pruning.mode = "tidy")
	# Keep and reorder the UCSC rows so that row i describes seqlevel i
	chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]
	# Assign seqinfo data
	seqlengths(mm39.excluderanges) <- chrom_data$size
	# `circular` is already a logical flag, so use it as-is and only map a missing
	# value to FALSE. The former ifelse(is.na(x), FALSE, TRUE) turned every
	# non-missing value, including FALSE, into TRUE and flagged all chromosomes
	# as circular.
	isCircular(mm39.excluderanges) <- chrom_data$circular %in% TRUE
	genome(mm39.excluderanges) <- "mm39"

	# Print the resulting GRanges object
	mm39.excluderanges