Created
January 24, 2019 13:30
-
-
Save sgibb/54c1477d26fa8702231c447badd11776 to your computer and use it in GitHub Desktop.
MSnbase3 - multiple MSnExp backends
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("mzR") | |
library("msdata") | |
library("rhdf5") | |
library("digest") | |
.sha1 <- function(x)vapply(x, digest::sha1, NA_character_, USE.NAMES=FALSE) | |
setClass( | |
"Backend", | |
contains="VIRTUAL" | |
) | |
setMethod( | |
"show", | |
signature="Backend", | |
definition=function(object) { | |
cat("Backend:", gsub("Backend", "", class(object)[1L]), "\n") | |
} | |
) | |
setGeneric( | |
"backendSetup", | |
def=function(object, files) standardGeneric("backendSetup"), | |
valueClass="Backend" | |
) | |
setMethod( | |
"backendSetup", | |
signature="Backend", | |
definition=function(object, files) { | |
object | |
} | |
) | |
setGeneric( | |
"backendPrefill", | |
def=function(object, files) standardGeneric("backendPrefill"), | |
valueClass="Backend" | |
) | |
setMethod( | |
"backendPrefill", | |
signature="Backend", | |
definition=function(object, files) { | |
object | |
} | |
) | |
setGeneric( | |
"backendReadSpectra", | |
def=function(object, files, ids) standardGeneric("backendReadSpectra"), | |
valueClass="list" | |
) | |
setGeneric( | |
"backendWriteSpectra", | |
def=function(object, files, ids, spectra) standardGeneric("backendWriteSpectra"), | |
valueClass="Backend" | |
) | |
setClass( | |
"BackendMemory", | |
contains="Backend", | |
slots=c( | |
assay="list" # or environment | |
) | |
) | |
BackendMemory <- function() { new("BackendMemory") } | |
setMethod( | |
"backendSetup", | |
signature="BackendMemory", | |
definition=function(object, files) { | |
mzMl <- openMSfile(files) | |
on.exit(close(mzMl)) | |
object@assay <- vector(mode="list", length=length(mzMl)) | |
names(object@assay) <- paste(.sha1(files), seq_along(mzMl), sep="/") | |
object | |
}) | |
setMethod( | |
"backendPrefill", | |
signature="BackendMemory", | |
definition=function(object, files) { | |
object <- backendSetup(object, files) | |
mzMl <- openMSfile(files) | |
on.exit(close(mzMl)) | |
pks <- peaks(mzMl) | |
backendWriteSpectra(object, files, seq_along(pks), pks) | |
}) | |
setMethod( | |
"backendReadSpectra", | |
signature="BackendMemory", | |
definition=function(object, files, ids) { | |
nms <- paste(.sha1(files), ids, sep="/") | |
object@assay[nms] | |
}) | |
setMethod( | |
"backendWriteSpectra", | |
signature="BackendMemory", | |
definition=function(object, files, ids, spectra) { | |
nms <- paste(.sha1(files), ids, sep="/") | |
object@assay[nms] <- spectra | |
object | |
}) | |
setClass( | |
"BackendMzMl", | |
contains="Backend" | |
) | |
BackendMzMl <- function() { new("BackendMzMl") } | |
setMethod( | |
"backendReadSpectra", | |
signature="BackendMzMl", | |
definition=function(object, files, ids) { | |
mzMl <- openMSfile(files) | |
on.exit(close(mzMl)) | |
spectra <- peaks(mzMl, scan=ids) | |
names(spectra) <- paste(.sha1(files), ids, sep="/") | |
spectra | |
}) | |
setMethod( | |
"backendWriteSpectra", | |
signature="BackendMzMl", | |
definition=function(object, files, ids, spectra) { | |
stop("Not implemented yet") | |
}) | |
setClass( | |
"BackendHdf5", | |
contains="Backend", | |
slots=c( | |
path="character", | |
h5files="character" | |
) | |
) | |
BackendHdf5 <- function(path=".") { new("BackendHdf5", path=path) } | |
setMethod( | |
"backendSetup", | |
signature="BackendHdf5", | |
definition=function(object, files) { | |
object@h5files <- file.path(object@path, paste0(.sha1(files), ".h5")) | |
names(object@h5files) <- files | |
h5 <- H5Fcreate(object@h5files[files]) | |
h5createGroup(h5, .sha1(files)) | |
on.exit(H5Fclose(h5)) | |
object | |
}) | |
setMethod( | |
"backendPrefill", | |
signature="BackendHdf5", | |
definition=function(object, files) { | |
object <- backendSetup(object, files) | |
mzMl <- openMSfile(files) | |
on.exit(close(mzMl)) | |
pks <- peaks(mzMl) | |
backendWriteSpectra(object, files, seq_along(pks), pks) | |
}) | |
setMethod( | |
"backendReadSpectra", | |
signature="BackendHdf5", | |
definition=function(object, files, ids) { | |
nms <- paste(.sha1(files), ids, sep="/") | |
h5 <- H5Fopen(object@h5files[files]) | |
on.exit(H5Fclose(h5)) | |
spectra <- lapply(nms, h5read, file=h5) | |
names(spectra) <- nms | |
spectra | |
}) | |
setMethod( | |
"backendWriteSpectra", | |
signature="BackendHdf5", | |
definition=function(object, files, ids, spectra) { | |
group <- .sha1(files) | |
h5 <- H5Fopen(object@h5files[files]) | |
h5g <- H5Gopen(h5, group) | |
on.exit(H5Gclose(h5g)) | |
on.exit(H5Fclose(h5), add=TRUE) | |
for (i in seq_along(spectra)) { | |
h5write(spectra[[i]], h5g, as.character(i)) | |
} | |
object | |
}) | |
setClass( | |
"MSnExperiment", | |
slots=c( | |
backend="Backend", | |
featureData="data.frame", | |
files="character" | |
) | |
) | |
setMethod( | |
"show", | |
signature="MSnExperiment", | |
definition=function(object) { | |
cat("MSnExperiment from", basename(object@files), "\n") | |
show(object@backend) | |
} | |
) | |
setGeneric( | |
"spectra", | |
def=function(object, files, ids) standardGeneric("spectra"), | |
valueClass="list" | |
) | |
setMethod( | |
"spectra", | |
signature="MSnExperiment", | |
definition=function(object, files, ids) { | |
if (missing(files)) { | |
files <- object@files | |
} | |
if (missing(ids)) { | |
ids <- object@featureData$spectrumIndex | |
} | |
backendReadSpectra(object@backend, files=files, ids=ids) | |
} | |
) | |
setGeneric( | |
"spectra<-", | |
def=function(object, files, ids, value) standardGeneric("spectra<-"), | |
valueClass="MSnExperiment" | |
) | |
setReplaceMethod( | |
"spectra", | |
signature="MSnExperiment", | |
definition=function(object, files, ids, value) { | |
if (missing(files)) { | |
files <- object@files | |
} | |
if (missing(ids)) { | |
ids <- object@featureData$spectrumIndex | |
} | |
object@backend <- backendWriteSpectra( | |
object@backend, | |
files=files, | |
ids=ids, | |
spectra=value | |
) | |
object | |
} | |
) | |
setGeneric( | |
"switchBackend", | |
def=function(object, to) standardGeneric("switchBackend"), | |
valueClass="MSnExperiment" | |
) | |
setMethod( | |
"switchBackend", | |
signature="MSnExperiment", | |
definition=function(object, to=BackendMemory()) { | |
to <- backendSetup(to, object@files) | |
files <- object@files | |
ids <- object@featureData$spectrumIndex | |
from <- object@backend | |
object@backend <- backendWriteSpectra( | |
to, | |
files=files, | |
ids=spectrumIndex, | |
spectra=backendReadSpectra(from, files=files, ids=ids) | |
) | |
object | |
} | |
) | |
readMSData <- function(files, backend=BackendMemory()) { | |
fh <- openMSfile(files) | |
fd <- header(fh) | |
fd$spectrumIndex <- seq_len(nrow(fd)) | |
close(fh) | |
new("MSnExperiment", | |
backend=backendPrefill(backend, files), | |
files=files, | |
featureData=fd | |
) | |
} | |
ms1 <- readMSData(msdata::proteomics("01.mzML.gz", full.names=TRUE)) | |
ms2 <- readMSData(msdata::proteomics("01.mzML.gz", full.names=TRUE), backend=BackendMzMl()) | |
ms3 <- readMSData(msdata::proteomics("01.mzML.gz", full.names=TRUE), backend=BackendHdf5(path="..")) | |
all.equal(spectra(ms1, ids=1:3), spectra(ms2, ids=1:3)) | |
# [1] TRUE | |
all.equal(spectra(ms2, ids=1:3), spectra(ms3, ids=1:3)) | |
# [1] TRUE | |
spectra(ms1, ids=1:2) <- list(cbind(1:3, 4:6), cbind(11:13, 14:16)) | |
spectra(ms1, ids=3:509) <- NULL | |
ms1@featureData <- ms1@featureData[1:2,] | |
spectra(ms1) | |
# $`7ca27022eb61d649783dda21812088293d35f3a6/1` | |
# [,1] [,2] | |
# [1,] 1 4 | |
# [2,] 2 5 | |
# [3,] 3 6 | |
# | |
# $`7ca27022eb61d649783dda21812088293d35f3a6/2` | |
# [,1] [,2] | |
# [1,] 11 14 | |
# [2,] 12 15 | |
# [3,] 13 16 | |
# | |
ms4 <- switchBackend(ms1, BackendHdf5(path=".")) | |
all.equal(spectra(ms1), spectra(ms4)) | |
# [1] TRUE |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice indeed. Some suggestions though:
backendInitialize
(reads data from input and usesbackendUpdate
: writes changes to files (hdf5 backend), updates theSpectrumList
(in-mem backend), does nothing (mzR backend).backendSpectrapply
: same functionality thanspectrapply,OnDiskMSnExp
(or betterspectrapply,Hdf5MSnExp
): gets all dataSpectrum
objects as they are (eventually just remove the extensions of theVersioned
class - IMHO that's just some legacy class that not even Bioconductor developers use anymore. Also, I would lighten thevalidObject
method, eventually implemeting a downstream functionvalidate<class name>
that has an additional argument to switch between a thourough check and a fast check (the latter used forvalidObject
.spectra
method returning just a list of matrices. For theOnDiskMSnExp
-alikemzR
backend this would be too slow, because I have to first load the data, create the spectra, apply the lazy evaluation queue and would then have to extract again the data as a matrix from the spectra... not ideal I think. I would by default return aSpectrumList
.OnDiskMSnExp
has and write things to files isbackupUpdate
is called.Why a
backendSpectrapply
? For large experiments I don't want to read all data into memory, so I need to have a function that reads spectra per file and applies a function (such as extract chromatogram, base peak intensity etc). I found this function really convenient forOnDiskMSnExp
- we could use that as prototype.Also, why not create a new branch on
MSnbase
? I would be more than happy if I could contribute here (especially add some of the hdf5 and the mzR backend functionality).