## ----install, eval=FALSE------------------------------------------------------
# if (!require("BiocManager", quietly=TRUE))
# install.packages("BiocManager")
#
# pkgs <- c("HDF5Array", "ExperimentHub", "DelayedMatrixStats", "RSpectra")
# BiocManager::install(pkgs)
## ----load, message=FALSE------------------------------------------------------
library(HDF5Array)
library(ExperimentHub)
library(DelayedMatrixStats)
library(RSpectra)
## ----source_make_timings_table_R, echo=FALSE, results='hide'------------------
## Needed for the make_timings_table() function.
path <- system.file(package="HDF5Array",
"scripts", "make_timings_table.R", mustWork=TRUE)
source(path, verbose=FALSE)
## ----ExperimentHub------------------------------------------------------------
hub <- ExperimentHub()
hub["EH1039"]$description # sparse representation
hub["EH1040"]$description # dense representation
## ----get_EH1039_and_EH1040, message=FALSE-------------------------------------
## Note that this will be quick if the HDF5 files are already in the
## local ExperimentHub cache. Otherwise, it will take a while!
brain_s_path <- hub[["EH1039"]]
brain_D_path <- hub[["EH1040"]]
## ----brain_s------------------------------------------------------------------
## Use 'h5ls(brain_s_path)' to find out the group.
brain_s <- TENxMatrix(brain_s_path, group="mm10")
## ----brain_s_class_and_dim----------------------------------------------------
class(brain_s)
dim(brain_s)
is_sparse(brain_s)
## ----brain_D------------------------------------------------------------------
## Use 'h5ls(brain_D_path)' to find out the name of the dataset.
brain_D <- HDF5Array(brain_D_path, name="counts")
## ----brain_D_class_and_dim----------------------------------------------------
class(brain_D)
dim(brain_D)
chunkdim(brain_D)
is_sparse(brain_D)
## ----brain_Ds-----------------------------------------------------------------
brain_Ds <- HDF5Array(brain_D_path, name="counts", as.sparse=TRUE)
## ----brain_Ds_class_and_dim---------------------------------------------------
class(brain_Ds)
dim(brain_Ds)
chunkdim(brain_Ds)
is_sparse(brain_Ds)
## ----set_brain_D_and_brain_Ds_dimnames----------------------------------------
dimnames(brain_Ds) <- dimnames(brain_D) <- dimnames(brain_s)
## ----create_test_datasets-----------------------------------------------------
brain1_s <- brain_s[ , 1:12500]
brain1_D <- brain_D[ , 1:12500]
brain1_Ds <- brain_Ds[ , 1:12500]
brain2_s <- brain_s[ , 1:25000]
brain2_D <- brain_D[ , 1:25000]
brain2_Ds <- brain_Ds[ , 1:25000]
brain3_s <- brain_s[ , 1:50000]
brain3_D <- brain_D[ , 1:50000]
brain3_Ds <- brain_Ds[ , 1:50000]
brain4_s <- brain_s[ , 1:100000]
brain4_D <- brain_D[ , 1:100000]
brain4_Ds <- brain_Ds[ , 1:100000]
brain5_s <- brain_s[ , 1:200000]
brain5_D <- brain_D[ , 1:200000]
brain5_Ds <- brain_Ds[ , 1:200000]
## ----simple_normalize_function------------------------------------------------
## Also selects the most variable genes (1000 by default).
simple_normalize <- function(mat, num_var_genes=1000)
{
stopifnot(length(dim(mat)) == 2, !is.null(rownames(mat)))
mat <- mat[rowSums(mat) > 0, ]
col_sums <- colSums(mat) / 10000
mat <- t(t(mat) / col_sums)
row_vars <- rowVars(mat)
row_vars_order <- order(row_vars, decreasing=TRUE)
variable_idx <- head(row_vars_order, n=num_var_genes)
mat <- log1p(mat[variable_idx, ])
mat / rowSds(mat)
}
## ----simple_PCA_function------------------------------------------------------
simple_PCA <- function(mat, k=25)
{
stopifnot(length(dim(mat)) == 2)
row_means <- rowMeans(mat)
Ax <- function(x, args)
(as.numeric(mat %*% x) - row_means * sum(x))
Atx <- function(x, args)
(as.numeric(x %*% mat) - as.vector(row_means %*% x))
RSpectra::svds(Ax, Atrans=Atx, k=k, dim=dim(mat))
}
## ----norm_brain1_s------------------------------------------------------------
dim(brain1_s)
DelayedArray::setAutoBlockSize(250e6) # set block size to 250 Mb
system.time(norm_brain1_s <- simple_normalize(brain1_s))
dim(norm_brain1_s)
## ----norm_brain1_D------------------------------------------------------------
dim(brain1_D)
DelayedArray::setAutoBlockSize(16e6) # set block size to 16 Mb
system.time(norm_brain1_D <- simple_normalize(brain1_D))
dim(norm_brain1_D)
## ----norm_brain1_Ds-----------------------------------------------------------
dim(brain1_Ds)
DelayedArray::setAutoBlockSize(250e6) # set block size to 250 Mb
system.time(norm_brain1_Ds <- simple_normalize(brain1_Ds))
dim(norm_brain1_Ds)
## ----show_norm_brain1_s_delayed_ops-------------------------------------------
class(norm_brain1_s)
showtree(norm_brain1_s)
## ----set_realization_block_size-----------------------------------------------
DelayedArray::setAutoBlockSize(1e8)
## ----writeTENxMatrix_norm_brain1_s--------------------------------------------
dim(norm_brain1_s)
system.time(norm_brain1_s <- writeTENxMatrix(norm_brain1_s, level=0))
## ----show_pristine_norm_brain1_s----------------------------------------------
class(norm_brain1_s)
showtree(norm_brain1_s) # "pristine" object (i.e. no more delayed operations)
## ----writeHDF5Array_norm_brain1_D---------------------------------------------
dim(norm_brain1_D)
system.time(norm_brain1_D <- writeHDF5Array(norm_brain1_D, level=0))
## ----show_pristine_norm_brain1_D----------------------------------------------
class(norm_brain1_D)
showtree(norm_brain1_D) # "pristine" object (i.e. no more delayed operations)
## ----writeHDF5Array_norm_brain1_Ds--------------------------------------------
dim(norm_brain1_Ds)
system.time(norm_brain1_Ds <- writeHDF5Array(norm_brain1_Ds, level=0))
## ----show_pristine_norm_brain1_Ds---------------------------------------------
class(norm_brain1_Ds)
showtree(norm_brain1_Ds) # "pristine" object (i.e. no more delayed operations)
## ----PCA_norm_brain1_s--------------------------------------------------------
DelayedArray::setAutoBlockSize(40e6) # set block size to 40 Mb
dim(norm_brain1_s)
system.time(pca1s <- simple_PCA(norm_brain1_s))
## ----PCA_norm_brain1_D--------------------------------------------------------
DelayedArray::setAutoBlockSize(1e8) # set block size to 100 Mb
dim(norm_brain1_D)
system.time(pca1D <- simple_PCA(norm_brain1_D))
## ----sanity_check1d-----------------------------------------------------------
stopifnot(all.equal(pca1D, pca1s))
## ----PCA_norm_brain1_Ds-------------------------------------------------------
DelayedArray::setAutoBlockSize(40e6) # set block size to 40 Mb
dim(norm_brain1_Ds)
system.time(pca1Ds <- simple_PCA(norm_brain1_Ds))
## ----sanity_check1ds----------------------------------------------------------
stopifnot(all.equal(pca1Ds, pca1s))
## ----xps15_specs, echo=FALSE, results='asis'----------------------------------
hdparm1 <- "Output of sudo hdparm -Tt <device>
:"
hdparm1 <- sprintf("%s", hdparm1)
hdparm2 <- c(
"Timing cached reads: 35188 MB in 2.00 seconds = 17620.75 MB/sec",
"Timing buffered disk reads: 7842 MB in 3.00 seconds = 2613.57 MB/sec"
)
hdparm2 <- sprintf("%s
", paste(hdparm2, collapse="
"))
disk_perf <- paste0(hdparm1, "
", hdparm2)
make_machine_specs_table("Specs for DELL XPS 15 laptop (model 9520)",
specs=c(`OS`="Linux Ubuntu 24.04",
`RAM`="32GB",
`Disk`="1TB SSD"),
disk_perf=disk_perf)
## ----xps15_timings, echo=FALSE, results='asis'--------------------------------
caption <- "Table 1: Timings for DELL XPS 15 laptop"
make_timings_table("xps15", caption=caption)
## ----rex3_specs, echo=FALSE, results='asis'-----------------------------------
hdparm1 <- "Output of sudo hdparm -Tt <device>
:"
hdparm1 <- sprintf("%s", hdparm1)
hdparm2 <- c(
"Timing cached reads: 20592 MB in 1.99 seconds = 10361.41 MB/sec",
"Timing buffered disk reads: 1440 MB in 3.00 seconds = 479.66 MB/sec"
)
hdparm2 <- sprintf("%s
", paste(hdparm2, collapse="
"))
disk_perf <- paste0(hdparm1, "
", hdparm2)
make_machine_specs_table("Specs for Supermicro SuperServer 1029GQ-TRT",
specs=c(`OS`="Linux Ubuntu 22.04",
`RAM`="384GB",
`Disk`="1.3TB ATA Disk"),
disk_perf=disk_perf)
## ----rex3_timings, echo=FALSE, results='asis'---------------------------------
caption <- "Table 2: Timings for Supermicro SuperServer 1029GQ-TRT"
make_timings_table("rex3", caption=caption)
## ----kjohnson3_specs, echo=FALSE, results='asis'------------------------------
make_machine_specs_table("Specs for Apple Silicon Mac Pro (Apple M2 Ultra)",
specs=c(`OS`="macOS 13.7.1",
`RAM`="192GB",
`Disk`="2TB SSD"),
disk_perf="N/A")
## ----kjohnson3_timings, echo=FALSE, results='asis'----------------------------
caption <- "Table 3: Timings for Apple Silicon Mac Pro"
make_timings_table("kjohnson3", caption=caption)
## ----lconway_specs, echo=FALSE, results='asis'--------------------------------
make_machine_specs_table("Specs for Intel Mac Pro (24-Core Intel Xeon W)",
specs=c(`OS`="macOS 12.7.6",
`RAM`="96GB",
`Disk`="1TB SSD"),
disk_perf="N/A")
## ----lconway_timings, echo=FALSE, results='asis'------------------------------
caption <- "Table 4: Timings for Intel Mac Pro"
make_timings_table("lconway", caption=caption)
## ----summarize_machine_times, echo=FALSE, results='asis'----------------------
machine_names <- c(
`DELL XPS 15 laptop`="xps15",
`Supermicro SuperServer 1029GQ-TRT`="rex3",
`Apple Silicon Mac Pro`="kjohnson3",
`Intel Mac Pro`="lconway"
)
summarize_machine_times(machine_names)
## ----sessionInfo--------------------------------------------------------------
sessionInfo()