This vignette endeavors to put Bioconductor and scvi-tools together to help understand how different data structures and methods relevant to CITE-seq analysis contribute to interpretation of CITE-seq exeperiments.
The scvi-tools tutorial (for version 0.20.0) analyzes a pair of 10x PBMC CITE-seq experiments (5k and 10k cells). Chapter 12 of the OSCA book analyzes only the 10k dataset.
The following subsections are essentially “code-only”. We exhibit steps necessary to assemble a SingleCellExperiment instance with the subset of the totalVI quantifications produced for the cells from the “10k” dataset.
library(SingleCellExperiment)
library(scater)
library(scviR)
getCh12Sce(clear_cache=FALSE)
ch12sce = ch12sce
## class: SingleCellExperiment
## dim: 33538 7472
## metadata(2): Samples se.averaged
## assays(2): counts logcounts
## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
## ENSG00000268674
## rowData names(3): ID Symbol Type
## colnames(7472): AAACCCAAGATTGTGA-1 AAACCCACATCGGTTA-1 ...
## TTTGTTGTCGAGTGAG-1 TTTGTTGTCGTTCAGA-1
## colData names(3): Sample Barcode sizeFactor
## reducedDimNames(0):
## mainExpName: Gene Expression
## altExpNames(1): Antibody Capture
getTotalVI5k10kAdata()
fullvi = fullvi
## AnnData object with n_obs × n_vars = 10849 × 4000
## obs: 'n_genes', 'percent_mito', 'n_counts', 'batch', '_scvi_labels', '_scvi_batch', 'leiden_totalVI'
## var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
## uns: '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'leiden', 'log1p', 'neighbors', 'umap'
## obsm: 'X_totalVI', 'X_umap', 'denoised_protein', 'protein_expression', 'protein_foreground_prob'
## layers: 'counts', 'denoised_rna'
## obsp: 'connectivities', 'distances'
rownames(fullvi$obs)
totvi_cellids = fullvi$obs$batch totvi_batch =
fullvi$obsm$get("X_totalVI")
totvi_latent = fullvi$obsm$get("X_umap")
totvi_umap = fullvi$layers$get("denoised_rna")
totvi_denoised_rna = fullvi$obsm$get("denoised_protein")
totvi_denoised_protein = fullvi$obs$leiden_totalVI totvi_leiden =
which(totvi_batch == "PBMC5k")
is5k = totvi_cellids[-is5k]
totvi_cellids = totvi_latent[-is5k,]
totvi_latent = totvi_umap[-is5k,]
totvi_umap = totvi_denoised_rna[-is5k,]
totvi_denoised_rna = totvi_denoised_protein[-is5k,]
totvi_denoised_protein = totvi_leiden[-is5k] totvi_leiden =
rownames(totvi_latent) = totvi_cellids
rownames(totvi_umap) = totvi_cellids
rownames(totvi_denoised_rna) = totvi_cellids
rownames(totvi_denoised_protein) = totvi_cellids
names(totvi_leiden) = totvi_cellids
In this section we reduce the cell collections to cells common to the chapter 12 and totalVI datasets.
intersect(totvi_cellids, ch12sce$Barcode) comm =
# select and order
totvi_latent[comm,]
totvi_latent = totvi_umap[comm,]
totvi_umap = totvi_denoised_rna[comm,]
totvi_denoised_rna = totvi_denoised_protein[comm,]
totvi_denoised_protein = totvi_leiden[comm]
totvi_leiden =
# organize the totalVI into SCE with altExp
SingleCellExperiment(SimpleList(logcounts=t(totvi_denoised_rna))) # FALSE name
totsce =rowData(totsce) = S4Vectors::DataFrame(fullvi$var)
rownames(totsce) = rownames(fullvi$var)
rowData(totsce)$Symbol = rownames(totsce)
SingleCellExperiment(SimpleList(logcounts=t(totvi_denoised_protein))) # FALSE name
nn =reducedDims(nn) = list(UMAP=totvi_umap)
altExp(totsce) = nn
altExpNames(totsce) = "denoised_protein"
$leiden = totvi_leiden
totscealtExp(totsce)$leiden = totvi_leiden
altExp(totsce)$ch12.clusters = altExp(ch12sce[,comm])$label
# add average ADT abundance to metadata, for adt_profiles
sumCountsAcrossCells(altExp(totsce), altExp(totsce)$leiden,
tot.se.averaged <-exprs_values="logcounts", average=TRUE)
rownames(tot.se.averaged) = gsub("_TotalSeqB", "", rownames(tot.se.averaged))
metadata(totsce)$se.averaged = tot.se.averaged
colnames(ch12sce) = ch12sce$Barcode
ch12sce[, comm] ch12sce_matched =
The TSNE projection of the normalized ADT quantifications and the walktrap cluster assignments are produced for the cells common to the two approaches.
plotTSNE(altExp(ch12sce_matched), color_by="label", text_by="label")
## Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
This heatmap uses precomputed cluster averages that are lodged in the metadata element of the SingleCellExperiment. Colors represent the log2-fold change from the grand average across all clusters.
adtProfiles(ch12sce_matched)
We enhance the annotation of the list of subclusters retrieved
using getCh12AllSce
and then drill into mRNA-based
subclusters of ADT-based cluster 3 to compare expression levels
of three genes.
getCh12AllSce()
ch12_allsce = lapply(ch12_allsce, function(x) {
ch12_allsce =colnames(x)= x$Barcode;
colnames(x);
cn = x[,intersect(cn,comm)]; x})
x = "3"
of.interest <- c("GZMH", "IL7R", "KLRB1")
markers <-plotExpression(ch12_allsce[[of.interest]], x="subcluster",
features=markers, swap_rownames="Symbol", ncol=3)
There is a suggestion of a boolean basis for subcluster identity, depending on low or high expression of the selected genes.
Following the exploration in OSCA chapter 12, cluster 3 is analyzed for a regression association between expression measures of three genes and the ADT-based abundance of CD127.
plotExpression(ch12_allsce[["3"]], x="CD127", show_smooth=TRUE, show_se=FALSE,
features=c("IL7R", "TPT1", "KLRB1", "GZMH"), swap_rownames="Symbol")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plotUMAP(altExp(totsce), color_by="leiden", text_by="leiden")
The approach to profiling the ADT abundances used in the totalVI tutorial employs scaling to (0,1).
S4Vectors::metadata(totsce)$se.averaged
tav = assay(tav)
ata = function(x) (x-min(x))/max(x)
uscale = t(apply(ata,1,uscale))
scmat =::pheatmap(scmat, cluster_rows=FALSE) pheatmap
A quick view of the concordance of the two clustering outcomes is
altExp(totsce)
atot = altExp(ch12sce_matched)
ach12 = table(ch12=ach12$label, VI=atot$leiden)
tt =::pheatmap(log(tt+1)) pheatmap
With this we can pick out some clusters with many cells in common:
tt[c("9", "12", "5", "3"), c("0", "1", "2", "8", "6", "5")]
lit =rownames(lit) = sQuote(rownames(lit))
colnames(lit) = sQuote(colnames(lit))
lit
## VI
## ch12 '0' '1' '2' '8' '6' '5'
## '9' 1334 0 0 0 0 0
## '12' 0 993 0 0 15 0
## '5' 0 102 671 8 44 2
## '3' 0 0 5 322 297 67
Let’s examine the distributions of marker mRNAs in the Leiden totalVI clusters corresponding to OSCA cluster “3”:
totsce[,which(altExp(totsce)$leiden %in% c("5", "6", "8"))]
tsub = c("GZMH", "IL7R", "KLRB1")
markers <-altExp(tsub)$leiden = factor(altExp(tsub)$leiden) # squelch unused levels
$leiden = factor(tsub$leiden) # squelch unused levels
tsubplotExpression(tsub, x="leiden",
features=markers, swap_rownames="Symbol", ncol=3)
Note that the y axis label is incorrect – we are plotting the denoised expression values from totalVI.
The display seems roughly consistent with the “boolean basis” observed above with the mRNA-based subclustering.
The same approach is taken as above. We don’t have TPT1 in the 4000 genes retained in the totalVI exercise.
rownames(altExp(tsub))
rn = gsub("_TotalSeqB", "", rn)
rn =rownames(altExp(tsub)) = rn
rowData(altExp(tsub)) = DataFrame(Symbol=rn)
plotExpression(tsub, x="CD127", show_smooth=TRUE, show_se=FALSE,
features=c("IL7R", "KLRB1", "GZMH"), swap_rownames="Symbol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
We have shown how rudimentary programming and data organization can be used to make outputs of OSCA and totalVI methods amenable to comparison in the Bioconductor framework.
The scviR package includes a shiny app in the function
explore_subcl
that should be expanded to facilitate
exploration of totalVI subclusters. Much more work
remains to be done in the area of exploring
additional approaches to integrative interpretation of ADT and mRNA abundance patterns, such as intersection and concatenation methods in the feature selection materials in OSCA ch. 12
effects of tuning and architecture details for the totalVI VAE
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.18-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] Matrix_1.6-1.1 scater_1.30.0
## [3] scuttle_1.12.0 reshape2_1.4.4
## [5] ggplot2_3.4.4 scviR_1.2.0
## [7] SingleCellExperiment_1.24.0 SummarizedExperiment_1.32.0
## [9] Biobase_2.62.0 GenomicRanges_1.54.0
## [11] GenomeInfoDb_1.38.0 IRanges_2.36.0
## [13] S4Vectors_0.40.0 BiocGenerics_0.48.0
## [15] MatrixGenerics_1.14.0 matrixStats_1.0.0
## [17] shiny_1.7.5.1 basilisk_1.14.0
## [19] BiocStyle_2.30.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.1.3 bitops_1.0-7
## [3] gridExtra_2.3 rlang_1.1.1
## [5] magrittr_2.0.3 compiler_4.3.1
## [7] RSQLite_2.3.1 mgcv_1.9-0
## [9] dir.expiry_1.10.0 DelayedMatrixStats_1.24.0
## [11] png_0.1-8 vctrs_0.6.4
## [13] stringr_1.5.0 pkgconfig_2.0.3
## [15] crayon_1.5.2 fastmap_1.1.1
## [17] magick_2.8.1 dbplyr_2.3.4
## [19] XVector_0.42.0 ellipsis_0.3.2
## [21] labeling_0.4.3 utf8_1.2.4
## [23] promises_1.2.1 rmarkdown_2.25
## [25] ggbeeswarm_0.7.2 purrr_1.0.2
## [27] bit_4.0.5 xfun_0.40
## [29] zlibbioc_1.48.0 cachem_1.0.8
## [31] beachmat_2.18.0 jsonlite_1.8.7
## [33] blob_1.2.4 later_1.3.1
## [35] DelayedArray_0.28.0 BiocParallel_1.36.0
## [37] irlba_2.3.5.1 parallel_4.3.1
## [39] R6_2.5.1 stringi_1.7.12
## [41] RColorBrewer_1.1-3 bslib_0.5.1
## [43] limma_3.58.0 reticulate_1.34.0
## [45] jquerylib_0.1.4 Rcpp_1.0.11
## [47] bookdown_0.36 knitr_1.44
## [49] splines_4.3.1 httpuv_1.6.12
## [51] tidyselect_1.2.0 abind_1.4-5
## [53] yaml_2.3.7 viridis_0.6.4
## [55] codetools_0.2-19 curl_5.1.0
## [57] plyr_1.8.9 lattice_0.22-5
## [59] tibble_3.2.1 withr_2.5.1
## [61] basilisk.utils_1.14.0 evaluate_0.22
## [63] BiocFileCache_2.10.0 pillar_1.9.0
## [65] BiocManager_1.30.22 filelock_1.0.2
## [67] generics_0.1.3 RCurl_1.98-1.12
## [69] sparseMatrixStats_1.14.0 munsell_0.5.0
## [71] scales_1.2.1 xtable_1.8-4
## [73] glue_1.6.2 pheatmap_1.0.12
## [75] tools_4.3.1 BiocNeighbors_1.20.0
## [77] ScaledMatrix_1.10.0 cowplot_1.1.1
## [79] grid_4.3.1 colorspace_2.1-0
## [81] nlme_3.1-163 GenomeInfoDbData_1.2.11
## [83] beeswarm_0.4.0 BiocSingular_1.18.0
## [85] vipor_0.4.5 cli_3.6.1
## [87] rsvd_1.0.5 fansi_1.0.5
## [89] S4Arrays_1.2.0 viridisLite_0.4.2
## [91] dplyr_1.1.3 gtable_0.3.4
## [93] sass_0.4.7 digest_0.6.33
## [95] SparseArray_1.2.0 ggrepel_0.9.4
## [97] farver_2.1.1 memoise_2.0.1
## [99] htmltools_0.5.6.1 lifecycle_1.0.3
## [101] httr_1.4.7 statmod_1.5.0
## [103] mime_0.12 bit64_4.0.5