It may be of interest to use the embedding that is calculated on a training sample set to predict scores on a test set (or, equivalently, on new data).
After loading the nipalsMCIA
library, we randomly split the NCI60 cancer cell
line data into training and test sets.
# install.packages("devtools")
devtools::install_github("Muunraker/nipalsMCIA", ref = "code-development",
force = TRUE, build_vignettes = TRUE) # devel version
# after acceptance
# install.packages("BiocManager")
BiocManager::install("nipalsMCIA")
library(ggplot2)
library(MultiAssayExperiment)
library(nipalsMCIA)
data(NCI60)
set.seed(8)
num_samples <- dim(data_blocks[[1]])[1]
num_train <- round(num_samples * 0.7, 0)
train_samples <- sample.int(num_samples, num_train)
data_blocks_train <- data_blocks
data_blocks_test <- data_blocks
for (i in seq_along(data_blocks)) {
data_blocks_train[[i]] <- data_blocks_train[[i]][train_samples, ]
data_blocks_test[[i]] <- data_blocks_test[[i]][-train_samples, ]
}
# Split corresponding metadata
metadata_train <- data.frame(metadata_NCI60[train_samples, ],
row.names = rownames(data_blocks_train$mrna))
colnames(metadata_train) <- c("cancerType")
metadata_test <- data.frame(metadata_NCI60[-train_samples, ],
row.names = rownames(data_blocks_test$mrna))
colnames(metadata_test) <- c("cancerType")
# Create train and test mae objects
data_blocks_train_mae <- simple_mae(data_blocks_train, row_format = "sample",
colData = metadata_train)
data_blocks_test_mae <- simple_mae(data_blocks_test, row_format = "sample",
colData = metadata_test)
MCIA_train <- nipals_multiblock(data_blocks = data_blocks_train_mae,
col_preproc_method = "colprofile", num_PCs = 10,
plots = "none", tol = 1e-9)
The get_metadata_colors()
function returns an assignment of a color for the
metadata columns. The nmb_get_gs()
function returns the global scores from the
input NipalsResult
object.
meta_colors <- get_metadata_colors(mcia_results = MCIA_train, color_col = 1,
color_pal_params = list(option = "E"))
global_scores <- nmb_get_gs(MCIA_train)
MCIA_out <- data.frame(global_scores[, 1:2])
MCIA_out$cancerType <- nmb_get_metadata(MCIA_train)$cancerType
colnames(MCIA_out) <- c("Factor.1", "Factor.2", "cancerType")
# plot the results
ggplot(data = MCIA_out, aes(x = Factor.1, y = Factor.2, color = cancerType)) +
geom_point(size = 3) +
labs(title = "MCIA for NCI60 training data") +
scale_color_manual(values = meta_colors) +
theme_bw()
We use the function to generate new factor scores on the test
data set using the MCIA_train model. The new dataset in the form of an MAE object
is input using the parameter test_data
.
MCIA_test_scores <- predict_gs(mcia_results = MCIA_train,
test_data = data_blocks_test_mae)
We once again plot the top two factor scores for both the training and test datasets
MCIA_out_test <- data.frame(MCIA_test_scores[, 1:2])
MCIA_out_test$cancerType <- MultiAssayExperiment::colData(data_blocks_test_mae)$cancerType
colnames(MCIA_out_test) <- c("Factor.1", "Factor.2", "cancerType")
MCIA_out_test$set <- "test"
MCIA_out$set <- "train"
MCIA_out_full <- rbind(MCIA_out, MCIA_out_test)
rownames(MCIA_out_full) <- NULL
# plot the results
ggplot(data = MCIA_out_full,
aes(x = Factor.1, y = Factor.2, color = cancerType, shape = set)) +
geom_point(size = 3) +
labs(title = "MCIA for NCI60 training and test data") +
scale_color_manual(values = meta_colors) +
theme_bw()
Session Info
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.18-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MultiAssayExperiment_1.28.0 SummarizedExperiment_1.32.0
## [3] Biobase_2.62.0 GenomicRanges_1.54.0
## [5] GenomeInfoDb_1.38.0 BiocGenerics_0.48.0
## [7] MatrixGenerics_1.14.0 matrixStats_1.0.0
## [9] SeuratObject_4.1.4 Seurat_4.4.0
## [11] piggyback_0.1.5 IRanges_2.36.0
## [13] S4Vectors_0.40.0 stringr_1.5.0
## [15] nipalsMCIA_1.0.0 ggpubr_0.6.0
## [17] ggplot2_3.4.4 fgsea_1.28.0
## [19] dplyr_1.1.3 ComplexHeatmap_2.18.0
## [21] BiocStyle_2.30.0
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.21 splines_4.3.1 later_1.3.1
## [4] bitops_1.0-7 tibble_3.2.1 polyclip_1.10-6
## [7] httr2_0.2.3 lifecycle_1.0.3 rstatix_0.7.2
## [10] doParallel_1.0.17 globals_0.16.2 lattice_0.22-5
## [13] MASS_7.3-60 backports_1.4.1 magrittr_2.0.3
## [16] plotly_4.10.3 sass_0.4.7 rmarkdown_2.25
## [19] jquerylib_0.1.4 yaml_2.3.7 httpuv_1.6.12
## [22] sctransform_0.4.1 sp_2.1-1 spatstat.sparse_3.0-3
## [25] reticulate_1.34.0 cowplot_1.1.1 pbapply_1.7-2
## [28] RColorBrewer_1.1-3 lubridate_1.9.3 abind_1.4-5
## [31] zlibbioc_1.48.0 Rtsne_0.16 purrr_1.0.2
## [34] RCurl_1.98-1.12 pracma_2.4.2 rappdirs_0.3.3
## [37] circlize_0.4.15 GenomeInfoDbData_1.2.11 ggrepel_0.9.4
## [40] irlba_2.3.5.1 gitcreds_0.1.2 listenv_0.9.0
## [43] spatstat.utils_3.0-4 goftest_1.2-3 RSpectra_0.16-1
## [46] spatstat.random_3.2-1 fitdistrplus_1.1-11 parallelly_1.36.0
## [49] leiden_0.4.3 codetools_0.2-19 DelayedArray_0.28.0
## [52] tidyselect_1.2.0 shape_1.4.6 farver_2.1.1
## [55] spatstat.explore_3.2-5 jsonlite_1.8.7 GetoptLong_1.0.5
## [58] ellipsis_0.3.2 progressr_0.14.0 ggridges_0.5.4
## [61] survival_3.5-7 iterators_1.0.14 foreach_1.5.2
## [64] tools_4.3.1 ica_1.0-3 Rcpp_1.0.11
## [67] glue_1.6.2 gridExtra_2.3 SparseArray_1.2.0
## [70] BiocBaseUtils_1.4.0 xfun_0.40 withr_2.5.1
## [73] BiocManager_1.30.22 fastmap_1.1.1 fansi_1.0.5
## [76] digest_0.6.33 timechange_0.2.0 R6_2.5.1
## [79] mime_0.12 colorspace_2.1-0 scattermore_1.2
## [82] Cairo_1.6-1 tensor_1.5 spatstat.data_3.0-3
## [85] utf8_1.2.4 tidyr_1.3.0 generics_0.1.3
## [88] data.table_1.14.8 httr_1.4.7 htmlwidgets_1.6.2
## [91] S4Arrays_1.2.0 uwot_0.1.16 pkgconfig_2.0.3
## [94] gtable_0.3.4 lmtest_0.9-40 XVector_0.42.0
## [97] htmltools_0.5.6.1 carData_3.0-5 bookdown_0.36
## [100] clue_0.3-65 scales_1.2.1 png_0.1-8
## [103] knitr_1.44 reshape2_1.4.4 rjson_0.2.21
## [106] curl_5.1.0 nlme_3.1-163 cachem_1.0.8
## [109] zoo_1.8-12 GlobalOptions_0.1.2 KernSmooth_2.23-22
## [112] vipor_0.4.5 parallel_4.3.1 miniUI_0.1.1.1
## [115] ggrastr_1.0.2 pillar_1.9.0 vctrs_0.6.4
## [118] RANN_2.6.1 promises_1.2.1 car_3.1-2
## [121] xtable_1.8-4 cluster_2.1.4 beeswarm_0.4.0
## [124] evaluate_0.22 magick_2.8.1 cli_3.6.1
## [127] compiler_4.3.1 rlang_1.1.1 crayon_1.5.2
## [130] future.apply_1.11.0 ggsignif_0.6.4 labeling_0.4.3
## [133] ggbeeswarm_0.7.2 fs_1.6.3 plyr_1.8.9
## [136] stringi_1.7.12 deldir_1.0-9 viridisLite_0.4.2
## [139] BiocParallel_1.36.0 munsell_0.5.0 gh_1.4.0
## [142] lazyeval_0.2.2 spatstat.geom_3.2-7 Matrix_1.6-1.1
## [145] patchwork_1.1.3 future_1.33.0 shiny_1.7.5.1
## [148] ROCR_1.0-11 igraph_1.5.1 broom_1.0.5
## [151] memoise_2.0.1 bslib_0.5.1 fastmatch_1.1-4