## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set(echo = TRUE) ## ----installation-github, eval = FALSE---------------------------------------- # # devel version # # # install.packages("devtools") # devtools::install_github("Muunraker/nipalsMCIA", ref = "devel", # force = TRUE, build_vignettes = TRUE) # devel version ## ----installation-bioconductor, eval = FALSE---------------------------------- # # release version # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("nipalsMCIA") ## ----load-packages, message = FALSE------------------------------------------- # note that the TENxPBMCData package is not included in this list as you may # decide to pull data from another source or use our provided objects library(BiocFileCache) library(dplyr) library(ggplot2) library(ggpubr) library(nipalsMCIA) library(piggyback) library(Seurat) # NIPALS starts with a random vector set.seed(42) ## ----set-paths---------------------------------------------------------------- # if you would like to save any of the data loaded/created locally path_data <- file.path("..", "data") ## ----cache-------------------------------------------------------------------- # as suggested in: https://bioconductor.org/packages/release/bioc/vignettes/BiocFileCache/inst/doc/BiocFileCache.html#cache-to-manage-package-data # since we only use the cache in this vignette, we don't need wrapper functions # initialize the cache in a temporary directory bfc <- BiocFileCache::BiocFileCache() # set up or load the unique resource ids rid <- BiocFileCache::bfcquery(bfc, query = "vignette_2", "rname")$rid # rid <- bfcrid(bfc) # we only have one rname, so you could also just use this # if the data is not in the cache, download and add it if (length(rid) == 0) { # specify `tag = ` to use a different release other than latest urls <- piggyback::pb_download_url(repo = "Muunraker/nipalsMCIA", tag = "latest") # you could add all of the urls to rid at once, but it would throw this error # when it comes to downloading them: # "Error in parse_url(url) : length(url) == 1 is not TRUE" for (url in urls) { # the rids are numbered in the order they are added e.g. BFC1, BFC2, ... message(paste("Downloading file", url)) rid <- names(BiocFileCache::bfcadd(bfc, rname = "vignette_2", url)) # check if the cached data needs updating and download it if needed if (!isFALSE(BiocFileCache::bfcneedsupdate(bfc, rid))) { # this will overwrite existing files without asking BiocFileCache::bfcdownload(bfc, rid, ask = FALSE) } } } # return the paths to the cached files # BiocFileCache::bfcrpath(bfc, rid = rid) ## ----cache-temp-dir----------------------------------------------------------- # you can also run tools::R_user_dir("BiocFileCache", which = "cache") BiocFileCache::getBFCOption("CACHE") ## ----pipelines, echo = FALSE-------------------------------------------------- # pipeline image knitr::include_graphics(path = BiocFileCache::bfcquery(bfc, "Pipeline")$rpath) ## ----data-all-list------------------------------------------------------------ # list all of the currently available files in the latest release piggyback::pb_list(repo = "Muunraker/nipalsMCIA", tag = "latest") ## ----data-all-download-------------------------------------------------------- # files needed for running MCIA path_metadata_sc <- BiocFileCache::bfcquery(bfc, query = "metadata_sc")$rpath # MCIA results # path_data_blocks <- bfcquery(bfc, query = "data_blocks")$rpath path_mcia_results <- BiocFileCache::bfcquery(bfc, query = "mcia_results_sc")$rpath # marker genes for cell type annotation with Seurat path_marker_genes <- BiocFileCache::bfcquery(bfc, query = "marker_genes")$rpath # the Seurat data's metric summary file from 10x Genomics path_metrics_summary <- BiocFileCache::bfcquery(bfc, query = "metrics_summary")$rpath # Seurat objects in different stages of processing path_obj_raw <- BiocFileCache::bfcquery(bfc, query = "CITEseq_raw")$rpath path_obj_annotated <- BiocFileCache::bfcquery(bfc, query = "CITEseq_annotated")$rpath ## ----data-bioconductor-load, eval = FALSE------------------------------------- # # read in the data as a SingleCellExperiment object # tenx_pbmc5k <- TENxPBMCData::TENxPBMCData(dataset = "pbmc5k-CITEseq") # # # examine the data # tenx_pbmc5k # ## class: SingleCellExperiment # ## dim: 33538 5247 # ## metadata(0): # ## assays(1): counts # ## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475 ENSG00000268674 # ## rowData names(4): ENSEMBL_ID Symbol_TENx Type Symbol # ## colnames: NULL # ## colData names(11): Sample Barcode ... Individual Date_published # ## reducedDimNames(0): # ## mainExpName: Gene Expression # ## altExpNames(1): Antibody Capture # # counts(tenx_pbmc5k) # ## <33538 x 5247> sparse matrix of class DelayedMatrix and type "integer": # ## [, 1] [, 2] [, 3] [, 4] ... [, 5244] [, 5245] [, 5246] [, 5247] # ## ENSG00000243485 0 0 0 0 . 0 0 0 0 # ## ENSG00000237613 0 0 0 0 . 0 0 0 0 # ## ENSG00000186092 0 0 0 0 . 0 0 0 0 # ## ENSG00000238009 0 0 0 0 . 0 0 0 0 # ## ENSG00000239945 0 0 0 0 . 0 0 0 0 # ## ... . . . . . . . . . # ## ENSG00000277856 0 0 0 0 . 0 0 0 0 # ## ENSG00000275063 0 0 0 0 . 0 0 0 0 # ## ENSG00000271254 0 0 0 0 . 0 0 0 0 # ## ENSG00000277475 0 0 0 0 . 0 0 0 0 # ## ENSG00000268674 0 0 0 0 . 0 0 0 0 # # counts(altExp(tenx_pbmc5k)) # ## <32 x 5247> sparse matrix of class DelayedMatrix and type "integer": # ## [, 1] [, 2] [, 3] [, 4] ... [, 5244] [, 5245] [, 5246] [, 5247] # ## CD3 25 959 942 802 . 402 401 6 1773 # ## CD4 164 720 1647 1666 . 1417 1 46 1903 # ## CD8a 16 8 21 5 . 8 222 3 9 # ## CD11b 3011 12 11 11 . 15 7 1027 9 # ## CD14 696 12 13 9 . 9 17 382 8 # ## ... . . . . . . . . . # ## HLA-DR 573 15 11 19 . 6 40 184 32 # ## TIGIT 10 3 3 3 . 2 15 1 12 # ## IgG1 4 4 2 4 . 1 0 2 4 # ## IgG2a 1 3 0 6 . 4 0 4 2 # ## IgG2b 6 2 4 8 . 0 0 2 5 # # # examine the metadata: # head(colData(tenx_pbmc5k), n = 3) # ## DataFrame with 6 rows and 11 columns # ## Sample Barcode Sequence Library Cell_ranger_version Tissue_status Barcode_type # ## # ## 1 pbmc5k-CITEseq AAACCCAAGAGACAAG-1 AAACCCAAGAGACAAG 1 v3.0.2 NA Chromium # ## 2 pbmc5k-CITEseq AAACCCAAGGCCTAGA-1 AAACCCAAGGCCTAGA 1 v3.0.2 NA Chromium # ## 3 pbmc5k-CITEseq AAACCCAGTCGTGCCA-1 AAACCCAGTCGTGCCA 1 v3.0.2 NA Chromium # ## Chemistry Sequence_platform Individual Date_published # ## # ## 1 Chromium_v3 NovaSeq HealthyDonor 2019-05-29 # ## 2 Chromium_v3 NovaSeq HealthyDonor 2019-05-29 # ## 3 Chromium_v3 NovaSeq HealthyDonor 2019-05-29 # # head(rowData(tenx_pbmc5k), n = 3) # ## DataFrame with 6 rows and 4 columns # ## ENSEMBL_ID Symbol_TENx Type Symbol # ## # ## ENSG00000243485 ENSG00000243485 MIR1302-2HG Gene Expression NA # ## ENSG00000237613 ENSG00000237613 FAM138A Gene Expression FAM138A # ## ENSG00000186092 ENSG00000186092 OR4F5 Gene Expression OR4F5 # # metadata(tenx_pbmc5k) # ## list() # # # change the gene names from Ensembl IDs to the 10x genes # rownames(tenx_pbmc5k) <- rowData(tenx_pbmc5k)$Symbol_TENx ## ----data-bioconductor-formatting, eval = FALSE------------------------------- # # set up the list # data_blocks_sc_sce <- list() # data_blocks_sc_sce$mrna <- data.frame(as.matrix(counts(tenx_pbmc5k))) # data_blocks_sc_sce$adt <- data.frame(as.matrix(counts(altExp(tenx_pbmc5k)))) # # summary(data_blocks_sc_sce) # ## Length Class Mode # ## mrna 5247 data.frame list # ## adt 5247 data.frame list # # # convert to a Seurat object (using `as.Seurat` won't work here) # obj_sce <- CreateSeuratObject(counts = data_blocks_sc_sce$mrna, # assay = "RNA" # project = "pbmc5k_CITEseq") # obj_sce[["ADT"]] <- CreateAssayObject(counts = data_blocks_sc_sce$adt) # # # name the cells with their barcodes # obj_sce <- RenameCells(object = obj_sce, # new.names = colData(tenx_pbmc5k)$Sequence) # # # add metadata from the SingleCellExperiment object # obj_sce <- AddMetaData(object = obj_sce, # metadata = as.data.frame(colData(tenx_pbmc5k), # row.names = Cells(obj_sce))) # # # this object will be slightly different than from the Seurat one down below # # e.g. 5297 rows vs. 4193 rows (since QC wasn't done) and different metadata # # head(obj_sce[[]], n = 3) # ## orig.ident nCount_RNA nFeature_RNA nCount_ADT nFeature_ADT Sample Barcode # ## AAACCCAAGAGACAAG SeuratProject 7375 2363 5178 31 pbmc5k-CITEseq AAACCCAAGAGACAAG-1 # ## AAACCCAAGGCCTAGA SeuratProject 3772 1259 2893 29 pbmc5k-CITEseq AAACCCAAGGCCTAGA-1 # ## AAACCCAGTCGTGCCA SeuratProject 4902 1578 3635 29 pbmc5k-CITEseq AAACCCAGTCGTGCCA-1 # ## Sequence Library Cell_ranger_version Tissue_status Barcode_type Chemistry Sequence_platform # ## AAACCCAAGAGACAAG AAACCCAAGAGACAAG 1 v3.0.2 Chromium Chromium_v3 NovaSeq # ## AAACCCAAGGCCTAGA AAACCCAAGGCCTAGA 1 v3.0.2 Chromium Chromium_v3 NovaSeq # ## AAACCCAGTCGTGCCA AAACCCAGTCGTGCCA 1 v3.0.2 Chromium Chromium_v3 NovaSeq # ## Individual Date_published # ## AAACCCAAGAGACAAG HealthyDonor 2019-05-29 # ## AAACCCAAGGCCTAGA HealthyDonor 2019-05-29 # ## AAACCCAGTCGTGCCA HealthyDonor 2019-05-29 # # # save the data locally if desired # save(data_blocks_sc_sce, obj_sce, # file = file.path(path_data, "data_sc_sce.Rda")) ## ----data-10x-object, eval = FALSE-------------------------------------------- # # load the data (change the file path as needed) # data <- Seurat::Read10X(data.dir = file.path(path_data, "tenx_pbmc5k_CITEseq", # "filtered_feature_bc_matrix"), # strip.suffix = TRUE) # remove the "-1"s from barcodes # ## 10X data contains more than one type and is being returned as a list # ## containing matrices of each type. # # # set minimum cells and/or features here if you'd like # obj <- Seurat::CreateSeuratObject(counts = data$`Gene Expression`, # project = "pbmc5k_CITEseq") # obj[["ADT"]] <- Seurat::CreateAssayObject(counts = data$`Antibody Capture`) # ## Warning: Feature names cannot have underscores ('_'), replacing with dashes ('-') # # # check the assays # Seurat::Assays(object = obj) # ## "RNA" "ADT" # # # list out the CITE-Seq surface protein markers # rownames(obj[["ADT"]]) # ## [1] "CD3-TotalSeqB" "CD4-TotalSeqB" "CD8a-TotalSeqB" # ## [4] "CD11b-TotalSeqB" "CD14-TotalSeqB" "CD15-TotalSeqB" # ## [7] "CD16-TotalSeqB" "CD19-TotalSeqB" "CD20-TotalSeqB" # ## [10] "CD25-TotalSeqB" "CD27-TotalSeqB" "CD28-TotalSeqB" # ## [13] "CD34-TotalSeqB" "CD45RA-TotalSeqB" "CD45RO-TotalSeqB" # ## [16] "CD56-TotalSeqB" "CD62L-TotalSeqB" "CD69-TotalSeqB" # ## [19] "CD80-TotalSeqB" "CD86-TotalSeqB" "CD127-TotalSeqB" # ## [22] "CD137-TotalSeqB" "CD197-TotalSeqB" "CD274-TotalSeqB" # ## [25] "CD278-TotalSeqB" "CD335-TotalSeqB" "PD-1-TotalSeqB" # ## [28] "HLA-DR-TotalSeqB" "TIGIT-TotalSeqB" "IgG1-control-TotalSeqB" # ## [31] "IgG2a-control-TotalSeqB" "IgG2b-control-TotalSeqB" # # # save the data locally if desired # saveRDS(obj, file.path(path_data, "tenx_pbmc5k_CITEseq_raw.rds")) ## ----mcia-metadata------------------------------------------------------------ # read in the annotated cells metadata_sc <- read.csv(file = path_metadata_sc, header = TRUE, row.names = 1) # examples metadata_sc %>% slice_sample(n = 5) ## ----mcia-decomp-load-data, eval = FALSE-------------------------------------- # # load the object set up for running MCIA [10x Genomics & Seurat] # # (if you save it in the following code block or download it from a release) # load(file = path_data_blocks) ## ----mcia-decomp-run, eval = FALSE-------------------------------------------- # # "largest_sv" results in a more balanced contribution # # from the blocks than the default "unit_var" # set.seed(42) # # # convert data_blocks_sc to an MAE object using the SingleCellExperiment class # data_blocks_sc_mae <- # MultiAssayExperiment::MultiAssayExperiment(lapply(data_blocks_sc, function(x) # SingleCellExperiment::SingleCellExperiment(t(as.matrix(x)))), # colData = metadata_sc) # # mcia_results_sc <- nipals_multiblock(data_blocks = data_blocks_sc_mae, # col_preproc_method = "colprofile", # block_preproc_method = "largest_sv", # num_PCs = 10, tol = 1e-9, # deflationMethod = "global", # plots = "none") # ## Performing column-level pre-processing... # ## Column pre-processing completed. # ## Performing block-level preprocessing... # ## Block pre-processing completed. # ## Computing order 1 scores # ## Computing order 2 scores # ## Computing order 3 scores # ## Computing order 4 scores # ## Computing order 5 scores # ## Computing order 6 scores # ## Computing order 7 scores # ## Computing order 8 scores # ## Computing order 9 scores # ## Computing order 10 scores # # # saveRDS(mcia_results_sc, file = file.path(path_data, "mcia_results_sc.Rds")) ## ----mcia-decomp-load--------------------------------------------------------- # load the results of the previous block (if already run and saved) mcia_results_sc <- readRDS(file = path_mcia_results) mcia_results_sc ## ----mcia-plots-colors-------------------------------------------------------- # for the projection plot # technically you could just do color_pal_params = list(option = "D"), but saving # the colors is useful for other plots like in the Seurat section meta_colors_sc <- get_metadata_colors(mcia_results = mcia_results_sc, color_col = "CellType", color_pal = scales::viridis_pal, color_pal_params = list(option = "D")) # for other plots colors_omics_sc <- get_colors(mcia_results = mcia_results_sc) ## ----mcia-plots-eigenvalue-scree, fig.dim = c(5, 4)--------------------------- global_scores_eigenvalues_plot(mcia_results = mcia_results_sc) ## ----mcia-plots-projection, fig.dim = c(5, 5)--------------------------------- projection_plot(mcia_results = mcia_results_sc, projection = "global", orders = c(1, 2), color_col = "CellType", color_pal = meta_colors_sc, legend_loc = "bottomright") ## ----mcia-plots-heatmap-global, fig.dim = c(7, 5)----------------------------- suppressMessages(global_scores_heatmap(mcia_results = mcia_results_sc, color_col = "CellType", color_pal = meta_colors_sc)) ## ----mcia-plots-heatmap-block, fig.dim = c(4, 2.5)---------------------------- block_weights_heatmap(mcia_results = mcia_results_sc) ## ----mcia-plots-loadings, fig.dim = c(6, 4.5)--------------------------------- vis_load_plot(mcia_results = mcia_results_sc, axes = c(1, 4), color_pal = colors_omics_sc) ## ----mcia-plots-top-features-factor-1, fig.dim = c(10, 4)--------------------- all_pos_1_vis <- vis_load_ord(mcia_results_sc, omic = "all", factor = 1, absolute = FALSE, descending = TRUE) mrna_pos_1_vis <- vis_load_ord(mcia_results_sc, omic = "mrna", factor = 1, absolute = FALSE, descending = TRUE) ggpubr::ggarrange(all_pos_1_vis, mrna_pos_1_vis, widths = c(1, 1)) ## ----mcia-plots-top-features-factor-1-orderings------------------------------- all_pos_1 <- ord_loadings(mcia_results_sc, omic = "all", factor = 1, absolute = FALSE, descending = TRUE) mrna_pos_1 <- ord_loadings(mcia_results_sc, omic = "mrna", factor = 1, absolute = FALSE, descending = TRUE) ## ----mcia-plots-top-features-factor-1-orderings-1----------------------------- # visualize the tabular data all_pos_1[1:3, ] ## ----mcia-plots-top-features-factor-1-orderings-2----------------------------- mrna_pos_1[1:3, ] ## ----mcia-plots-top-features-factor-4, fig.dim = c(10, 5)--------------------- vis_load_ord(mcia_results_sc, omic = "all", factor = 4, n_feat = 60, absolute = FALSE, descending = TRUE) ## ----seurat-load-data, message = FALSE---------------------------------------- # load the data obj <- readRDS(file = path_obj_raw) # add useful metadata obj[["percent.mt"]] <- PercentageFeatureSet(object = obj, pattern = "^MT-") ## ----seurat-qc-metrics-summary------------------------------------------------ # read in and display the summary table metrics_summary <- read.csv(file = path_metrics_summary) metrics_summary ## ----seurat-qc-pre-filter, fig.dim = c(8, 5)---------------------------------- VlnPlot(object = obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size = 0.01, ncol = 3) & theme(axis.text.x = element_text(angle = 0, hjust = 0.5), axis.title.x = element_blank()) ## ----seurat-qc-post-filter, fig.dim = c(8, 5)--------------------------------- # adjust cutoffs as desired obj <- subset(x = obj, subset = nFeature_RNA > 200 & percent.mt < 20) VlnPlot(object = obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size = 0.01, ncol = 3) & theme(axis.text.x = element_text(angle = 0, hjust = 0.5), axis.title.x = element_blank()) ## ----seurat-pipeline-1, eval = FALSE------------------------------------------ # # standard log normalization for RNA and centered log for ADT # obj <- NormalizeData(object = obj, normalization.method = "LogNormalize", # scale.factor = 10000, assay = "RNA") # ## Performing log-normalization # ## 0% 10 20 30 40 50 60 70 80 90 100% # ## [----|----|----|----|----|----|----|----|----|----| # ## **************************************************| # # obj <- NormalizeData(object = obj, normalization.method = "CLR", # margin = 2, assay = "ADT") # go across cells, not features # ## Normalizing across cells # ## |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s # # # highly variable features # obj <- FindVariableFeatures(object = obj, selection.method = "vst", # nfeatures = 2000) # ## Calculating gene variances # ## 0% 10 20 30 40 50 60 70 80 90 100% # ## [----|----|----|----|----|----|----|----|----|----| # ## **************************************************| # ## Calculating feature variances of standardized and clipped values # ## 0% 10 20 30 40 50 60 70 80 90 100% # ## [----|----|----|----|----|----|----|----|----|----| # ## **************************************************| # # # scaling so the average expression is 0 and the variance is 1 # obj <- ScaleData(object = obj, features = rownames(obj)) # ## Centering and scaling data matrix # ## |===============================================================================================| 100% # # # dimensionality reduction # obj <- RunPCA(object = obj) # ## PC_ 1 # ## Positive: LYZ, FCN1, CST3, MNDA, CTSS, PSAP, S100A9, FGL2, AIF1, GRN # ## NCF2, LST1, CD68, TYMP, SERPINA1, CYBB, CLEC12A, CSTA, SPI1, TNFAIP2 # ## CPVL, VCAN, MPEG1, TYROBP, KLF4, FTL, S100A8, IGSF6, CD14, MS4A6A # ## Negative: CD3D, TRAC, LTB, TRBC2, IL32, CD3G, IL7R, CD69, CD247, TRBC1 # ## CD2, CD7, CD27, ARL4C, ISG20, HIST1H4C, SYNE2, GZMM, ITM2A, CCR7 # ## RORA, MAL, CXCR4, LEF1, TRAT1, CTSW, GZMA, KLRB1, TRABD2A, CCL5 # ## PC_ 2 # ## Positive: CD79A, MS4A1, IGHM, BANK1, HLA-DQA1, CD79B, IGKC, LINC00926, RALGPS2, TNFRSF13C # ## VPREB3, IGHD, SPIB, CD22, FCRL1, HLA-DQB1, BLK, FAM129C, FCRLA, TCL1A # ## GNG7, TCF4, COBLL1, PAX5, SWAP70, CD40, BCL11A, P2RX5, TSPAN13, ADAM28 # ## Negative: NKG7, CST7, GZMA, PRF1, KLRD1, CTSW, FGFBP2, GNLY, GZMH, CCL5 # ## GZMM, CD247, KLRF1, HOPX, SPON2, ADGRG1, TRDC, MATK, GZMB, FCGR3A # ## S100A4, CCL4, CLIC3, KLRB1, IL2RB, TBX21, TTC38, ANXA1, PTGDR, PLEKHF1 # ## PC_ 3 # ## Positive: GZMB, NKG7, GNLY, CLIC3, PRF1, KLRD1, FGFBP2, KLRF1, SPON2, CST7 # ## GZMH, FCGR3A, ADGRG1, GZMA, HOPX, CTSW, TRDC, CCL4, HLA-DPB1, C12orf75 # ## PLAC8, TTC38, PLEK, APOBEC3G, TBX21, PRSS23, CYBA, MATK, SYNGR1, CXXC5 # ## Negative: IL7R, MAL, LEF1, TRABD2A, TRAC, CCR7, LTB, CD27, FOS, LRRN3 # ## FHIT, TRAT1, RGCC, CAMK4, CD3D, RGS10, CD40LG, FOSB, AQP3, SOCS3 # ## FLT3LG, CD3G, SLC2A3, TSHZ2, VIM, S100A12, S100A8, CD28, PLK3, VCAN # ## PC_ 4 # ## Positive: FCER1A, PLD4, SERPINF1, IL3RA, CLEC10A, GAS6, LILRA4, TPM2, CLEC4C, ENHO # ## FLT3, SMPD3, ITM2C, LGMN, CD1C, P2RY14, PPP1R14B, SCT, PROC, LAMP5 # ## RUNX2, AC119428.2, PACSIN1, DNASE1L3, PTCRA, RGS10, UGCG, CLIC2, PPM1J, P2RY6 # ## Negative: MS4A1, CD79A, LINC00926, BANK1, TNFRSF13C, VPREB3, CD79B, RALGPS2, IGHD, FCRL1 # ## BLK, IGHM, CD22, PAX5, ARHGAP24, CD24, P2RX5, NCF1, S100A12, CD19 # ## SWAP70, FCRLA, VNN2, TNFRSF13B, FCER2, IGKC, FCRL2, RBP7, CD40, S100A8 # ## PC_ 5 # ## Positive: BATF3, C1QA, TCF7L2, CTSL, CDKN1C, HLA-DQB1, HES4, SIGLEC10, CLEC10A, ABI3 # ## HLA-DQA1, RHOC, CSF1R, ENHO, CAMK1, MTSS1, IFITM3, CD1C, LY6E, FCGR3A # ## HLA-DPA1, CLIC2, HLA-DPB1, YBX1, RRAS, AC064805.1, NR4A1, GBP1, ZNF703, CXCL16 # ## Negative: LILRA4, TPM2, CLEC4C, SMPD3, IL3RA, SERPINF1, DERL3, MZB1, SCT, JCHAIN # ## PACSIN1, PROC, S100A12, PTCRA, LINC00996, PADI4, ASIP, KCNK17, ITM2C, EPHB1 # ## ALOX5AP, LAMP5, DNASE1L3, MAP1A, S100A8, APP, CYP1B1, VNN2, UGCG, ZFAT # # # clustering (adjust dimensions and resolutions as desired) # obj <- FindNeighbors(object = obj, reduction = "pca", dims = 1:20) # ## Computing nearest neighbor graph # ## Computing SNN # # obj <- FindClusters(object = obj, resolution = 0.4) # ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck # ## # ## Number of nodes: 4193 # ## Number of edges: 154360 # ## # ## Running Louvain algorithm... # ## 0% 10 20 30 40 50 60 70 80 90 100% # ## [----|----|----|----|----|----|----|----|----|----| # ## **************************************************| # ## Maximum modularity in 10 random starts: 0.9097 # ## Number of communities: 10 # ## Elapsed time: 0 seconds # # obj <- RunUMAP(object = obj, reduction = "pca", dims = 1:20) # ## 14:44:35 UMAP embedding parameters a = 0.9922 b = 1.112 # ## 14:44:35 Read 4193 rows and found 20 numeric columns # ## 14:44:35 Using Annoy for neighbor search, n_neighbors = 30 # ## 14:44:35 Building Annoy index with metric = cosine, n_trees = 50 # ## 0% 10 20 30 40 50 60 70 80 90 100% # ## [----|----|----|----|----|----|----|----|----|----| # ## **************************************************| # ## 14:44:36 Writing NN index file to temp file /tmp/RtmpHhoQAU/filec346711dd9f93 # ## 14:44:36 Searching Annoy index using 1 thread, search_k = 3000 # ## 14:44:37 Annoy recall = 100% # ## 14:44:38 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30 # ## 14:44:39 Initializing from normalized Laplacian + noise (using RSpectra) # ## 14:44:39 Commencing optimization for 500 epochs, with 174318 positive edges # ## Using method 'umap' # ## 0% 10 20 30 40 50 60 70 80 90 100% # ## [----|----|----|----|----|----|----|----|----|----| # ## **************************************************| # ## 14:44:44 Optimization finished ## ----seurat-load-processed-object--------------------------------------------- obj <- readRDS(file = path_obj_annotated) # reset to baseline Idents(object = obj) <- "seurat_clusters" obj$annotated_clusters <- c() ## ----seurat-elbow, fig.dim = c(6, 3)------------------------------------------ ElbowPlot(object = obj, ndims = 30) ## ----seurat-top-features------------------------------------------------------ head(VariableFeatures(object = obj), 20) ## ----seurat-top-features-plot, fig.dim = c(6, 6), warning = FALSE, eval = FALSE---- # # these can be plotted if desired # LabelPoints(plot = VariableFeaturePlot(object = obj), # points = head(VariableFeatures(object = obj), 20), # repel = TRUE, xnudge = 0, ynudge = 0) + # labs(title = "Top Twenty Variable Features") + # theme(legend.position = "bottom") ## ----seurat-umap-initial, fig.dim = c(6, 6), message = FALSE------------------ # you can also use the LabelClusters function to help label individual clusters plot_seurat_clusters <- UMAPPlot(object = obj, label = TRUE, label.size = 4, label.box = TRUE) + labs(title = "Initial Clusters") + scale_fill_manual( values = rep("white", n_distinct(obj$seurat_clusters))) + theme(plot.title = element_text(hjust = 0.5), axis.text = element_blank(), axis.ticks = element_blank()) plot_seurat_clusters ## ----seurat-markers-load------------------------------------------------------ # sort by Cell_Type and Marker if not already sorted markers_all <- read.csv(file = path_marker_genes) markers_all %>% slice_sample(n = 10) # example markers ## ----seurat-markers-dot-gex, fig.dim = c(16, 6), warning = FALSE-------------- select_cell_types <- c("B", "Macrophages", "mDC", "NK", "T") # do features = unique(markers_all$Marker) to use all possible features p <- DotPlot(object = obj, features = markers_all %>% dplyr::filter(Cell_Type %in% select_cell_types) %>% distinct(Marker) %>% pull(), cols = "RdBu", col.min = -1, dot.scale = 3, cluster.idents = TRUE) # add in the cell type information # if desired, you could rename the "Cell_Type"s in the original database to be # more informative e.g. Natural_Killers instead of NK p$data <- left_join(p$data, markers_all %>% dplyr::filter(Cell_Type %in% select_cell_types) %>% dplyr::rename(features.plot = "Marker"), by = "features.plot", multiple = "all") # depending on your version of dplyr, you can set `relationship = "many-to-many"` # to surpress the warning # plot p + facet_grid(cols = vars(Cell_Type), scales = "free_x", space = "free") + theme(strip.text.x = element_text(size = 10)) + RotatedAxis() ## ----seurat-markers-dot-adt, fig.dim = c(16, 6)------------------------------- # you have to change the default assay for the dot plot DefaultAssay(object = obj) <- "ADT" DotPlot(object = obj, features = rownames(GetAssayData(object = obj, assay = "ADT", slot = "counts")), cols = "RdBu", col.min = -1, dot.scale = 3, cluster.idents = TRUE) + RotatedAxis() # reset the default DefaultAssay(object = obj) <- "RNA" ## ----seurat-markers-feature, fig.dim = c(12, 8)------------------------------- FeaturePlot(object = obj, features = c("MS4A1", "CD14", # MS4A1 is another name for CD20 "NKG7", "IL3RA", "CD3D", "IL7R"), min.cutoff = 0, label = TRUE, label.size = 4, ncol = 3, raster = FALSE) ## ----seurat-markers-violin-adt, fig.dim = c(8, 8)----------------------------- VlnPlot(object = obj, features = c("CD3-TotalSeqB", "CD14-TotalSeqB", "CD20-TotalSeqB", "CD335-TotalSeqB"), pt.size = 0, assay = "ADT", ncol = 2) & theme(plot.title = element_text(size = 10), axis.text.x = element_text(angle = 0, hjust = 0.5), axis.title.x = element_blank()) ## ----seurat-annotations------------------------------------------------------- # mDC = myeloid dendritic cells # pDC = plasmacytoid dendritic cells obj_annotations <- rbind(c("0", "Macrophages/mDCs"), # difficult to separate c("1", "T Cells"), c("2", "T Cells"), c("3", "T Cells"), c("4", "Natural Killers"), c("5", "B Cells"), c("6", "Macrophages/mDCs"), # difficult to separate c("7", "Macrophages/mDCs"), # difficult to separate c("8", "T Cells"), # faint signal c("9", "pDCs")) colnames(obj_annotations) <- c("Cluster", "CellType") obj_annotations <- data.frame(obj_annotations) # save the annotations as a csv if you'd like # write.csv(obj_annotations, file = file.path(path_data, "obj_annotations.csv")) # prepare the annotation information annotations <- setNames(obj_annotations$CellType, obj_annotations$Cluster) # relabel the Seurat clusters # Idents(obj) <- "seurat_clusters" obj <- RenameIdents(object = obj, annotations) # alphabetize the cell types Idents(obj) <- factor(Idents(object = obj), levels = sort(levels(obj))) # useful metadata (e.g. if you want to have multiple annotation sets) obj[["annotated_clusters"]] <- Idents(object = obj) # save the processed and annotated Seurat object # saveRDS(obj, file = file.path(path_data, "tenx_pbmc5k_CITEseq_annotated.rds")) # info about the clusters obj_annotations %>% group_by(CellType) %>% transmute(Clusters = paste0(Cluster, collapse = ", ")) %>% distinct() %>% arrange(CellType) # metadata file for MCIA meta <- data.frame("CellType" = Idents(object = obj)) # save the metadata file for MCIA # write.csv(meta, file = file.path(path_data, "metadata_sc.csv")) ## ----seurat-umap-both, fig.dim = c(16, 8), message = FALSE-------------------- # change colors and themes as desired plot_annotated_clusters <- UMAPPlot(object = obj, label = TRUE, label.size = 4, label.box = TRUE, cols = meta_colors_sc) + labs(title = "Annotated Clusters") + scale_fill_manual( values = rep("white", length(meta_colors_sc))) + theme(plot.title = element_text(hjust = 0.5), axis.text = element_blank(), axis.ticks = element_blank()) + NoLegend() # a legend is not needed here ggpubr::ggarrange(plot_seurat_clusters + NoLegend(), plot_annotated_clusters, heights = c(1, 1), widths = c(1, 1)) ## ----seurat-umap-adt, fig.dim = c(6, 6)--------------------------------------- # plot ADT information on top as an example FeaturePlot(object = obj, features = "adt_CD19-TotalSeqB", label = TRUE, label.size = 4, ncol = 1, raster = FALSE) ## ----seurat-check-annotations-violin, fig.dim = c(8, 8)----------------------- VlnPlot(object = obj, features = c("CD3D", "CD4", "IL7R", "TRAC"), cols = meta_colors_sc, pt.size = 0.01, ncol = 2) & theme(plot.title = element_text(size = 10), axis.title.x = element_blank()) ## ----seurat-check-annotations-bar, fig.dim = c(8, 6)-------------------------- # examine the total counts ggplot(obj[[]], aes(x = annotated_clusters, fill = annotated_clusters)) + geom_bar(color = "black", linewidth = 0.2) + labs(title = "pbmc5k-CITEseq Cell Type Counts", x = "Cell Type", y = "Count") + scale_fill_manual(values = meta_colors_sc) + theme_bw() + theme(plot.title = element_text(size = 14, hjust = 0.5), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.x = element_blank(), legend.position = "none") ## ----seurat-save-mcia, eval = FALSE------------------------------------------- # # mrna # data_rna <- GetAssayData(object = obj, slot = "data", # assay = "RNA")[VariableFeatures(object = obj), ] # data_rna <- as.matrix(data_rna) # data_rna <- t(data_rna) # switch the rows # # colnames(data_rna) <- paste(colnames(data_rna), "mrna", sep = "_") # # # adt # data_adt <- GetAssayData(object = obj, slot = "data", assay = "ADT") # data_adt <- as.matrix(data_adt) # data_adt <- t(data_adt) # switch the rows # # colnames(data_adt) <- paste(colnames(data_adt), "adt", sep = "_") # # # combined # data_blocks_sc <- list(mrna = data_rna, adt = data_adt) # # # examine the contents # data.frame(data_blocks_sc$mrna[1:5, 1:5]) # data.frame(data_blocks_sc$adt[1:5, 1:5]) # # # save(data_blocks_sc, file = file.path(path_data, "data_blocks_sc.Rda")) ## ----session-info------------------------------------------------------------- sessionInfo()