## ----eval = FALSE------------------------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # BiocManager::install("xCell2") ## ----eval = FALSE------------------------------------------------------------- # if (!requireNamespace("devtools", quietly = TRUE)) { # install.packages("devtools") # } # devtools::install_github("AlmogAngel/xCell2") ## ----eval = TRUE-------------------------------------------------------------- library(xCell2) ## ----eval = TRUE-------------------------------------------------------------- library(xCell2) # Load the demo data data(dice_demo_ref, package = "xCell2") # Extract reference gene expression matrix # (Note that values are in TPM but log transformed) dice_ref <- SummarizedExperiment::assay(dice_demo_ref, "logcounts") colnames(dice_ref) <- make.unique(colnames(dice_ref)) # Ensure unique sample names # Extract reference metadata dice_labels <- SummarizedExperiment::colData(dice_demo_ref) dice_labels <- as.data.frame(dice_labels) # "label" column already exists # Prepare labels data frame (the "label" column already exist) dice_labels$ont <- NA # Here we assign the cell ontology (next section) dice_labels$sample <- colnames(dice_ref) dice_labels$dataset <- "DICE" ## ----eval = TRUE-------------------------------------------------------------- # Assign ontologies to cell types dice_labels[dice_labels$label == "B cells", ]$ont <- "CL:0000236" dice_labels[dice_labels$label == "Monocytes", ]$ont <- "CL:0000576" dice_labels[dice_labels$label == "NK cells", ]$ont <- "CL:0000623" dice_labels[dice_labels$label == "T cells, CD8+", ]$ont <- "CL:0000625" dice_labels[dice_labels$label == "T cells, CD4+", ]$ont <- "CL:0000624" dice_labels[dice_labels$label == "T cells, CD4+, memory", ]$ont <- "CL:0000897" ## ----eval=FALSE, include=FALSE------------------------------------------------ # xCell2GetLineage(labels = dice_labels, outFile = "demo_dice_dep.tsv") ## ----eval = TRUE-------------------------------------------------------------- lineage_list <- xCell2GetLineage(labels = dice_labels) ## ----eval = TRUE-------------------------------------------------------------- library(BiocParallel) parallel_param <- MulticoreParam(workers = 2) # Adjust workers as needed set.seed(123) # (optional) For reproducibility DICE_demo.xCell2Ref <- xCell2Train( ref = dice_ref, labels = dice_labels, refType = "rnaseq", BPPARAM = parallel_param ) ## ----eval = FALSE------------------------------------------------------------- # saveRDS(DICE_demo.xCell2Ref, file = "DICE_demo.xCell2Ref.rds") ## ----eval = FALSE------------------------------------------------------------- # # Load the BlueprintEncode reference # data(BlueprintEncode.xCell2Ref, package = "xCell2") ## ----eval = FALSE------------------------------------------------------------- # # Set the URL of the pre-trained reference # ref_url <- "https://dviraran.github.io/xCell2refs/references/BlueprintEncode.xCell2Ref.rds" # # Set the local filename to save the reference # local_filename <- "BlueprintEncode.xCell2Ref.rds" # # Download the file # download.file(ref_url, local_filename, mode = "wb") # # Load the downloaded reference # BlueprintEncode.xCell2Ref <- readRDS(local_filename) ## ----eval = TRUE-------------------------------------------------------------- library(xCell2) data("DICE_demo.xCell2Ref", package = "xCell2") genes_ref <- getGenesUsed(DICE_demo.xCell2Ref) genes_ref[1:10] ## ----eval = TRUE-------------------------------------------------------------- # Load demo bulk gene expression mixture data("mix_demo", package = "xCell2") genes_mix <- rownames(mix_demo) # Calculate the percentage of overlapping genes overlap_percentage <- round(length(intersect(genes_mix, genes_ref)) / length(genes_ref) * 100, 2) print(paste0("Overlap between mixture and reference genes is: ", overlap_percentage, "%")) ## ----eval = TRUE-------------------------------------------------------------- xcell2_results <- xCell2Analysis( mix = mix_demo, xcell2object = DICE_demo.xCell2Ref, minSharedGenes = 0.8 # Allow for a lower overlap threshold ) ## ----eval = TRUE-------------------------------------------------------------- xcell2_results <- xCell2Analysis( mix = mix_demo, xcell2object = DICE_demo.xCell2Ref ) xcell2_results ## ----eval=TRUE---------------------------------------------------------------- library(ggplot2, quietly = TRUE) # Generate pseudo enrichment scores for demonstration set.seed(123) conditions <- c(rep("Healthy", 5), rep("Disease", 5)) enrichment_scores <- data.frame( Condition = conditions, T_cells_CD8 = c(runif(5, 0.4, 0.6), runif(5, 0.6, 0.8)), T_cells_CD4 = c(runif(5, 0.3, 0.5), runif(5, 0.5, 0.7)) ) # Wilcoxon rank-sum test for CD8+ T cells wilcox_test <- wilcox.test(T_cells_CD8 ~ Condition, data = enrichment_scores) print(wilcox_test) # Boxplot with p-value ggplot(enrichment_scores, aes(x = Condition, y = T_cells_CD8)) + geom_boxplot() + geom_jitter(width = 0.2) + labs( title = "CD8+ T Cells Enrichment Across Conditions", x = "Condition", y = "Enrichment Score" ) + annotate("text", x = 1.5, y = max(enrichment_scores$T_cells_CD8), label = paste("p =", signif(wilcox_test$p.value, 3))) ## ----eval=TRUE---------------------------------------------------------------- library(EnhancedVolcano, quietly = TRUE) library(dplyr, quietly = TRUE, verbose = FALSE) # Simulate enrichment scores for 50 cell types set.seed(123) cell_types <- paste0("CellType_", 1:50) enrichment_scores_many <- data.frame( Sample = paste0("Sample", 1:10), Condition = factor(rep(c("Healthy", "Disease"), each = 5)) ) for (cell_type in cell_types) { # Healthy group: random around baseline healthy <- rnorm(5, mean = 0.5, sd = 0.1) # Disease group: add variability and directional shifts disease <- rnorm(5, mean = sample(c(0.4, 0.5, 0.6), 1), sd = 0.1) enrichment_scores_many[[cell_type]] <- c(healthy, disease) } special_cells <- sample(cell_types, 2) for (cell_type in special_cells) { enrichment_scores_many[1:5, cell_type] <- rnorm(5, mean = 0.4, sd = 0.05) # Lower in healthy } # Calculate log fold change (LFC) and p-values for each cell type using Wilcoxon test differential_results <- enrichment_scores_many %>% tidyr::pivot_longer(cols = starts_with("CellType"), names_to = "CellType", values_to = "Enrichment") %>% group_by(CellType) %>% summarise( LFC = log2(median(Enrichment[Condition == "Disease"]) / median(Enrichment[Condition == "Healthy"])), p_value = wilcox.test(Enrichment ~ Condition)$p.value ) %>% ungroup() # Prepare data for EnhancedVolcano volcano_data <- differential_results %>% mutate( neg_log10_p = -log10(p_value), Significance = ifelse(p_value < 0.05, "Significant", "Not Significant") ) %>% rename( log2FC = LFC, pval = p_value ) # Plot using EnhancedVolcano EnhancedVolcano( volcano_data, lab = volcano_data$CellType, x = "log2FC", y = "pval", pCutoff = 0.05, FCcutoff = 0.5, title = "Volcano Plot of Cell Type Enrichment", subtitle = "Simulated Data for 50 Cell Types", xlab = "Log2 Fold Change (LFC)", ylab = "-Log10(p-value)", pointSize = 2.0, labSize = 3.0, xlim = c(-1, 1), ylim = c(0, 3) ) ## ----eval=TRUE---------------------------------------------------------------- # Simulated clinical feature (e.g., mutation burden) set.seed(123) mutation_burden <- runif(10, min = 0, max = 100) # Calculate Pearson and Spearman correlation pearson_corr <- cor.test(enrichment_scores$T_cells_CD8, mutation_burden, method = "pearson") spearman_corr <- cor.test(enrichment_scores$T_cells_CD8, mutation_burden, method = "spearman") # Scatterplot of the correlation with annotations for Pearson and Spearman coefficients ggplot(enrichment_scores, aes(x = mutation_burden, y = T_cells_CD8)) + geom_point(size = 3) + geom_smooth(method = "lm", se = FALSE, color = "blue") + labs( title = "Correlation Between Mutation Burden and CD8+ T Cells Enrichment", x = "Mutation Burden", y = "CD8+ T Cells Enrichment Score" ) + annotate("text", x = 15, y = 0.7, label = paste0("Pearson: ", signif(pearson_corr$estimate, 3), "\nP-value: ", signif(pearson_corr$p.value, 3)), hjust = 0, color = "darkred") + annotate("text", x = 15, y = 0.65, label = paste0("Spearman: ", signif(spearman_corr$estimate, 3), "\nP-value: ", signif(spearman_corr$p.value, 3)), hjust = 0, color = "darkgreen") ## ----eval=TRUE---------------------------------------------------------------- # Simulated enrichment scores for demonstration set.seed(123) enrichment_scores <- data.frame( Sample = paste0("Sample", 1:10), Condition = rep(c("Healthy", "Disease"), each = 5), CD8_T_cells = runif(10, min = 0.1, max = 0.8), B_cells = runif(10, min = 0.2, max = 0.6), NK_cells = runif(10, min = 0.05, max = 0.5) ) # Perform PCA on the simulated enrichment scores pca <- prcomp(enrichment_scores[, 3:5], scale. = TRUE) # Create a data frame for PCA results pca_df <- data.frame( PC1 = pca$x[, 1], PC2 = pca$x[, 2], Condition = enrichment_scores$Condition ) # Visualize the first two principal components ggplot(pca_df, aes(x = PC1, y = PC2, color = Condition)) + geom_point(size = 3) + labs( title = "PCA of Enrichment Scores", x = "PC1 (Principal Component 1)", y = "PC2 (Principal Component 2)" ) + theme_minimal() ## ----eval = TRUE-------------------------------------------------------------- kmeans_result <- kmeans(enrichment_scores[, 3:5], centers = 2, nstart = 25) # Add cluster assignment to the PCA results pca_df <- data.frame( PC1 = pca$x[, 1], PC2 = pca$x[, 2], Condition = enrichment_scores$Condition, Cluster = as.factor(kmeans_result$cluster) ) # Visualize the PCA results with k-means clusters ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster, shape = Condition)) + geom_point(size = 3) + labs( title = "PCA of Enrichment Scores with K-Means Clusters", x = "PC1 (Principal Component 1)", y = "PC2 (Principal Component 2)" ) + scale_color_manual(values = c("1" = "blue", "2" = "red")) + theme_minimal() ## ----eval=TRUE, dpi=32-------------------------------------------------------- library(randomForest, quietly = TRUE) # Simulated enrichment scores for demonstration set.seed(123) enrichment_scores <- data.frame( Sample = paste0("Sample", 1:10), Condition = factor(rep(c("Healthy", "Disease"), each = 5)), CD8_T_cells = runif(10, min = 0.1, max = 0.8), B_cells = runif(10, min = 0.2, max = 0.6), NK_cells = runif(10, min = 0.05, max = 0.5) ) # Train a random forest model rf_model <- randomForest(Condition ~ CD8_T_cells + B_cells + NK_cells, data = enrichment_scores, ntree = 500, importance = TRUE) # Assess variable importance var_importance <- data.frame( Variable = rownames(importance(rf_model)), Importance = importance(rf_model)[, 1] ) # Plot variable importance ggplot(var_importance, aes(x = reorder(Variable, Importance), y = Importance)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + labs( title = "Feature Importance from Random Forest Model", x = "Features", y = "Importance Score" ) + theme_minimal() ## ----eval = FALSE------------------------------------------------------------- # library(BiocParallel) # # # Example: Setting up parallel processing with 2 workers # BPPARAM <- MulticoreParam(workers = 2) # Adjust the number of workers as needed ## ----eval = FALSE------------------------------------------------------------- # BPPARAM <- SnowParam(workers = 2) ## ----eval = FALSE------------------------------------------------------------- # # Generate the xCell2 reference object with parallel processing # DICE_demo.xCell2Ref <- xCell2Train( # ref = dice_ref, # labels = dice_labels, # refType = "rnaseq", # BPPARAM = BPPARAM # ) ## ----eval = FALSE------------------------------------------------------------- # # Perform cell type enrichment analysis with parallel processing # xcell2_results <- xCell2Analysis( # mix = mix_demo, # xcell2object = DICE_demo.xCell2Ref, # BPPARAM = BPPARAM # ) ## ----------------------------------------------------------------------------- sessionInfo()