--- title: "Bulk RNA-seq Deconvolution" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 fig_caption: true vignette: > %\VignetteIndexEntry{Bulk RNA-seq Deconvolution} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE ) ``` ## Introduction **Bulk RNA-seq deconvolution** is the computational process of estimating cell type proportions from bulk tissue transcriptomic data. This vignette demonstrates how to use darwin for the complete deconvolution workflow. ## The Deconvolution Problem ### Mathematical Formulation The bulk expression of a tissue sample can be modeled as a weighted sum of cell type expression profiles: $$\mathbf{b} = \mathbf{X} \cdot \mathbf{p} + \boldsymbol{\epsilon}$$ where: - $\mathbf{b} \in \mathbb{R}^n$: Bulk expression vector (n genes) - $\mathbf{X} \in \mathbb{R}^{n \times k}$: Reference signature matrix (n genes × k cell types) - $\mathbf{p} \in \mathbb{R}^k$: Cell type proportions to estimate - $\boldsymbol{\epsilon}$: Noise term ### Why Gene Selection Matters The choice of genes significantly impacts deconvolution accuracy: - **Too few genes**: Insufficient information for accurate estimation - **Too many genes**: Noise and uninformative genes degrade performance - **Correlated genes**: Multicollinearity leads to unstable estimates darwin addresses this by selecting genes that minimize correlation while maximizing cell type separability. ## Complete Workflow ### Step 1: Load Package and Data ```{r load} library(darwin) library(ggplot2) set.seed(42) ``` ### Step 2: Create Reference Matrix In practice, this would come from single-cell RNA-seq data. ```{r reference} # Simulate reference data: 8 cell types × 500 genes n_celltypes <- 8 n_genes <- 500 cell_types <- c("B_cells", "CD4_T", "CD8_T", "NK", "Monocytes", "Dendritic", "Macrophages", "Neutrophils") reference <- matrix( abs(rnorm(n_celltypes * n_genes, mean = 3, sd = 1.5)), nrow = n_celltypes, ncol = n_genes ) rownames(reference) <- cell_types colnames(reference) <- paste0("Gene", 1:n_genes) # Add cell-type specific marker expression marker_strength <- c(5, 4, 4, 4.5, 3.5, 4, 3.5, 3) for (i in 1:n_celltypes) { markers <- ((i - 1) * 20 + 1):(i * 20) reference[i, markers] <- reference[i, markers] + marker_strength[i] } cat("Reference matrix dimensions:", dim(reference), "\n") ``` ### Step 3: Create Simulated Bulk Data We create bulk samples with known cell type proportions for validation. ```{r bulk-creation} # True proportions for 10 samples n_samples <- 10 true_proportions <- matrix(0, nrow = n_samples, ncol = n_celltypes) colnames(true_proportions) <- cell_types rownames(true_proportions) <- paste0("Sample", 1:n_samples) # Generate random proportions that sum to 1 for (i in 1:n_samples) { props <- runif(n_celltypes) true_proportions[i, ] <- props / sum(props) } # Generate bulk expression: bulk = proportions %*% reference + noise bulk <- true_proportions %*% reference bulk <- bulk + matrix(rnorm(n_samples * n_genes, sd = 0.5), nrow = n_samples) bulk[bulk < 0] <- 0 cat("Bulk matrix dimensions:", dim(bulk), "\n") cat("\nTrue proportions (first 3 samples):\n") print(round(true_proportions[1:3, ], 3)) ``` ### Step 4: Optimize Gene Selection ```{r optimize} # Initialize darwin dw <- darwin(reference) # Run multi-objective optimization dw$optimize( ngen = 100, pop_size = 80, objectives = c("correlation", "distance"), weights = c(-1, 1), verbose = FALSE, parallel = FALSE ) # Visualize Pareto front dw$plot() ``` ### Step 5: Select Optimal Genes ```{r select} # Select solution balancing both objectives dw$select(weights = c(-1, 1)) # Check selection statistics genes <- dw$get_genes() selection <- dw$get_selection() cat("Selected", length(genes), "genes\n") cat("Correlation score:", round(compute_correlation(reference[, selection]), 3), "\n") cat("Distance score:", round(compute_distance(reference[, selection]), 3), "\n") ``` ### Step 6: Perform Deconvolution darwin supports three deconvolution methods: #### Non-Negative Least Squares (NNLS) ```{r deconv-nnls} result_nnls <- dw$deconvolve(bulk, method = "nnls") cat("Estimated proportions (NNLS):\n") print(round(result_nnls$proportions[1:3, ], 3)) ``` #### Linear Regression ```{r deconv-linear} result_linear <- dw$deconvolve(bulk, method = "linear") cat("Estimated proportions (Linear):\n") print(round(result_linear$proportions[1:3, ], 3)) ``` ### Step 7: Evaluate Accuracy ```{r evaluate, fig.cap="Comparison of true vs estimated proportions."} # Compare estimated to true proportions estimated <- result_nnls$proportions # Calculate correlation per cell type correlations <- sapply(1:n_celltypes, function(i) { cor(true_proportions[, i], estimated[, i]) }) names(correlations) <- cell_types # Calculate RMSE per cell type rmse <- sapply(1:n_celltypes, function(i) { sqrt(mean((true_proportions[, i] - estimated[, i])^2)) }) names(rmse) <- cell_types # Summary table accuracy <- data.frame( CellType = cell_types, Correlation = round(correlations, 3), RMSE = round(rmse, 4) ) knitr::kable(accuracy, caption = "Deconvolution accuracy by cell type") ``` ```{r scatter-plot, fig.cap="Scatter plot of true vs estimated proportions for each cell type."} # Create comparison data frame df_compare <- data.frame( True = as.vector(true_proportions), Estimated = as.vector(estimated), CellType = rep(cell_types, each = n_samples), Sample = rep(rownames(true_proportions), n_celltypes) ) # Overall correlation overall_cor <- cor(df_compare$True, df_compare$Estimated) ggplot(df_compare, aes(x = True, y = Estimated, color = CellType)) + geom_point(size = 2, alpha = 0.7) + geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray40") + scale_color_brewer(palette = "Set2") + labs( title = "True vs Estimated Cell Type Proportions", subtitle = paste("Overall Pearson r =", round(overall_cor, 3)), x = "True Proportion", y = "Estimated Proportion", color = "Cell Type" ) + theme_minimal(base_size = 12) + coord_fixed(xlim = c(0, 0.4), ylim = c(0, 0.4)) ``` ```{r facet-plot, fig.cap="Per-cell-type comparison of true vs estimated proportions."} ggplot(df_compare, aes(x = True, y = Estimated)) + geom_point(color = "#3498db", alpha = 0.7) + geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#e74c3c") + facet_wrap(~CellType, scales = "free") + labs( title = "Deconvolution Accuracy by Cell Type", x = "True Proportion", y = "Estimated Proportion" ) + theme_minimal(base_size = 10) ``` ## Comparing Gene Selection Strategies ```{r compare-strategies, fig.cap="Impact of gene selection on deconvolution accuracy."} # Strategy 1: All genes all_genes_cor <- sapply(1:n_celltypes, function(i) { # Deconvolve with all genes (simplified) X <- t(reference) y <- t(bulk) coef_mat <- solve(t(X) %*% X) %*% t(X) %*% y estimated_all <- t(coef_mat) estimated_all <- estimated_all / rowSums(abs(estimated_all)) cor(true_proportions[, i], estimated_all[, i]) }) # Strategy 2: darwin selected genes darwin_cor <- correlations # Strategy 3: Random genes (same number) n_selected <- sum(selection) set.seed(123) random_selection <- sample(1:n_genes, n_selected) X_random <- t(reference[, random_selection]) y_random <- t(bulk[, random_selection]) coef_random <- solve(t(X_random) %*% X_random) %*% t(X_random) %*% y_random estimated_random <- t(coef_random) estimated_random <- estimated_random / rowSums(abs(estimated_random)) random_cor <- sapply(1:n_celltypes, function(i) { cor(true_proportions[, i], estimated_random[, i]) }) # Compare comparison <- data.frame( CellType = rep(cell_types, 3), Correlation = c(all_genes_cor, darwin_cor, random_cor), Strategy = rep(c("All Genes", "darwin", "Random"), each = n_celltypes) ) ggplot(comparison, aes(x = CellType, y = Correlation, fill = Strategy)) + geom_bar(stat = "identity", position = "dodge") + scale_fill_manual(values = c("#95a5a6", "#3498db", "#e74c3c")) + labs( title = "Deconvolution Accuracy by Gene Selection Strategy", x = "Cell Type", y = "Correlation (True vs Estimated)" ) + theme_minimal(base_size = 12) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + coord_cartesian(ylim = c(0, 1)) ``` ## Best Practices ### 1. Reference Matrix Quality - Use high-quality single-cell reference data - Ensure all relevant cell types are represented - Consider batch effects between reference and bulk data ### 2. Gene Selection - Run optimization with sufficient generations (≥100) - Compare multiple solutions from the Pareto front - Validate with known mixtures when possible ### 3. Method Selection | Method | Advantages | Disadvantages | |--------|------------|---------------| | NNLS | Non-negative estimates | May underestimate | | NuSVR | Robust to outliers | Requires tuning | | Linear | Fast, analytical | May give negative values | ### 4. Validation - Use in silico mixtures for benchmarking - Cross-validate with flow cytometry when available - Check that proportions sum to approximately 1 ## Session Info ```{r session} sessionInfo() ```