--- title: "Case Study: Cancer Survival Analysis" author: - name: "Zaoqu Liu" email: "liuzaoqu@163.com" affiliation: "Department of Interventional Radiology, The First Affiliated Hospital of Zhengzhou University" orcid: "0000-0002-0452-742X" - name: "Aimin Xie" email: "aiminyy1993@gmail.com" affiliation: "Original Author" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 number_sections: true vignette: > %\VignetteIndexEntry{Case Study: Cancer Survival Analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, warning = FALSE, message = FALSE, dpi = 150 ) ``` # Introduction This case study demonstrates how to use scPAS to identify **survival-associated cell subpopulations** in cancer single-cell data by integrating bulk RNA-seq data with patient survival information. ## Biological Question > Which cell subpopulations in the tumor microenvironment are associated with patient prognosis? ## Analysis Overview 1. Prepare single-cell RNA-seq data from tumor samples 2. Prepare bulk RNA-seq data with survival outcomes 3. Train scPAS model with Cox regression 4. Identify cells associated with good/poor prognosis 5. Characterize survival-associated subpopulations # Setup ```{r load-packages} library(scPAS) library(Seurat) library(Matrix) library(ggplot2) library(survival) library(survminer) library(RColorBrewer) library(patchwork) theme_set(theme_minimal()) ``` # Simulated Cancer Data For this demonstration, we create realistic simulated data mimicking a tumor microenvironment study. ## Single-Cell Data: Tumor Microenvironment ```{r simulate-sc-data} set.seed(123) n_genes <- 800 n_cells <- 1000 # Create sparse count matrix counts <- matrix(0, nrow = n_genes, ncol = n_cells) for (i in 1:n_genes) { # Variable expression rates lambda <- sample(c(1, 3, 5, 10), 1, prob = c(0.3, 0.4, 0.2, 0.1)) counts[i, ] <- rpois(n_cells, lambda) } rownames(counts) <- paste0("Gene", 1:n_genes) colnames(counts) <- paste0("Cell", 1:n_cells) # Create Seurat object tumor_sc <- CreateSeuratObject(counts = counts, project = "TumorME") # Define cell types (typical tumor microenvironment) cell_types <- c( rep("Malignant", 300), # Tumor cells rep("CD8_T_exhausted", 100), # Exhausted T cells rep("CD8_T_effector", 100), # Effector T cells rep("Treg", 80), # Regulatory T cells rep("TAM_M1", 80), # M1 macrophages (anti-tumor) rep("TAM_M2", 120), # M2 macrophages (pro-tumor) rep("CAF", 100), # Cancer-associated fibroblasts rep("Endothelial", 70), # Endothelial cells rep("DC", 50) # Dendritic cells ) tumor_sc$celltype <- cell_types # Standard preprocessing tumor_sc <- NormalizeData(tumor_sc, verbose = FALSE) tumor_sc <- FindVariableFeatures(tumor_sc, nfeatures = 500, verbose = FALSE) tumor_sc <- ScaleData(tumor_sc, verbose = FALSE) tumor_sc <- RunPCA(tumor_sc, npcs = 30, verbose = FALSE) tumor_sc <- RunUMAP(tumor_sc, dims = 1:20, verbose = FALSE) # Visualize DimPlot(tumor_sc, group.by = "celltype", label = TRUE, repel = TRUE) + ggtitle("Tumor Microenvironment Cell Types") ``` ## Bulk Data: TCGA-like Cohort with Survival ```{r simulate-bulk-data} set.seed(456) n_bulk_samples <- 80 # Create bulk expression matrix bulk_data <- matrix( rnorm(n_genes * n_bulk_samples, mean = 8, sd = 2), nrow = n_genes, ncol = n_bulk_samples ) rownames(bulk_data) <- paste0("Gene", 1:n_genes) colnames(bulk_data) <- paste0("Patient", 1:n_bulk_samples) # Simulate survival data # Higher expression of certain genes → worse survival prognostic_genes <- sample(1:n_genes, 50) risk_score_bulk <- colMeans(bulk_data[prognostic_genes, ]) # Generate survival times (exponential with risk-dependent rate) base_survival <- 1000 # days survival_time <- rexp(n_bulk_samples, rate = 0.001 * exp(scale(risk_score_bulk))) survival_time <- pmin(survival_time, 2000) # Cap at 2000 days # Generate censoring censor_time <- runif(n_bulk_samples, 500, 2500) observed_time <- pmin(survival_time, censor_time) event_status <- as.integer(survival_time <= censor_time) # Create survival object survival_phenotype <- Surv(time = observed_time, event = event_status) cat("Bulk samples:", n_bulk_samples, "\n") cat("Events:", sum(event_status), "\n") cat("Censored:", sum(1 - event_status), "\n") cat("Median follow-up:", median(observed_time), "days\n") ``` ## Visualize Survival Data ```{r kaplan-meier-bulk, fig.width=8, fig.height=5} # Split by median risk risk_group <- ifelse(risk_score_bulk > median(risk_score_bulk), "High", "Low") km_data <- data.frame( time = observed_time, status = event_status, risk_group = risk_group ) fit <- survfit(Surv(time, status) ~ risk_group, data = km_data) ggsurvplot( fit, data = km_data, pval = TRUE, risk.table = TRUE, palette = c("royalblue", "indianred"), title = "Bulk Sample Survival by Risk Score", xlab = "Time (days)", ylab = "Survival Probability" ) ``` # Run scPAS with Cox Regression ```{r run-scpas-cox} # Run scPAS analysis result <- scPAS( bulk_dataset = bulk_data, sc_dataset = tumor_sc, phenotype = survival_phenotype, family = "cox", nfeature = 300, permutation_times = 200, # Use 1000+ in practice do_imputation = FALSE, n_cores = 1, FDR.threshold = 0.05 ) # Summary cat("Total cells:", ncol(result), "\n") cat("scPAS+ (poor prognosis):", sum(result$scPAS == "scPAS+", na.rm = TRUE), "\n") cat("scPAS- (good prognosis):", sum(result$scPAS == "scPAS-", na.rm = TRUE), "\n") cat("Non-significant:", sum(result$scPAS == "0", na.rm = TRUE), "\n") ``` # Visualize Results ## UMAP Overview ```{r umap-results, fig.width=14, fig.height=5} # Cell type colors ct_colors <- c( "Malignant" = "#E41A1C", "CD8_T_exhausted" = "#377EB8", "CD8_T_effector" = "#4DAF4A", "Treg" = "#984EA3", "TAM_M1" = "#FF7F00", "TAM_M2" = "#FFFF33", "CAF" = "#A65628", "Endothelial" = "#F781BF", "DC" = "#999999" ) class_colors <- c("scPAS-" = "royalblue", "0" = "gray80", "scPAS+" = "indianred") p1 <- DimPlot(result, group.by = "celltype", cols = ct_colors, label = TRUE) + ggtitle("Cell Types") + NoLegend() p2 <- FeaturePlot(result, features = "scPAS_NRS") + scale_color_gradient2(low = "royalblue", mid = "white", high = "indianred", midpoint = 0) + ggtitle("Risk Score") p3 <- DimPlot(result, group.by = "scPAS", cols = class_colors, order = c("0", "scPAS-", "scPAS+")) + ggtitle("Prognosis Association") p1 | p2 | p3 ``` ## Cell Type Enrichment ```{r enrichment-analysis, fig.width=10, fig.height=6} # Calculate enrichment enrichment_table <- table(result$celltype, result$scPAS) enrichment_df <- as.data.frame(enrichment_table) colnames(enrichment_df) <- c("CellType", "scPAS", "Count") # Calculate proportions enrichment_df <- enrichment_df %>% dplyr::group_by(CellType) %>% dplyr::mutate( Total = sum(Count), Proportion = Count / Total * 100 ) %>% dplyr::ungroup() # Focus on scPAS+ (poor prognosis) scpas_positive <- enrichment_df %>% dplyr::filter(scPAS == "scPAS+") %>% dplyr::arrange(desc(Proportion)) ggplot(scpas_positive, aes(x = reorder(CellType, Proportion), y = Proportion, fill = CellType)) + geom_bar(stat = "identity", width = 0.7) + scale_fill_manual(values = ct_colors) + coord_flip() + labs( x = "", y = "% scPAS+ cells", title = "Cell Types Enriched for Poor Prognosis (scPAS+)" ) + theme( legend.position = "none", plot.title = element_text(hjust = 0.5, face = "bold") ) ``` ## Violin Plot by Cell Type ```{r violin-survival, fig.width=12, fig.height=6} # Order by median risk score cell_order <- result@meta.data %>% dplyr::group_by(celltype) %>% dplyr::summarise(median_risk = median(scPAS_NRS, na.rm = TRUE)) %>% dplyr::arrange(desc(median_risk)) %>% dplyr::pull(celltype) result$celltype <- factor(result$celltype, levels = cell_order) VlnPlot(result, features = "scPAS_NRS", group.by = "celltype", cols = ct_colors[cell_order], pt.size = 0) + geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") + stat_summary(fun = median, geom = "point", size = 2, color = "black") + labs( x = "Cell Type (ordered by median risk)", y = "Normalized Risk Score", title = "Survival-Associated Risk Score by Cell Type" ) + theme( axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none" ) ``` # Biological Interpretation ## Key Findings Based on the analysis (with real data, findings would differ): ```{r interpretation-plot, fig.width=10, fig.height=6} # Create interpretation summary interpretation_data <- data.frame( CellType = c("TAM_M2", "Treg", "CAF", "CD8_T_exhausted", "TAM_M1", "CD8_T_effector", "DC"), Association = c("Poor", "Poor", "Poor", "Poor", "Good", "Good", "Good"), Mechanism = c( "Immunosuppressive", "Immunosuppressive", "TME remodeling", "Dysfunctional", "Anti-tumor", "Cytotoxic", "Antigen presentation" ) ) interpretation_data$Association <- factor(interpretation_data$Association, levels = c("Good", "Poor")) ggplot(interpretation_data, aes(x = reorder(CellType, as.numeric(Association)), y = 1, fill = Association)) + geom_tile(color = "white", size = 1) + geom_text(aes(label = Mechanism), size = 3.5, color = "white", fontface = "bold") + scale_fill_manual(values = c("Good" = "royalblue", "Poor" = "indianred")) + coord_flip() + labs( x = "", y = "", title = "Biological Interpretation of Survival Associations", fill = "Prognosis" ) + theme( axis.text.x = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank(), plot.title = element_text(hjust = 0.5, face = "bold") ) ``` ## Publication Figure ```{r publication-figure, fig.width=14, fig.height=10} # Create comprehensive figure layout <- " AABB CCDD EEEE " p_a <- DimPlot(result, group.by = "celltype", cols = ct_colors, label = TRUE, label.size = 3) + ggtitle("A. Tumor Microenvironment") + NoLegend() p_b <- FeaturePlot(result, features = "scPAS_NRS", pt.size = 0.5) + scale_color_gradient2(low = "royalblue", mid = "white", high = "indianred", midpoint = 0, name = "Risk\nScore") + ggtitle("B. Survival Risk Score") p_c <- DimPlot(result, group.by = "scPAS", cols = class_colors, pt.size = 0.5, order = c("0", "scPAS-", "scPAS+")) + ggtitle("C. Prognosis Classification") p_d <- ggplot(scpas_positive, aes(x = reorder(CellType, Proportion), y = Proportion, fill = CellType)) + geom_bar(stat = "identity") + scale_fill_manual(values = ct_colors) + coord_flip() + labs(x = "", y = "% Poor Prognosis (scPAS+)") + ggtitle("D. Cell Type Enrichment") + theme(legend.position = "none") p_e <- VlnPlot(result, features = "scPAS_NRS", group.by = "celltype", cols = ct_colors[cell_order], pt.size = 0) + geom_hline(yintercept = 0, linetype = "dashed") + ggtitle("E. Risk Score Distribution by Cell Type") + NoLegend() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) p_a + p_b + p_c + p_d + p_e + plot_layout(design = layout) ``` # Model Application to New Data The trained model can be applied to independent datasets: ```{r model-application, eval=FALSE} # Get trained model parameters (compatible with Seurat 4 and 5) model_params <- Seurat::Misc(result, slot = "scPAS_para") head(sort(model_params$Coefs, decreasing = TRUE)) # Apply to new bulk data new_bulk_predictions <- scPAS.prediction( model = result, test.data = new_bulk_expression, do_imputation = FALSE ) # Apply to spatial transcriptomics spatial_predictions <- scPAS.prediction( model = result, test.data = spatial_seurat, assay = "Spatial", do_imputation = TRUE ) ``` # Key Takeaways 1. **scPAS+ cells** (associated with poor prognosis): - M2 tumor-associated macrophages (immunosuppressive) - Regulatory T cells (Tregs) - Cancer-associated fibroblasts - Exhausted CD8+ T cells 2. **scPAS- cells** (associated with good prognosis): - M1 macrophages (anti-tumor) - Effector CD8+ T cells - Dendritic cells 3. **Clinical implications**: - Target immunosuppressive populations for therapy - Enhance anti-tumor immune responses - Biomarker development for prognosis prediction # Session Information ```{r session-info} sessionInfo() ```