---
title: "Case Study: Cancer Survival Analysis"
author: 
  - name: "Zaoqu Liu"
    email: "liuzaoqu@163.com"
    affiliation: "Department of Interventional Radiology, The First Affiliated Hospital of Zhengzhou University"
    orcid: "0000-0002-0452-742X"
  - name: "Aimin Xie"
    email: "aiminyy1993@gmail.com"
    affiliation: "Original Author"
date: "`r Sys.Date()`"
output: 
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
    number_sections: true
vignette: >
  %\VignetteIndexEntry{Case Study: Cancer Survival Analysis}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  warning = FALSE,
  message = FALSE,
  dpi = 150
)
```

# Introduction

This case study demonstrates how to use scPAS to identify **survival-associated cell subpopulations** in cancer single-cell data by integrating bulk RNA-seq data with patient survival information.

## Biological Question

> Which cell subpopulations in the tumor microenvironment are associated with patient prognosis?

## Analysis Overview

1. Prepare single-cell RNA-seq data from tumor samples
2. Prepare bulk RNA-seq data with survival outcomes
3. Train scPAS model with Cox regression
4. Identify cells associated with good/poor prognosis
5. Characterize survival-associated subpopulations

# Setup

```{r load-packages}
library(scPAS)
library(Seurat)
library(Matrix)
library(ggplot2)
library(survival)
library(survminer)
library(RColorBrewer)
library(patchwork)

theme_set(theme_minimal())
```

# Simulated Cancer Data

For this demonstration, we create realistic simulated data mimicking a tumor microenvironment study.

## Single-Cell Data: Tumor Microenvironment

```{r simulate-sc-data}
set.seed(123)

n_genes <- 800
n_cells <- 1000

# Create sparse count matrix
counts <- matrix(0, nrow = n_genes, ncol = n_cells)
for (i in 1:n_genes) {
  # Variable expression rates
  lambda <- sample(c(1, 3, 5, 10), 1, prob = c(0.3, 0.4, 0.2, 0.1))
  counts[i, ] <- rpois(n_cells, lambda)
}
rownames(counts) <- paste0("Gene", 1:n_genes)
colnames(counts) <- paste0("Cell", 1:n_cells)

# Create Seurat object
tumor_sc <- CreateSeuratObject(counts = counts, project = "TumorME")

# Define cell types (typical tumor microenvironment)
cell_types <- c(
  rep("Malignant", 300),       # Tumor cells
  rep("CD8_T_exhausted", 100), # Exhausted T cells
  rep("CD8_T_effector", 100),  # Effector T cells
  rep("Treg", 80),             # Regulatory T cells
  rep("TAM_M1", 80),           # M1 macrophages (anti-tumor)
  rep("TAM_M2", 120),          # M2 macrophages (pro-tumor)
  rep("CAF", 100),             # Cancer-associated fibroblasts
  rep("Endothelial", 70),      # Endothelial cells
  rep("DC", 50)                # Dendritic cells
)
tumor_sc$celltype <- cell_types

# Standard preprocessing
tumor_sc <- NormalizeData(tumor_sc, verbose = FALSE)
tumor_sc <- FindVariableFeatures(tumor_sc, nfeatures = 500, verbose = FALSE)
tumor_sc <- ScaleData(tumor_sc, verbose = FALSE)
tumor_sc <- RunPCA(tumor_sc, npcs = 30, verbose = FALSE)
tumor_sc <- RunUMAP(tumor_sc, dims = 1:20, verbose = FALSE)

# Visualize
DimPlot(tumor_sc, group.by = "celltype", label = TRUE, repel = TRUE) +
  ggtitle("Tumor Microenvironment Cell Types")
```

## Bulk Data: TCGA-like Cohort with Survival

```{r simulate-bulk-data}
set.seed(456)

n_bulk_samples <- 80

# Create bulk expression matrix
bulk_data <- matrix(
  rnorm(n_genes * n_bulk_samples, mean = 8, sd = 2),
  nrow = n_genes,
  ncol = n_bulk_samples
)
rownames(bulk_data) <- paste0("Gene", 1:n_genes)
colnames(bulk_data) <- paste0("Patient", 1:n_bulk_samples)

# Simulate survival data
# Higher expression of certain genes → worse survival
prognostic_genes <- sample(1:n_genes, 50)
risk_score_bulk <- colMeans(bulk_data[prognostic_genes, ])

# Generate survival times (exponential with risk-dependent rate)
base_survival <- 1000  # days
survival_time <- rexp(n_bulk_samples, rate = 0.001 * exp(scale(risk_score_bulk)))
survival_time <- pmin(survival_time, 2000)  # Cap at 2000 days

# Generate censoring
censor_time <- runif(n_bulk_samples, 500, 2500)
observed_time <- pmin(survival_time, censor_time)
event_status <- as.integer(survival_time <= censor_time)

# Create survival object
survival_phenotype <- Surv(time = observed_time, event = event_status)

cat("Bulk samples:", n_bulk_samples, "\n")
cat("Events:", sum(event_status), "\n")
cat("Censored:", sum(1 - event_status), "\n")
cat("Median follow-up:", median(observed_time), "days\n")
```

## Visualize Survival Data

```{r kaplan-meier-bulk, fig.width=8, fig.height=5}
# Split by median risk
risk_group <- ifelse(risk_score_bulk > median(risk_score_bulk), "High", "Low")

km_data <- data.frame(
  time = observed_time,
  status = event_status,
  risk_group = risk_group
)

fit <- survfit(Surv(time, status) ~ risk_group, data = km_data)

ggsurvplot(
  fit,
  data = km_data,
  pval = TRUE,
  risk.table = TRUE,
  palette = c("royalblue", "indianred"),
  title = "Bulk Sample Survival by Risk Score",
  xlab = "Time (days)",
  ylab = "Survival Probability"
)
```

# Run scPAS with Cox Regression

```{r run-scpas-cox}
# Run scPAS analysis
result <- scPAS(
  bulk_dataset = bulk_data,
  sc_dataset = tumor_sc,
  phenotype = survival_phenotype,
  family = "cox",
  nfeature = 300,
  permutation_times = 200,  # Use 1000+ in practice
  do_imputation = FALSE,
  n_cores = 1,
  FDR.threshold = 0.05
)

# Summary
cat("Total cells:", ncol(result), "\n")
cat("scPAS+ (poor prognosis):", sum(result$scPAS == "scPAS+", na.rm = TRUE), "\n")
cat("scPAS- (good prognosis):", sum(result$scPAS == "scPAS-", na.rm = TRUE), "\n")
cat("Non-significant:", sum(result$scPAS == "0", na.rm = TRUE), "\n")
```

# Visualize Results

## UMAP Overview

```{r umap-results, fig.width=14, fig.height=5}
# Cell type colors
ct_colors <- c(
  "Malignant" = "#E41A1C",
  "CD8_T_exhausted" = "#377EB8",
  "CD8_T_effector" = "#4DAF4A",
  "Treg" = "#984EA3",
  "TAM_M1" = "#FF7F00",
  "TAM_M2" = "#FFFF33",
  "CAF" = "#A65628",
  "Endothelial" = "#F781BF",
  "DC" = "#999999"
)

class_colors <- c("scPAS-" = "royalblue", "0" = "gray80", "scPAS+" = "indianred")

p1 <- DimPlot(result, group.by = "celltype", cols = ct_colors, label = TRUE) +
  ggtitle("Cell Types") + NoLegend()

p2 <- FeaturePlot(result, features = "scPAS_NRS") +
  scale_color_gradient2(low = "royalblue", mid = "white", high = "indianred", midpoint = 0) +
  ggtitle("Risk Score")

p3 <- DimPlot(result, group.by = "scPAS", cols = class_colors,
              order = c("0", "scPAS-", "scPAS+")) +
  ggtitle("Prognosis Association")

p1 | p2 | p3
```

## Cell Type Enrichment

```{r enrichment-analysis, fig.width=10, fig.height=6}
# Calculate enrichment
enrichment_table <- table(result$celltype, result$scPAS)
enrichment_df <- as.data.frame(enrichment_table)
colnames(enrichment_df) <- c("CellType", "scPAS", "Count")

# Calculate proportions
enrichment_df <- enrichment_df %>%
  dplyr::group_by(CellType) %>%
  dplyr::mutate(
    Total = sum(Count),
    Proportion = Count / Total * 100
  ) %>%
  dplyr::ungroup()

# Focus on scPAS+ (poor prognosis)
scpas_positive <- enrichment_df %>%
  dplyr::filter(scPAS == "scPAS+") %>%
  dplyr::arrange(desc(Proportion))

ggplot(scpas_positive, aes(x = reorder(CellType, Proportion), y = Proportion, fill = CellType)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_fill_manual(values = ct_colors) +
  coord_flip() +
  labs(
    x = "",
    y = "% scPAS+ cells",
    title = "Cell Types Enriched for Poor Prognosis (scPAS+)"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
```

## Violin Plot by Cell Type

```{r violin-survival, fig.width=12, fig.height=6}
# Order by median risk score
cell_order <- result@meta.data %>%
  dplyr::group_by(celltype) %>%
  dplyr::summarise(median_risk = median(scPAS_NRS, na.rm = TRUE)) %>%
  dplyr::arrange(desc(median_risk)) %>%
  dplyr::pull(celltype)

result$celltype <- factor(result$celltype, levels = cell_order)

VlnPlot(result, features = "scPAS_NRS", group.by = "celltype",
        cols = ct_colors[cell_order], pt.size = 0) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
  stat_summary(fun = median, geom = "point", size = 2, color = "black") +
  labs(
    x = "Cell Type (ordered by median risk)",
    y = "Normalized Risk Score",
    title = "Survival-Associated Risk Score by Cell Type"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
```

# Biological Interpretation

## Key Findings

Based on the analysis (with real data, findings would differ):

```{r interpretation-plot, fig.width=10, fig.height=6}
# Create interpretation summary
interpretation_data <- data.frame(
  CellType = c("TAM_M2", "Treg", "CAF", "CD8_T_exhausted", 
               "TAM_M1", "CD8_T_effector", "DC"),
  Association = c("Poor", "Poor", "Poor", "Poor",
                  "Good", "Good", "Good"),
  Mechanism = c(
    "Immunosuppressive",
    "Immunosuppressive",
    "TME remodeling",
    "Dysfunctional",
    "Anti-tumor",
    "Cytotoxic",
    "Antigen presentation"
  )
)

interpretation_data$Association <- factor(interpretation_data$Association, levels = c("Good", "Poor"))

ggplot(interpretation_data, aes(x = reorder(CellType, as.numeric(Association)), 
                                y = 1, fill = Association)) +
  geom_tile(color = "white", size = 1) +
  geom_text(aes(label = Mechanism), size = 3.5, color = "white", fontface = "bold") +
  scale_fill_manual(values = c("Good" = "royalblue", "Poor" = "indianred")) +
  coord_flip() +
  labs(
    x = "",
    y = "",
    title = "Biological Interpretation of Survival Associations",
    fill = "Prognosis"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
```

## Publication Figure

```{r publication-figure, fig.width=14, fig.height=10}
# Create comprehensive figure
layout <- "
AABB
CCDD
EEEE
"

p_a <- DimPlot(result, group.by = "celltype", cols = ct_colors, label = TRUE, label.size = 3) +
  ggtitle("A. Tumor Microenvironment") + NoLegend()

p_b <- FeaturePlot(result, features = "scPAS_NRS", pt.size = 0.5) +
  scale_color_gradient2(low = "royalblue", mid = "white", high = "indianred", 
                        midpoint = 0, name = "Risk\nScore") +
  ggtitle("B. Survival Risk Score")

p_c <- DimPlot(result, group.by = "scPAS", cols = class_colors, pt.size = 0.5,
               order = c("0", "scPAS-", "scPAS+")) +
  ggtitle("C. Prognosis Classification")

p_d <- ggplot(scpas_positive, aes(x = reorder(CellType, Proportion), y = Proportion, fill = CellType)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = ct_colors) +
  coord_flip() +
  labs(x = "", y = "% Poor Prognosis (scPAS+)") +
  ggtitle("D. Cell Type Enrichment") +
  theme(legend.position = "none")

p_e <- VlnPlot(result, features = "scPAS_NRS", group.by = "celltype",
               cols = ct_colors[cell_order], pt.size = 0) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  ggtitle("E. Risk Score Distribution by Cell Type") +
  NoLegend() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_a + p_b + p_c + p_d + p_e + plot_layout(design = layout)
```

# Model Application to New Data

The trained model can be applied to independent datasets:

```{r model-application, eval=FALSE}
# Get trained model parameters (compatible with Seurat 4 and 5)
model_params <- Seurat::Misc(result, slot = "scPAS_para")
head(sort(model_params$Coefs, decreasing = TRUE))

# Apply to new bulk data
new_bulk_predictions <- scPAS.prediction(
  model = result,
  test.data = new_bulk_expression,
  do_imputation = FALSE
)

# Apply to spatial transcriptomics
spatial_predictions <- scPAS.prediction(
  model = result,
  test.data = spatial_seurat,
  assay = "Spatial",
  do_imputation = TRUE
)
```

# Key Takeaways

1. **scPAS+ cells** (associated with poor prognosis):
   - M2 tumor-associated macrophages (immunosuppressive)
   - Regulatory T cells (Tregs)
   - Cancer-associated fibroblasts
   - Exhausted CD8+ T cells

2. **scPAS- cells** (associated with good prognosis):
   - M1 macrophages (anti-tumor)
   - Effector CD8+ T cells
   - Dendritic cells

3. **Clinical implications**:
   - Target immunosuppressive populations for therapy
   - Enhance anti-tumor immune responses
   - Biomarker development for prognosis prediction

# Session Information

```{r session-info}
sessionInfo()
```