---
title: "Multi-Sample Analysis"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Multi-Sample Analysis}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Chunk defaults for the whole vignette. With `eval = FALSE` the examples
# below are shown but not executed when the vignette is built.
knitr::opts_chunk$set(
  eval = FALSE,
  fig.height = 7,
  fig.width = 10,
  comment = "#>",
  collapse = TRUE
)
```

## Overview

Analyzing copy number variations (CNVs) across multiple samples enables:

- **Cohort-level patterns**: Identify recurrent CNV events
- **Sample comparison**: Compare CNV profiles between patients
- **Batch integration**: Combine samples with a shared reference

This vignette demonstrates multi-sample workflows in fastCNV.

## Preparing Multiple Samples

### Loading Samples

```{r load-samples}
library(fastCNV)
library(Seurat)

# Read one saved Seurat object per patient
sample1 <- readRDS("patient_A.rds")
sample2 <- readRDS("patient_B.rds")
sample3 <- readRDS("patient_C.rds")

# Print the `cell_type` label counts of every object to confirm the
# annotation column is present
invisible(lapply(
  list(sample1, sample2, sample3),
  function(obj) print(table(obj$cell_type))
))
```

### Sample List Preparation

```{r prepare-list}
# Labels for the samples, listed in the same order as the objects below
sample_names <- c("Patient_A", "Patient_B", "Patient_C")

# Bundle the loaded Seurat objects into a single list
sample_list <- list(sample1, sample2, sample3)
```

## Running Multi-Sample Analysis

### Basic Multi-Sample Workflow

```{r multi-sample-basic}
# Run fastCNV on multiple samples
# `seuratObj` receives the list of Seurat objects and `sampleName` the vector
# of labels prepared in the previous chunk. `referenceVar` names the metadata
# column that is read and `referenceLabel` the values within it that are used
# as reference (presumably the non-malignant cell types -- confirm against
# ?fastCNV). The result is stored in `results` for the chunks that follow.
results <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  referenceVar = "cell_type",
  referenceLabel = c("Fibroblast", "T_cell", "B_cell"),
  prepareCounts = TRUE,
  getCNVPerChromosomeArm = TRUE,
  getCNVClusters = TRUE,
  doPlot = TRUE
)
```

### Pooled Reference Analysis

When samples share similar normal cell populations, use a pooled reference:

```{r pooled-reference}
# Same multi-sample call as before, but with `pooledReference = TRUE` and a
# different set of reference labels. Note that this overwrites `results`.
results <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  referenceVar = "cell_type",
  referenceLabel = c("Normal_epithelial", "Fibroblast"),
  pooledReference = TRUE,  # Combine references across samples
  prepareCounts = TRUE,
  getCNVPerChromosomeArm = TRUE,
  getCNVClusters = TRUE
)
```

**When to use a pooled reference:**

| Scenario | Pooled Reference | Individual Reference |
|----------|------------------|----------------------|
| Same tissue type | ✓ Recommended | ✓ Also valid |
| Different tissues | ✗ Avoid | ✓ Recommended |
| Batch effects | ✗ May amplify | ✓ Reduces batch effects |
| Low reference cells | ✓ Increases power | ✗ May be noisy |

## Handling Results

### Accessing Individual Results

```{r access-results}
# `results` is a list; its length is the number of samples
length(results)

# Pull out each patient's object by position
patient_a <- results[[1]]
patient_b <- results[[2]]
patient_c <- results[[3]]

# Print the CNV cluster sizes of every patient
invisible(lapply(
  list(patient_a, patient_b, patient_c),
  function(obj) print(table(obj$cnv_clusters))
))
```

### Merging Results

```{r merge-results}
# Tag every object with its sample label *before* merging. `orig.ident` only
# holds whatever identity each object was created with, so it is not
# guaranteed to match `sample_names`.
labelled <- Map(function(obj, label) {
  obj$sample <- label
  obj
}, results, sample_names)

# Merge all samples for combined analysis. `labelled[-1]` works for any
# cohort size, and `add.cell.ids` keeps cell names unique when two samples
# share a barcode.
merged <- merge(labelled[[1]], y = labelled[-1], add.cell.ids = sample_names)

# merge() does not carry dimensional reductions over by default, so an
# embedding has to be computed on the merged object before DimPlot() can
# draw anything. This is the standard Seurat workflow; it assumes the default
# assay holds the expression data -- adjust if your objects differ.
merged <- NormalizeData(merged)
merged <- FindVariableFeatures(merged)
merged <- ScaleData(merged)
merged <- RunPCA(merged)
merged <- RunUMAP(merged, dims = 1:30)

# Visualize combined data
DimPlot(merged, group.by = "cnv_clusters", split.by = "sample")
```

## Cross-Sample Comparisons

### Comparing CNV Patterns

```{r compare-patterns}
library(dplyr)
library(ggplot2)

# Stack the per-cell CNV fraction and CNV cluster of every sample into one
# long data frame, labelled with the sample name
cnv_summary <- bind_rows(lapply(seq_along(results), function(idx) {
  obj <- results[[idx]]
  data.frame(
    sample = sample_names[idx],
    cnv_fraction = obj$cnv_fraction,
    cnv_cluster = obj$cnv_clusters
  )
}))

# One box per sample showing the distribution of CNV fraction across cells
ggplot(cnv_summary, aes(x = sample, y = cnv_fraction, fill = sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("CNV Burden Comparison Across Samples") +
  xlab("Sample") +
  ylab("CNV Fraction")
```

### Chromosome Arm Comparison

```{r arm-comparison}
# Extract arm-level CNVs: for every sample, average each `arm_*` metadata
# column over all cells and store the result in long format
arm_summary <- lapply(seq_along(results), function(i) {
  arm_data <- results[[i]]@meta.data %>%
    select(starts_with("arm_")) %>%
    colMeans()
  data.frame(
    sample = sample_names[i],
    arm = names(arm_data),
    mean_cnv = arm_data
  )
}) %>% bind_rows()

# Reshape to one row per arm and one column per sample
library(tidyr)
arm_wide <- arm_summary %>%
  pivot_wider(names_from = sample, values_from = mean_cnv)

# Build the numeric matrix with base R. column_to_rownames() belongs to the
# tibble package, which this vignette never attaches, so calling it here
# would stop with "could not find function".
arm_matrix <- as.matrix(arm_wide[, setdiff(names(arm_wide), "arm")])
rownames(arm_matrix) <- arm_wide$arm

# Heatmap of arm-level CNVs (blue = low, red = high, no rescaling)
heatmap(
  arm_matrix,
  scale = "none",
  col = colorRampPalette(c("blue", "white", "red"))(100)
)
```

## Identifying Recurrent CNVs

### Finding Common Events

```{r recurrent-cnvs}
# Define CNV presence threshold
cnv_threshold <- 0.15

# Identify genomic windows that carry a CNV in several samples.
#
# Arguments:
#   results            List of result objects, each exposing a
#                      `CNVScoresTrimmed` assay with a `data` slot.
#   threshold          Absolute score above which a cell counts as carrying
#                      a CNV in a window.
#   sample_labels      Column names for the returned matrix. Defaults to
#                      `names(results)`, falling back to "Sample_1", ...
#   min_cell_fraction  Fraction of cells that must exceed `threshold` for a
#                      window to count as altered in a sample.
#   min_samples        Number of samples in which a window must be altered to
#                      be reported as recurrent.
#
# Returns a list with `matrix` (windows x samples, fraction of cells with a
# CNV) and `recurrent_windows` (row indices of the recurrent windows).
identify_recurrent_cnvs <- function(results, threshold = 0.15,
                                    sample_labels = NULL,
                                    min_cell_fraction = 0.1,
                                    min_samples = 2) {
  stopifnot(is.list(results), length(results) > 0)

  # Get CNV calls per sample
  cnv_calls <- lapply(results, function(obj) {
    scores <- as.matrix(obj@assays$CNVScoresTrimmed@data)
    # Seurat assays are stored features x cells, so windows are the rows:
    # rowMeans() gives the fraction of cells with a CNV per window.
    rowMeans(abs(scores) > threshold)
  })

  # cbind() would silently recycle shorter vectors, so insist on a common
  # window set across samples
  if (length(unique(lengths(cnv_calls))) != 1L) {
    stop("All samples must have the same number of genomic windows.",
         call. = FALSE)
  }

  # Combine across samples
  cnv_matrix <- do.call(cbind, cnv_calls)

  if (is.null(sample_labels)) {
    sample_labels <- names(results)
  }
  if (is.null(sample_labels)) {
    sample_labels <- paste0("Sample_", seq_along(results))
  }
  stopifnot(length(sample_labels) == ncol(cnv_matrix))
  colnames(cnv_matrix) <- sample_labels

  # Identify windows with CNV in multiple samples
  recurrent <- rowSums(cnv_matrix > min_cell_fraction) >= min_samples

  list(
    matrix = cnv_matrix,
    recurrent_windows = which(recurrent)
  )
}

# Pass the threshold defined at the top of this chunk explicitly; otherwise
# changing `cnv_threshold` has no effect because the function falls back to
# its own default.
recurrent <- identify_recurrent_cnvs(results, threshold = cnv_threshold)
message("Recurrent CNV regions: ", length(recurrent$recurrent_windows))
```

### Visualizing Recurrent CNVs

```{r plot-recurrent}
# Create recurrence plot: for every genomic window, count the samples in
# which more than 10% of cells carry a CNV
recurrence_data <- data.frame(
  # seq_len() stays empty for a zero-row matrix, unlike 1:nrow()
  window = seq_len(nrow(recurrent$matrix)),
  n_samples = rowSums(recurrent$matrix > 0.1)
)

ggplot(recurrence_data, aes(x = window, y = n_samples)) +
  # geom_col() draws bars from the values as given
  geom_col(fill = "steelblue") +
  # Dashed line marks the "recurrent" cut-off of two samples
  geom_hline(yintercept = 2, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(
    title = "CNV Recurrence Across Samples",
    x = "Genomic Window",
    y = "Number of Samples with CNV"
  )
```

## Batch Effect Considerations

### Detecting Batch Effects

```{r batch-detection}
# PCA on CNV profiles
# Seurat assays are stored features x cells, so rowMeans() yields one mean
# score per genomic window -- a profile of equal length for every sample.
cnv_profiles <- lapply(results, function(obj) {
  rowMeans(as.matrix(obj@assays$CNVScores@data))
})
cnv_matrix <- do.call(rbind, cnv_profiles)
rownames(cnv_matrix) <- sample_names

# Windows that are identical in every sample carry no information and make
# prcomp(scale. = TRUE) fail, so drop them first
informative <- apply(cnv_matrix, 2, stats::var) > 0
cnv_matrix <- cnv_matrix[, informative, drop = FALSE]

# PCA (samples are the observations, windows the variables)
pca <- prcomp(cnv_matrix, scale. = TRUE)
plot(pca$x[, 1:2], pch = 19, col = seq_along(sample_names))
text(pca$x[, 1:2], labels = sample_names, pos = 3)
```

### Mitigating Batch Effects

```{r batch-mitigation}
# Strategy 1: Use sample-specific references (default)
# Same multi-sample call as earlier in the vignette, but with
# `pooledReference = FALSE`. The result is kept in a separate variable so the
# earlier `results` object is left untouched.
results_individual <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  referenceVar = "cell_type",
  referenceLabel = c("Normal"),
  pooledReference = FALSE  # Each sample uses its own reference
)

# Strategy 2: Regress out batch in downstream analysis
# After merging, use Seurat integration methods
# (no code is shown for this strategy in this vignette)
```

## Cohort-Level Visualization

### Combined Heatmap

```{r combined-heatmap}
# Label every object with its sample name before merging. Building the
# factor from `orig.ident` would turn every value into NA whenever the
# objects' original identities differ from `sample_names`.
labelled_results <- Map(function(obj, label) {
  obj$sample_id <- label
  obj
}, results, sample_names)

# Merge samples for combined visualization. `[-1]` handles any number of
# samples; `add.cell.ids` keeps cell names unique across samples.
merged_results <- merge(
  labelled_results[[1]],
  y = labelled_results[-1],
  add.cell.ids = sample_names
)

# Fix the panel order to the order of `sample_names`
merged_results$sample_id <- factor(
  merged_results$sample_id,
  levels = sample_names
)

# Plot combined heatmap (shows all samples)
plotCNVResults(
  seuratObj = merged_results,
  referenceVar = "cell_type",
  tumorLabel = "Tumor",
  splitPlotOnVar = "sample_id"
)
```

### Sample-Wise CNV Tree

```{r sample-trees}
# Build tree for each sample
trees <- lapply(results, function(obj) {
  CNVTree(
    seuratObj = obj,
    referenceVar = "cell_type",
    tumorLabel = "Tumor",
    cnv_thresh = 0.15
  )
})

# Plot trees side by side: one panel per tree rather than a fixed three.
# par() returns the previous settings, which are restored afterwards so that
# later plots are not squeezed into the same layout.
old_par <- par(mfrow = c(1, max(1, length(trees))))
for (i in seq_along(trees)) {
  plot(trees[[i]], main = sample_names[i])
}
par(old_par)
```

## Memory Management

### Large Cohort Processing

```{r memory-management}
# For large cohorts, process samples sequentially
# Run the CNV analysis for one sample stored on disk and save the result.
#
# Arguments:
#   seurat_path  Path to an .rds file that is read with readRDS().
#   sample_name  Sample label; also the prefix of the output file name.
#   ref_var      Passed to fastCNV() as `referenceVar`.
#   ref_label    Passed to fastCNV() as `referenceLabel`.
#
# Writes "<sample_name>_cnv_result.rds" to the working directory and returns
# NULL invisibly.
process_sample <- function(seurat_path, sample_name, ref_var, ref_label) {
  seurat_obj <- readRDS(seurat_path)
  out_file <- paste0(sample_name, "_cnv_result.rds")

  # Plotting is switched off to save memory
  cnv_result <- fastCNV(
    seuratObj = seurat_obj,
    sampleName = sample_name,
    referenceVar = ref_var,
    referenceLabel = ref_label,
    prepareCounts = TRUE,
    getCNVPerChromosomeArm = TRUE,
    getCNVClusters = TRUE,
    doPlot = FALSE
  )

  saveRDS(cnv_result, out_file)

  # Drop the input object and trigger garbage collection before returning
  rm(seurat_obj)
  invisible(gc())

  invisible(NULL)
}

# Process cohort: input files and their labels, in matching order
sample_paths <- c("patient_A.rds", "patient_B.rds", "patient_C.rds")
sample_names <- c("Patient_A", "Patient_B", "Patient_C")

# Handle one sample at a time, announcing each before it starts
invisible(Map(
  function(path, label) {
    message("Processing: ", label)
    process_sample(
      path,
      label,
      ref_var = "cell_type",
      ref_label = c("Normal")
    )
  },
  sample_paths,
  sample_names
))
```

### Parallel Processing

```{r parallel}
library(future)
library(future.apply)

# Set up parallel backend. plan() returns the previously active strategy,
# which is kept so it can be restored afterwards.
old_plan <- plan(multisession, workers = 4)

# Process samples in parallel.
# - Iterating over the objects themselves (instead of over indices into
#   `sample_list`) avoids exporting the full list to every worker.
# - `future.seed = TRUE` gives each task its own reproducible random-number
#   stream.
# - `finally` restores the previous plan even if a worker fails.
results <- tryCatch(
  future_mapply(
    function(obj, label) {
      fastCNV(
        seuratObj = obj,
        sampleName = label,
        referenceVar = "cell_type",
        referenceLabel = c("Normal"),
        doPlot = FALSE
      )
    },
    sample_list,
    sample_names,
    SIMPLIFY = FALSE,   # always return a list, one element per sample
    USE.NAMES = FALSE,
    future.seed = TRUE
  ),
  finally = plan(old_plan)
)
```

## Case Study: Tumor Cohort Analysis

### Workflow Example

```{r case-study}
# 1. Load cohort data into a named list (names become the sample labels)
cohort <- lapply(
  c(
    tumor_1 = "GBM_patient_1.rds",
    tumor_2 = "GBM_patient_2.rds",
    tumor_3 = "GBM_patient_3.rds"
  ),
  readRDS
)

# 2. Run CNV analysis with pooled reference
cnv_results <- fastCNV(
  seuratObj = cohort,
  sampleName = names(cohort),
  referenceVar = "cell_type",
  referenceLabel = c("Oligodendrocyte", "Astrocyte"),
  pooledReference = TRUE,
  windowSize = 150,
  getCNVClusters = TRUE
)

# 3. Identify recurrent GBM-associated CNVs
# Expected: chr7 gain, chr10 loss

# 4. Compare clonal composition across patients: share of each CNV cluster
#    among all cells of a patient
clone_composition <- lapply(cnv_results, function(sample_obj) {
  cluster_sizes <- table(sample_obj$cnv_clusters)
  cluster_sizes / ncol(sample_obj)
})

# 5. Export results
saveRDS(cnv_results, file = "GBM_cohort_cnv_results.rds")
```

## Summary

Key points for multi-sample analysis:

1. **Consistent annotations**: Use the same cell type labels across samples
2. **Reference selection**: Choose appropriate reference strategy (pooled vs. individual)
3. **Batch awareness**: Consider technical variation between samples
4. **Memory management**: Process large cohorts sequentially or in parallel
5. **Recurrence analysis**: Identify biologically meaningful recurrent CNVs

## Session Info

```{r session-info, eval=TRUE}
# Print the R version, platform and loaded packages. This is the only chunk
# in the vignette that is evaluated (`eval=TRUE` in its header).
sessionInfo()
```