Multi-Sample Analysis

Overview

Analyzing CNVs across multiple samples enables:

Cohort-level patterns: Identify recurrent CNV events
Sample comparison: Compare CNV profiles between patients
Batch integration: Combine samples using a shared reference

This vignette demonstrates multi-sample workflows in fastCNV.

Preparing Multiple Samples

Loading Samples

library(fastCNV)
library(Seurat)

# Read the three per-patient Seurat objects from disk
sample1 <- readRDS("patient_A.rds")
sample2 <- readRDS("patient_B.rds")
sample3 <- readRDS("patient_C.rds")

# Tabulate the `cell_type` annotation of every sample to confirm it is present
invisible(lapply(list(sample1, sample2, sample3), function(obj) {
  print(table(obj$cell_type))
}))

Sample List Preparation

# Sample labels, listed in the same order as the objects collected below
sample_names <- c("Patient_A", "Patient_B", "Patient_C")

# Collect the Seurat objects into a single list
sample_list <- list(sample1, sample2, sample3)

Running Multi-Sample Analysis

Basic Multi-Sample Workflow

# Labels (values of `cell_type`) passed to fastCNV() as the reference
reference_labels <- c("Fibroblast", "T_cell", "B_cell")

# Analyse every sample in a single fastCNV() call
results <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  referenceLabel = reference_labels,
  referenceVar = "cell_type",
  getCNVClusters = TRUE,
  getCNVPerChromosomeArm = TRUE,
  prepareCounts = TRUE,
  doPlot = TRUE
)

Pooled Reference Analysis

When samples share similar normal cell populations, use a pooled reference:

# Same workflow as the basic run, with the reference pooled over all samples
results <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  pooledReference = TRUE,  # Combine references across samples
  referenceLabel = c("Normal_epithelial", "Fibroblast"),
  referenceVar = "cell_type",
  getCNVClusters = TRUE,
  getCNVPerChromosomeArm = TRUE,
  prepareCounts = TRUE
)

When to use pooled reference:

Scenario	Pooled Reference	Individual Reference
Same tissue type	✓ Recommended	✓ Also valid
Different tissues	✗ Avoid	✓ Recommended
Batch effects	✗ May amplify	✓ Reduces batch effects
Low reference cells	✓ Increases power	✗ May be noisy

Handling Results

Accessing Individual Results

# The result is a list with one element per sample
length(results)

# Pull out each patient's result by position
patient_a <- results[[1]]
patient_b <- results[[2]]
patient_c <- results[[3]]

# Size of every CNV cluster, sample by sample
invisible(lapply(list(patient_a, patient_b, patient_c), function(obj) {
  print(table(obj$cnv_clusters))
}))

Merging Results

# Tag every cell with its sample name before merging. `orig.ident` is set when
# each Seurat object is created and is not guaranteed to match `sample_names`,
# so the label is assigned explicitly instead of being copied from it.
labelled <- Map(function(obj, label) {
  obj$sample <- label
  obj
}, results, sample_names)

# Merge all samples for combined analysis (two or more samples). Prefixing
# cell names with the sample name keeps barcodes unique across samples.
merged <- merge(
  labelled[[1]],
  y = labelled[-1],
  add.cell.ids = sample_names
)

# Visualize combined data
# NOTE(review): DimPlot() needs a dimensional reduction in `merged`; confirm
# that the merged object carries one (merge() may not keep per-sample
# reductions) or compute one before plotting.
DimPlot(merged, group.by = "cnv_clusters", split.by = "sample")

Cross-Sample Comparisons

Comparing CNV Patterns

library(dplyr)
library(ggplot2)

# One data frame per sample (per-cell CNV fraction and CNV cluster), stacked
# into a single long table
cnv_summary <- bind_rows(Map(
  function(obj, label) {
    data.frame(
      sample = label,
      cnv_fraction = obj$cnv_fraction,
      cnv_cluster = obj$cnv_clusters
    )
  },
  results,
  sample_names
))

# Box plot of the per-cell CNV fraction, one box per sample
burden_plot <- ggplot(
  cnv_summary,
  aes(x = sample, y = cnv_fraction, fill = sample)
)
burden_plot +
  geom_boxplot() +
  labs(
    title = "CNV Burden Comparison Across Samples",
    x = "Sample",
    y = "CNV Fraction"
  ) +
  theme_minimal()

Chromosome Arm Comparison

library(tidyr)

# Extract arm-level CNVs: mean of every `arm_*` metadata column, per sample
arm_summary <- lapply(seq_along(results), function(i) {
  arm_data <- results[[i]]@meta.data %>%
    select(starts_with("arm_")) %>%
    colMeans()
  data.frame(
    sample = sample_names[i],
    arm = names(arm_data),
    mean_cnv = unname(arm_data),
    row.names = NULL
  )
}) %>% bind_rows()

# Heatmap of arm-level CNVs (arms in rows, samples in columns)
arm_wide <- arm_summary %>%
  pivot_wider(names_from = sample, values_from = mean_cnv)

# Build the matrix with base R: column_to_rownames() is a tibble function and
# tibble is not attached by this vignette, so calling it here would fail.
arm_matrix <- as.matrix(arm_wide[, setdiff(names(arm_wide), "arm")])
rownames(arm_matrix) <- arm_wide$arm

heatmap(
  arm_matrix,
  scale = "none",
  col = colorRampPalette(c("blue", "white", "red"))(100)
)

Identifying Recurrent CNVs

Finding Common Events

# Define CNV presence threshold: absolute CNV score above which a value counts
# as a CNV (same value as the default `threshold` of identify_recurrent_cnvs())
cnv_threshold <- 0.15

# Identify genomic windows that carry a CNV in several samples.
#
# Arguments:
#   results       List of per-sample objects whose `CNVScoresTrimmed` assay
#                 holds the CNV scores in its `data` slot, with genomic windows
#                 in rows and cells in columns. All samples must have the same
#                 number of windows.
#   threshold     Absolute CNV score above which a cell counts as altered in a
#                 window.
#   sample_names  Optional column labels for the output matrix. Defaults to
#                 `names(results)`, then to "sample_1", "sample_2", ...
#   min_fraction  Fraction of altered cells a sample must exceed for the window
#                 to count as altered in that sample.
#   min_samples   Number of samples in which a window must be altered to be
#                 reported as recurrent.
#
# Returns a list with:
#   matrix             windows x samples matrix of the fraction of altered cells
#   recurrent_windows  row indices of the recurrent windows
identify_recurrent_cnvs <- function(results, threshold = 0.15,
                                    sample_names = NULL,
                                    min_fraction = 0.1,
                                    min_samples = 2) {
  if (length(results) == 0) {
    stop("`results` must contain at least one sample", call. = FALSE)
  }

  # Resolve labels from arguments instead of relying on a global variable
  if (is.null(sample_names)) {
    sample_names <- names(results)
  }
  if (is.null(sample_names)) {
    sample_names <- paste0("sample_", seq_along(results))
  }
  if (length(sample_names) != length(results)) {
    stop("`sample_names` must have one entry per sample", call. = FALSE)
  }

  # Fraction of cells with CNV per window. Windows are rows and cells are
  # columns, so the per-window summary is a row mean.
  cnv_calls <- lapply(results, function(obj) {
    scores <- as.matrix(obj@assays$CNVScoresTrimmed@data)
    rowMeans(abs(scores) > threshold)
  })

  # cbind() would silently recycle vectors of unequal length
  if (length(unique(lengths(cnv_calls))) != 1) {
    stop(
      "All samples must have the same number of genomic windows",
      call. = FALSE
    )
  }

  # Combine across samples: one column per sample
  cnv_matrix <- do.call(cbind, cnv_calls)
  colnames(cnv_matrix) <- sample_names

  # Windows altered in at least `min_samples` samples
  recurrent <- rowSums(cnv_matrix > min_fraction) >= min_samples

  list(
    matrix = cnv_matrix,
    recurrent_windows = which(recurrent)
  )
}

# Pass the threshold defined above explicitly instead of relying on the
# function default, so changing `cnv_threshold` actually affects the calls
recurrent <- identify_recurrent_cnvs(results, threshold = cnv_threshold)
message("Recurrent CNV regions: ", length(recurrent$recurrent_windows))

Visualizing Recurrent CNVs

# Cut-offs used for the recurrence plot: a sample counts as carrying a CNV in
# a window when more than `min_fraction` of its cells are altered, and the
# dashed line marks `min_samples` samples
min_fraction <- 0.1
min_samples <- 2

# Create recurrence plot: number of samples with a CNV, per genomic window
recurrence_data <- data.frame(
  window = seq_len(nrow(recurrent$matrix)),  # safe when there are no windows
  n_samples = rowSums(recurrent$matrix > min_fraction)
)

ggplot(recurrence_data, aes(x = window, y = n_samples)) +
  geom_col(fill = "steelblue") +
  geom_hline(yintercept = min_samples, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(
    title = "CNV Recurrence Across Samples",
    x = "Genomic Window",
    y = "Number of Samples with CNV"
  )

Batch Effect Considerations

Detecting Batch Effects

# PCA on CNV profiles: one profile per sample, made of the mean CNV score of
# every genomic window. Assay data has windows in rows and cells in columns,
# so the per-window mean is a row mean (a column mean would give one value per
# cell, and samples with different cell counts could not be stacked).
# NOTE(review): this reads the `CNVScores` assay while the recurrence analysis
# reads `CNVScoresTrimmed` -- confirm which assay is intended.
cnv_profiles <- lapply(results, function(obj) {
  rowMeans(as.matrix(obj@assays$CNVScores@data))
})
cnv_matrix <- do.call(rbind, cnv_profiles)
rownames(cnv_matrix) <- sample_names

# prcomp(scale. = TRUE) cannot rescale constant columns, so drop windows whose
# mean score is identical (or undefined) across samples
window_sd <- apply(cnv_matrix, 2, sd)
informative <- !is.na(window_sd) & window_sd > 0

# PCA
pca <- prcomp(cnv_matrix[, informative, drop = FALSE], scale. = TRUE)
plot(pca$x[, 1:2], pch = 19, col = seq_along(sample_names))
text(pca$x[, 1:2], labels = sample_names, pos = 3)

Mitigating Batch Effects

# Strategy 1: sample-specific references (pooledReference = FALSE, the default)
results_individual <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  pooledReference = FALSE,  # Each sample uses its own reference
  referenceLabel = "Normal",
  referenceVar = "cell_type"
)

# Strategy 2: Regress out batch in downstream analysis
# After merging, use Seurat integration methods

Cohort-Level Visualization

Combined Heatmap

# Label every cell with its sample name before merging. `orig.ident` is set
# when each Seurat object is created and is not guaranteed to match
# `sample_names`; building the factor from it would give NA for every cell
# whose `orig.ident` is not one of the levels.
labelled_results <- Map(function(obj, label) {
  obj$sample_id <- label
  obj
}, results, sample_names)

# Merge samples for combined visualization (two or more samples); cell names
# are prefixed with the sample name so barcodes stay unique
merged_results <- merge(
  labelled_results[[1]],
  y = labelled_results[-1],
  add.cell.ids = sample_names
)

# Fix the sample order used when splitting the plot
merged_results$sample_id <- factor(
  merged_results$sample_id,
  levels = sample_names
)

# Plot combined heatmap (shows all samples)
plotCNVResults(
  seuratObj = merged_results,
  referenceVar = "cell_type",
  tumorLabel = "Tumor",
  splitPlotOnVar = "sample_id"
)

Sample-Wise CNV Tree

# Build tree for each sample
trees <- lapply(results, function(obj) {
  CNVTree(
    seuratObj = obj,
    referenceVar = "cell_type",
    tumorLabel = "Tumor",
    cnv_thresh = 0.15
  )
})

# Plot trees side by side: one panel per tree, whatever the cohort size.
# par() returns the previous settings, which are restored afterwards so later
# plots are not squeezed into the same multi-panel layout.
old_par <- par(mfrow = c(1, length(trees)))
for (i in seq_along(trees)) {
  plot(trees[[i]], main = sample_names[i])
}
par(old_par)

Memory Management

Large Cohort Processing

# For large cohorts, process samples sequentially.
#
# Loads one Seurat object, runs fastCNV() on it without plotting, saves the
# result as "<sample_name>_cnv_result.rds" and frees the memory.
#
# Arguments:
#   seurat_path  Path to the .rds file holding the sample.
#   sample_name  Sample label; also used as the prefix of the output file.
#   ref_var      Passed to fastCNV() as `referenceVar`.
#   ref_label    Passed to fastCNV() as `referenceLabel`.
#   out_dir      Directory for the output file (created if missing). Defaults
#                to the working directory.
#
# Returns NULL invisibly; called for its side effect of writing the result.
process_sample <- function(seurat_path, sample_name, ref_var, ref_label,
                           out_dir = ".") {
  # Fail fast with a clear message before any expensive work
  if (!file.exists(seurat_path)) {
    stop("Input file not found: ", seurat_path, call. = FALSE)
  }
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  # Load sample
  obj <- readRDS(seurat_path)

  # Run CNV analysis
  result <- fastCNV(
    seuratObj = obj,
    sampleName = sample_name,
    referenceVar = ref_var,
    referenceLabel = ref_label,
    prepareCounts = TRUE,
    getCNVPerChromosomeArm = TRUE,
    getCNVClusters = TRUE,
    doPlot = FALSE  # Skip plotting to save memory
  )

  # Save result
  saveRDS(result, file.path(out_dir, paste0(sample_name, "_cnv_result.rds")))

  # Clean up: drop both large objects so gc() can actually reclaim them
  rm(obj, result)
  invisible(gc())

  invisible(NULL)
}

# Cohort manifest: input files and matching sample labels, in the same order
sample_paths <- c("patient_A.rds", "patient_B.rds", "patient_C.rds")
sample_names <- c("Patient_A", "Patient_B", "Patient_C")

# Run the samples one after another, announcing each one first
invisible(Map(
  function(path, label) {
    message("Processing: ", label)
    process_sample(
      path,
      label,
      ref_var = "cell_type",
      ref_label = c("Normal")
    )
  },
  sample_paths,
  sample_names
))

Parallel Processing

library(future)
library(future.apply)

# Set up parallel backend. Never ask for more workers than there are samples
# or available cores; plan() returns the previously active plan so it can be
# restored afterwards instead of being overwritten.
n_workers <- max(1, min(4, length(sample_list), availableCores()))
previous_plan <- plan(multisession, workers = n_workers)

# Process samples in parallel. Iterating over the objects themselves (rather
# than over indices into `sample_list`) means the whole list is not exported
# to every worker as a global. `future.seed = TRUE` gives each sample a
# reproducible, independent random number stream.
results <- tryCatch(
  future_mapply(
    function(obj, label) {
      fastCNV(
        seuratObj = obj,
        sampleName = label,
        referenceVar = "cell_type",
        referenceLabel = c("Normal"),
        doPlot = FALSE
      )
    },
    sample_list,
    sample_names,
    SIMPLIFY = FALSE,
    USE.NAMES = FALSE,
    future.seed = TRUE
  ),
  # Restore the previous plan even if a worker fails
  finally = plan(previous_plan)
)

Case Study: Tumor Cohort Analysis

Workflow Example

# 1. Load cohort data (the element names double as sample names below)
cohort_files <- c(
  tumor_1 = "GBM_patient_1.rds",
  tumor_2 = "GBM_patient_2.rds",
  tumor_3 = "GBM_patient_3.rds"
)
cohort <- lapply(cohort_files, readRDS)

# 2. Run CNV analysis with pooled reference
cnv_results <- fastCNV(
  seuratObj = cohort,
  sampleName = names(cohort),
  pooledReference = TRUE,
  referenceLabel = c("Oligodendrocyte", "Astrocyte"),
  referenceVar = "cell_type",
  getCNVClusters = TRUE,
  windowSize = 150
)

# 3. Identify recurrent GBM-associated CNVs
# Expected: chr7 gain, chr10 loss

# 4. Compare clonal composition across patients:
#    share of each CNV cluster among all cells of the sample
clone_composition <- lapply(cnv_results, function(sample_obj) {
  cluster_sizes <- table(sample_obj$cnv_clusters)
  cluster_sizes / ncol(sample_obj)
})

# 5. Export results
saveRDS(cnv_results, "GBM_cohort_cnv_results.rds")

Summary

Key points for multi-sample analysis:

Consistent annotations: Use the same cell type labels across samples
Reference selection: Choose appropriate reference strategy (pooled vs. individual)
Batch awareness: Consider technical variation between samples
Memory management: Process large cohorts sequentially to limit memory use, or in parallel to reduce run time
Recurrence analysis: Identify biologically meaningful recurrent CNVs

Session Info

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] rmarkdown_2.31
#> 
#> loaded via a namespace (and not attached):
#>  [1] digest_0.6.39    R6_2.6.1         fastmap_1.2.0    xfun_0.59       
#>  [5] maketools_1.3.2  cachem_1.1.0     knitr_1.51       htmltools_0.5.9 
#>  [9] buildtools_1.0.0 lifecycle_1.0.5  cli_3.6.6        sass_0.4.10     
#> [13] jquerylib_0.1.4  compiler_4.6.0   sys_3.4.3        tools_4.6.0     
#> [17] evaluate_1.0.5   bslib_0.11.0     yaml_2.3.12      otel_0.2.0      
#> [21] jsonlite_2.0.0   rlang_1.2.0