--- title: "Multi-Sample Analysis" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Multi-Sample Analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 10, fig.height = 7, eval = FALSE ) ``` ## Overview Analyzing CNVs across multiple samples enables: - **Cohort-level patterns**: Identify recurrent CNV events - **Sample comparison**: Compare CNV profiles between patients - **Batch integration**: Combine samples with shared reference This vignette demonstrates multi-sample workflows in fastCNV. ## Preparing Multiple Samples ### Loading Samples ```{r load-samples} library(fastCNV) library(Seurat) # Load individual Seurat objects sample1 <- readRDS("patient_A.rds") sample2 <- readRDS("patient_B.rds") sample3 <- readRDS("patient_C.rds") # Verify cell type annotations exist table(sample1$cell_type) table(sample2$cell_type) table(sample3$cell_type) ``` ### Sample List Preparation ```{r prepare-list} # Create a list of Seurat objects sample_list <- list(sample1, sample2, sample3) # Define sample names sample_names <- c("Patient_A", "Patient_B", "Patient_C") ``` ## Running Multi-Sample Analysis ### Basic Multi-Sample Workflow ```{r multi-sample-basic} # Run fastCNV on multiple samples results <- fastCNV( seuratObj = sample_list, sampleName = sample_names, referenceVar = "cell_type", referenceLabel = c("Fibroblast", "T_cell", "B_cell"), prepareCounts = TRUE, getCNVPerChromosomeArm = TRUE, getCNVClusters = TRUE, doPlot = TRUE ) ``` ### Pooled Reference Analysis When samples share similar normal cell populations, use a pooled reference: ```{r pooled-reference} results <- fastCNV( seuratObj = sample_list, sampleName = sample_names, referenceVar = "cell_type", referenceLabel = c("Normal_epithelial", "Fibroblast"), pooledReference = TRUE, # Combine references across samples prepareCounts = TRUE, getCNVPerChromosomeArm = TRUE, getCNVClusters = TRUE ) ``` **When to use pooled reference:** | Scenario | Pooled Reference | Individual Reference | |----------|------------------|----------------------| | Same tissue type | ✓ Recommended | ✓ Also valid | | Different tissues | ✗ Avoid | ✓ Recommended | | Batch effects | ✗ May amplify | ✓ Reduces batch effects | | Low reference cells | ✓ Increases power | ✗ May be noisy | ## Handling Results ### Accessing Individual Results ```{r access-results} # Results are returned as a list length(results) # Number of samples # Access individual sample patient_a <- results[[1]] patient_b <- results[[2]] patient_c <- results[[3]] # Check CNV clusters per sample table(patient_a$cnv_clusters) table(patient_b$cnv_clusters) table(patient_c$cnv_clusters) ``` ### Merging Results ```{r merge-results} # Merge all samples for combined analysis merged <- merge(results[[1]], results[2:3]) # Add sample identifier merged$sample <- merged$orig.ident # Visualize combined data DimPlot(merged, group.by = "cnv_clusters", split.by = "sample") ``` ## Cross-Sample Comparisons ### Comparing CNV Patterns ```{r compare-patterns} library(dplyr) library(ggplot2) # Extract CNV fractions from all samples cnv_summary <- lapply(seq_along(results), function(i) { data.frame( sample = sample_names[i], cnv_fraction = results[[i]]$cnv_fraction, cnv_cluster = results[[i]]$cnv_clusters ) }) %>% bind_rows() # Compare CNV burden across samples ggplot(cnv_summary, aes(x = sample, y = cnv_fraction, fill = sample)) + geom_boxplot() + theme_minimal() + labs( title = "CNV Burden Comparison Across Samples", x = "Sample", y = "CNV Fraction" ) ``` ### Chromosome Arm Comparison ```{r arm-comparison} # Extract arm-level CNVs arm_summary <- lapply(seq_along(results), function(i) { arm_data <- results[[i]]@meta.data %>% select(starts_with("arm_")) %>% colMeans() data.frame( sample = sample_names[i], arm = names(arm_data), mean_cnv = arm_data ) }) %>% bind_rows() # Heatmap of arm-level CNVs library(tidyr) arm_matrix <- arm_summary %>% pivot_wider(names_from = sample, values_from = mean_cnv) %>% column_to_rownames("arm") %>% as.matrix() heatmap(arm_matrix, scale = "none", col = colorRampPalette(c("blue", "white", "red"))(100)) ``` ## Identifying Recurrent CNVs ### Finding Common Events ```{r recurrent-cnvs} # Define CNV presence threshold cnv_threshold <- 0.15 # Function to identify recurrent CNVs identify_recurrent_cnvs <- function(results, threshold = 0.15) { # Get CNV calls per sample cnv_calls <- lapply(results, function(obj) { scores <- as.matrix(obj@assays$CNVScoresTrimmed@data) # Fraction of cells with CNV per window colMeans(abs(scores) > threshold) }) # Combine across samples cnv_matrix <- do.call(cbind, cnv_calls) colnames(cnv_matrix) <- sample_names # Identify windows with CNV in multiple samples recurrent <- rowSums(cnv_matrix > 0.1) >= 2 # CNV in ≥2 samples list( matrix = cnv_matrix, recurrent_windows = which(recurrent) ) } recurrent <- identify_recurrent_cnvs(results) message("Recurrent CNV regions: ", length(recurrent$recurrent_windows)) ``` ### Visualizing Recurrent CNVs ```{r plot-recurrent} # Create recurrence plot recurrence_data <- data.frame( window = 1:nrow(recurrent$matrix), n_samples = rowSums(recurrent$matrix > 0.1) ) ggplot(recurrence_data, aes(x = window, y = n_samples)) + geom_bar(stat = "identity", fill = "steelblue") + geom_hline(yintercept = 2, linetype = "dashed", color = "red") + theme_minimal() + labs( title = "CNV Recurrence Across Samples", x = "Genomic Window", y = "Number of Samples with CNV" ) ``` ## Batch Effect Considerations ### Detecting Batch Effects ```{r batch-detection} # PCA on CNV profiles cnv_profiles <- lapply(results, function(obj) { colMeans(as.matrix(obj@assays$CNVScores@data)) }) cnv_matrix <- do.call(rbind, cnv_profiles) rownames(cnv_matrix) <- sample_names # PCA pca <- prcomp(cnv_matrix, scale = TRUE) plot(pca$x[, 1:2], pch = 19, col = 1:length(sample_names)) text(pca$x[, 1:2], labels = sample_names, pos = 3) ``` ### Mitigating Batch Effects ```{r batch-mitigation} # Strategy 1: Use sample-specific references (default) results_individual <- fastCNV( seuratObj = sample_list, sampleName = sample_names, referenceVar = "cell_type", referenceLabel = c("Normal"), pooledReference = FALSE # Each sample uses its own reference ) # Strategy 2: Regress out batch in downstream analysis # After merging, use Seurat integration methods ``` ## Cohort-Level Visualization ### Combined Heatmap ```{r combined-heatmap} # Merge samples for combined visualization merged_results <- merge(results[[1]], results[2:3]) # Add sample labels merged_results$sample_id <- factor( merged_results$orig.ident, levels = sample_names ) # Plot combined heatmap (shows all samples) plotCNVResults( seuratObj = merged_results, referenceVar = "cell_type", tumorLabel = "Tumor", splitPlotOnVar = "sample_id" ) ``` ### Sample-Wise CNV Tree ```{r sample-trees} # Build tree for each sample trees <- lapply(results, function(obj) { CNVTree( seuratObj = obj, referenceVar = "cell_type", tumorLabel = "Tumor", cnv_thresh = 0.15 ) }) # Plot trees side by side par(mfrow = c(1, 3)) for (i in seq_along(trees)) { plot(trees[[i]], main = sample_names[i]) } ``` ## Memory Management ### Large Cohort Processing ```{r memory-management} # For large cohorts, process samples sequentially process_sample <- function(seurat_path, sample_name, ref_var, ref_label) { # Load sample obj <- readRDS(seurat_path) # Run CNV analysis result <- fastCNV( seuratObj = obj, sampleName = sample_name, referenceVar = ref_var, referenceLabel = ref_label, prepareCounts = TRUE, getCNVPerChromosomeArm = TRUE, getCNVClusters = TRUE, doPlot = FALSE # Skip plotting to save memory ) # Save result saveRDS(result, paste0(sample_name, "_cnv_result.rds")) # Clean up rm(obj) invisible(gc()) return(invisible(NULL)) } # Process cohort sample_paths <- c("patient_A.rds", "patient_B.rds", "patient_C.rds") sample_names <- c("Patient_A", "Patient_B", "Patient_C") for (i in seq_along(sample_paths)) { message("Processing: ", sample_names[i]) process_sample( sample_paths[i], sample_names[i], ref_var = "cell_type", ref_label = c("Normal") ) } ``` ### Parallel Processing ```{r parallel} library(future) library(future.apply) # Set up parallel backend plan(multisession, workers = 4) # Process samples in parallel results <- future_lapply(seq_along(sample_list), function(i) { fastCNV( seuratObj = sample_list[[i]], sampleName = sample_names[i], referenceVar = "cell_type", referenceLabel = c("Normal"), doPlot = FALSE ) }) # Reset to sequential plan(sequential) ``` ## Case Study: Tumor Cohort Analysis ### Workflow Example ```{r case-study} # 1. Load cohort data cohort <- list( tumor_1 = readRDS("GBM_patient_1.rds"), tumor_2 = readRDS("GBM_patient_2.rds"), tumor_3 = readRDS("GBM_patient_3.rds") ) # 2. Run CNV analysis with pooled reference cnv_results <- fastCNV( seuratObj = cohort, sampleName = names(cohort), referenceVar = "cell_type", referenceLabel = c("Oligodendrocyte", "Astrocyte"), pooledReference = TRUE, windowSize = 150, getCNVClusters = TRUE ) # 3. Identify recurrent GBM-associated CNVs # Expected: chr7 gain, chr10 loss # 4. Compare clonal composition across patients clone_composition <- lapply(cnv_results, function(obj) { table(obj$cnv_clusters) / ncol(obj) }) # 5. Export results saveRDS(cnv_results, "GBM_cohort_cnv_results.rds") ``` ## Summary Key points for multi-sample analysis: 1. **Consistent annotations**: Use the same cell type labels across samples 2. **Reference selection**: Choose appropriate reference strategy (pooled vs. individual) 3. **Batch awareness**: Consider technical variation between samples 4. **Memory management**: Process large cohorts sequentially or in parallel 5. **Recurrence analysis**: Identify biologically meaningful recurrent CNVs ## Session Info ```{r session-info, eval=TRUE} sessionInfo() ```