---
title: "Advanced Usage"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
vignette: >
  %\VignetteIndexEntry{Advanced Usage}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
# Chunks are shown but not evaluated (eval = FALSE): they rely on user data
# objects such as `pbmc` that are not available at build time.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  eval = FALSE
)
```

## Introduction

This vignette covers advanced usage patterns for **recall**, including:

- Choosing the optimal null distribution
- Parallel computing configurations
- Count splitting alternative
- Custom parameter tuning
- Integration with downstream analyses

## Null Distribution Selection

### Available Methods

**recall** supports multiple methods for generating synthetic null variables:

| Method | Distribution | Correlations | Speed | Best For |
|--------|-------------|--------------|-------|----------|
| `ZIP` | Zero-Inflated Poisson | No | Fast | Most scRNA-seq data |
| `NB` | Negative Binomial | No | Fast | Overdispersed data |
| `ZIP-copula` | ZIP + Gaussian copula | Yes | Slow | Correlated genes |
| `NB-copula` | NB + Gaussian copula | Yes | Slow | Complex datasets |
| `Poisson-copula` | Poisson + copula | Yes | Medium | Low-variance data |
| `Gaussian-copula` | Gaussian + copula | Yes | Medium | Normalized data |

### Choosing the Right Method

```{r null-method-selection}
library(recall)
library(Seurat)

# For standard scRNA-seq data with excess zeros
pbmc_zip <- FindClustersRecall(
  pbmc,
  null_method = "ZIP", # Default, fastest
  resolution_start = 0.8
)

# For highly overdispersed data
pbmc_nb <- FindClustersRecall(
  pbmc,
  null_method = "NB",
  resolution_start = 0.8
)

# For data with strong gene-gene correlations
pbmc_copula <- FindClustersRecall(
  pbmc,
  null_method = "NB-copula",
  cores = 4, # Parallel processing recommended
  resolution_start = 0.8
)
```

### Assessing Data Characteristics

```{r data-assessment}
# Check for overdispersion
counts <- GetAssayData(pbmc, layer = "counts")

# Per-gene mean and sample variance. The Matrix:: methods work on the
# (usually sparse) counts matrix directly; apply(counts, 1, var) would first
# convert it to a dense matrix, which can exhaust memory on large datasets.
n_cells <- ncol(counts)
means <- Matrix::rowMeans(counts)
vars <- (Matrix::rowSums(counts^2) - n_cells * means^2) / (n_cells - 1)

# Overdispersion ratio (NaN for genes with zero mean; dropped via na.rm below)
overdispersion_ratio <- vars / means
summary(overdispersion_ratio)

# If median ratio >> 1, use NB instead of ZIP
if (median(overdispersion_ratio, na.rm = TRUE) > 2) {
  message("Data appears overdispersed. Consider using null_method = 'NB'")
}
```

## Parallel Computing

### Cross-Platform Configuration

**recall** automatically handles parallelization across platforms:

- **Linux/macOS**: Uses `multicore` (fork-based)
- **Windows**: Uses `multisession` (socket-based)

```{r parallel-config}
# Leave one core free, but never request fewer than one worker:
# detectCores() returns 1 on single-core machines and may return NA.
n_cores <- max(1, parallel::detectCores() - 1, na.rm = TRUE)

# Automatic platform detection
pbmc <- FindClustersRecall(
  pbmc,
  cores = n_cores,
  shared_memory_max = 16000 * 1024^2 # 16GB for large datasets
)
```

### Memory Management

For large datasets, adjust memory settings:

```{r memory-management}
# For datasets with >100,000 cells
pbmc_large <- FindClustersRecall(
  large_seurat_obj,
  cores = 8,
  shared_memory_max = 32000 * 1024^2, # 32GB
  null_method = "ZIP" # Use faster method for large data
)
```

## Count Splitting Alternative

**recall** also implements the count splitting approach as an alternative to knockoff filtering:

```{r count-splitting}
# Count splitting method (Neufeld et al., 2022)
pbmc_countsplit <- FindClustersCountsplit(
  pbmc,
  resolution_start = 0.8,
  algorithm = "leiden"
)
```

### Method Comparison

```{r method-comparison, fig.width=12, fig.height=5}
library(patchwork)

# Run both methods
pbmc_recall <- FindClustersRecall(pbmc, resolution_start = 0.8)
pbmc_countsplit <- FindClustersCountsplit(pbmc, resolution_start = 0.8)

# Compare results
# ggtitle() is namespace-qualified because ggplot2 is not attached by
# library(Seurat) or library(patchwork).
n_recall <- length(unique(Idents(pbmc_recall)))
n_countsplit <- length(unique(pbmc_countsplit$countsplit_clusters))

p1 <- DimPlot(pbmc_recall, label = TRUE) +
  ggplot2::ggtitle(paste0("recall (", n_recall, " clusters)"))

p2 <- DimPlot(
  pbmc_countsplit,
  group.by = "countsplit_clusters",
  label = TRUE
) +
  ggplot2::ggtitle(paste0("Count Split (", n_countsplit, " clusters)"))

p1 + p2
```

## Resolution Tuning

### Iterative Resolution Reduction

The `reduction_percentage` parameter controls how quickly resolution decreases:

```{r resolution-tuning}
# Conservative: Slower reduction, may result in more clusters
pbmc_conservative <- FindClustersRecall(
  pbmc,
  resolution_start = 1.2,
  reduction_percentage = 0.1 # 10% reduction per iteration
)

# Aggressive: Faster reduction, may result in fewer clusters
pbmc_aggressive <- FindClustersRecall(
  pbmc,
  resolution_start = 1.2,
  reduction_percentage = 0.3 # 30% reduction per iteration
)
```

### Finding Optimal Starting Resolution

```{r optimal-resolution}
# Sweep different starting resolutions
resolutions <- c(0.4, 0.6, 0.8, 1.0, 1.2, 1.5)
results <- list()

for (res in resolutions) {
  pbmc_temp <- FindClustersRecall(pbmc, resolution_start = res)
  results[[as.character(res)]] <- list(
    n_clusters = length(unique(Idents(pbmc_temp))),
    clusters = Idents(pbmc_temp)
  )
}

# Summary: named integer vector of cluster counts, one per starting resolution
vapply(results, function(x) x$n_clusters, integer(1))
```

## Clustering Algorithm Selection

### Louvain vs Leiden

```{r algorithm-selection}
# Louvain (default, faster)
pbmc_louvain <- FindClustersRecall(
  pbmc,
  algorithm = "louvain",
  resolution_start = 0.8
)

# Leiden (often finds better partitions)
pbmc_leiden <- FindClustersRecall(
  pbmc,
  algorithm = "leiden",
  resolution_start = 0.8
)
```

## Integration with Seurat Workflow

### Complete Analysis Pipeline

```{r complete-pipeline}
library(Seurat)
library(recall)

# 1. Load and preprocess
seurat_obj <- CreateSeuratObject(counts = raw_counts)
seurat_obj <- NormalizeData(seurat_obj)
seurat_obj <- FindVariableFeatures(seurat_obj, nfeatures = 2000)
seurat_obj <- ScaleData(seurat_obj)
seurat_obj <- RunPCA(seurat_obj)
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:20)
seurat_obj <- RunUMAP(seurat_obj, dims = 1:20)

# 2. Calibrated clustering with recall
seurat_obj <- FindClustersRecall(
  seurat_obj,
  resolution_start = 0.8,
  null_method = "ZIP",
  algorithm = "leiden",
  cores = 4
)

# 3. Downstream analysis
markers <- FindAllMarkers(seurat_obj, only.pos = TRUE)

# 4. Visualization
DimPlot(seurat_obj, group.by = "recall_clusters", label = TRUE)
```

### Using seurat_workflow Helper

```{r seurat-workflow-helper}
# One-line preprocessing + clustering
seurat_obj <- seurat_workflow(
  raw_seurat_obj,
  num_variable_features = 2000,
  resolution_param = 0.8,
  visualization_method = "both", # UMAP and t-SNE
  num_dims = 15,
  algorithm = "louvain"
)
```

## Batch Processing Multiple Samples

```{r batch-processing}
# Process multiple samples
sample_list <- list(sample1 = seurat1, sample2 = seurat2, sample3 = seurat3)

results <- lapply(sample_list, function(obj) {
  # Preprocess
  obj <- NormalizeData(obj)
  obj <- FindVariableFeatures(obj)
  obj <- ScaleData(obj)
  obj <- RunPCA(obj)
  obj <- FindNeighbors(obj, dims = 1:10)

  # Calibrated clustering; the clustered object is the function's value
  FindClustersRecall(obj, resolution_start = 0.8, cores = 2)
})
```

## Troubleshooting

### Common Issues

**1. Memory errors with copula methods:**

```{r memory-fix}
# Reduce features or use simpler method
seurat_obj <- FindVariableFeatures(seurat_obj, nfeatures = 1000) # Reduce features
pbmc <- FindClustersRecall(pbmc, null_method = "ZIP") # Use faster method
```

**2. Single cluster warning:**

```{r single-cluster-fix}
# Increase starting resolution
pbmc <- FindClustersRecall(pbmc, resolution_start = 1.5)
```

**3. Slow performance:**

```{r performance-fix}
# Use parallel processing and simpler null method
# (clamp to at least one worker: detectCores() - 1 can be 0 or NA)
pbmc <- FindClustersRecall(
  pbmc,
  null_method = "ZIP",
  cores = max(1, parallel::detectCores() - 1, na.rm = TRUE)
)
```

## Session Information

```{r session-info}
sessionInfo()
```