---
title: "Advanced Usage and Best Practices"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
vignette: >
  %\VignetteIndexEntry{Advanced Usage and Best Practices}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
```

## Introduction

This vignette covers advanced features and best practices for optimal TorchDecon performance in real-world deconvolution tasks.

**Author**: Zaoqu Liu (liuzaoqu@163.com)

## GPU Acceleration

### Checking GPU Availability

```{r gpu-check}
library(TorchDecon)
library(torch)

# Check if CUDA is available
if (torch::cuda_is_available()) {
  cat("CUDA GPU detected!\n")
  cat("Device count:", torch::cuda_device_count(), "\n")
  cat("Current device:", torch::cuda_current_device(), "\n")
} else {
  cat("No CUDA GPU available. Using CPU.\n")
}
```

### Specifying Device

```{r device-selection}
# Automatically detect best device
ensemble <- CreateTorchDeconEnsemble(
  n_features = 5000,
  n_classes = 10,
  device = "auto"  # Auto-select GPU if available
)

# Force CPU usage (useful for debugging)
ensemble_cpu <- CreateTorchDeconEnsemble(
  n_features = 5000,
  n_classes = 10,
  device = "cpu"
)

# Force GPU usage (will error if no GPU)
ensemble_gpu <- CreateTorchDeconEnsemble(
  n_features = 5000,
  n_classes = 10,
  device = "cuda"
)
```

## Custom Model Architectures

### Creating Custom Networks

```{r custom-architecture}
# Define a custom architecture
custom_model <- CreateTorchDecon(
  n_features = 5000,
  n_classes = 10,
  architecture = "custom",
  hidden_units = c(512, 256, 128, 64),      # Custom layer sizes
  dropout_rates = c(0.1, 0.2, 0.15, 0.1),   # Custom dropout
  device = "auto"
)

print(custom_model)
```

### Architecture Selection Guidelines

| Dataset Size | Recommended Architecture | Rationale |
|--------------|-------------------------|-----------|
| < 1000 genes | M256 | Prevents overfitting on small feature sets |
| 1000-5000 genes | M512 | Balanced capacity |
| > 5000 genes | M1024 or Custom | Sufficient capacity for complex patterns |
| Limited samples | Higher dropout | Small training sets overfit easily; regularize more |
| Large samples | Lower dropout | Abundant data already limits overfitting |

## Training Optimization

### Early Stopping

```{r early-stopping}
# Enable early stopping with validation
ensemble <- TrainModel(
  model = ensemble,
  data = processed_data,
  num_steps = 10000,        # Maximum steps
  validation_split = 0.1,   # 10% for validation
  early_stopping = TRUE,    # Enable early stopping
  patience = 500,           # Steps without improvement
  verbose = TRUE
)
```

### Learning Rate Tuning

```{r learning-rate}
# Lower learning rate for stability
ensemble_stable <- TrainModel(
  model = ensemble,
  data = processed_data,
  learning_rate = 5e-5,  # Default is 1e-4
  num_steps = 10000
)

# Higher learning rate for faster convergence (risky)
ensemble_fast <- TrainModel(
  model = ensemble,
  data = processed_data,
  learning_rate = 5e-4,
  num_steps = 3000
)
```

### Batch Size Considerations

```{r batch-size}
# Larger batch sizes: more stable gradients, faster (if GPU memory allows)
ensemble_large_batch <- TrainModel(
  model = ensemble,
  data = processed_data,
  batch_size = 256,  # Default is 128
  num_steps = 5000
)

# Smaller batch sizes: more noise, can help escape local minima
ensemble_small_batch <- TrainModel(
  model = ensemble,
  data = processed_data,
  batch_size = 32,
  num_steps = 5000
)
```

## Data Quality Optimization

### Handling Unknown Cell Types

```{r unknown-celltypes}
# Merge rare or unknown cell types
simulation <- SimulateBulk(
  object = seurat_obj,
  n_samples = 2000,
  celltype_col = "cell_type",
  unknown_celltypes = c("Doublets", "Unknown", "LowQuality"),  # Merge these
  verbose = TRUE
)

# These will be combined into a single "Unknown" category
```

### Optimal Sample Simulation

```{r simulation-optimization}
# High-quality simulation settings
simulation <- SimulateBulk(
  object = seurat_obj,
  n_samples = 5000,         # More samples = better generalization
  cells_per_sample = 200,   # More cells = more realistic bulk
  sparse_fraction = 0.3,    # Include incomplete compositions
  min_celltypes = 2,        # At least 2 cell types per sparse sample
  seed = 42                 # Reproducibility
)
```

### Gene Selection Strategies

```{r gene-selection}
# Stricter variance filtering
processed_strict <- ProcessTrainingData(
  simulation = simulation,
  prediction_data = bulk_data,
  var_cutoff = 0.5,  # Higher threshold = fewer genes
  scaling = "log_min_max"
)

# More lenient (include more genes)
processed_lenient <- ProcessTrainingData(
  simulation = simulation,
  prediction_data = bulk_data,
  var_cutoff = 0.01,  # Lower threshold = more genes
  scaling = "log_min_max"
)

cat("Strict filtering:", processed_strict$n_genes, "genes\n")
cat("Lenient filtering:", processed_lenient$n_genes, "genes\n")
```

## Working with Multiple Datasets

### Merging Simulations

```{r merge-simulations}
# Create simulations from different tissues/conditions
sim_tissue1 <- SimulateBulk(seurat_tissue1, n_samples = 1000, verbose = FALSE)
sim_tissue2 <- SimulateBulk(seurat_tissue2, n_samples = 1000, verbose = FALSE)

# Merge simulations
combined_sim <- MergeSimulations(sim_tissue1, sim_tissue2)
print(combined_sim)
```

### Cross-validation Strategy

```{r cross-validation}
# Implement k-fold cross-validation
k <- 5
n_samples <- nrow(processed_data$X)
stopifnot(n_samples >= k)

# Assign every row to one of k contiguous, near-equal folds. Slicing by a
# fixed ceiling(n_samples / k) can leave the last folds empty or make `a:b`
# count backwards when k does not divide n_samples; a fold-id vector cannot.
# If the rows are ordered in some meaningful way, shuffle first, e.g.
# fold_id <- sample(fold_id) after set.seed().
fold_id <- sort(rep_len(seq_len(k), n_samples))

cv_results <- vector("list", k)

for (i in seq_len(k)) {
  # Define fold indices
  val_idx <- which(fold_id == i)
  train_idx <- which(fold_id != i)

  # Create training subset (drop = FALSE keeps matrices even for 1-row folds)
  train_data <- list(
    X = processed_data$X[train_idx, , drop = FALSE],
    Y = processed_data$Y[train_idx, , drop = FALSE],
    genes = processed_data$genes,
    celltypes = processed_data$celltypes
  )
  class(train_data) <- c("TorchDeconProcessed", "list")

  # Train model on this fold
  model <- CreateTorchDecon(
    n_features = ncol(train_data$X),
    n_classes = ncol(train_data$Y),
    architecture = "m256",
    device = "cpu"
  )

  model <- TrainModel(model, train_data, num_steps = 2000, verbose = FALSE)

  # Evaluate on validation fold
  val_pred <- PredictFractions(
    model,
    t(processed_data$X[val_idx, , drop = FALSE]),
    scaling = NULL,
    verbose = FALSE
  )

  cv_results[[i]] <- list(
    predictions = val_pred,
    true = processed_data$Y[val_idx, , drop = FALSE]
  )
}

# Aggregate CV results (vapply guarantees one numeric value per fold)
cv_performance <- vapply(cv_results, function(r) {
  cor(as.vector(as.matrix(r$predictions)), as.vector(as.matrix(r$true)))
}, numeric(1))

cat("Mean CV correlation:", mean(cv_performance), "\n")
cat("SD:", sd(cv_performance), "\n")
```

## Model Persistence and Deployment

### Saving Models with Metadata

```{r save-with-metadata}
# Save trained model
SaveModel(ensemble, "production_model", overwrite = TRUE)

# The saved directory contains:
# - network.pt (or m256/, m512/, m1024/ for ensemble)
# - metadata.rds
# - genes.txt
# - celltypes.txt
```

### Loading and Deploying

```{r deploy-model}
# Load model for prediction
loaded_model <- LoadModel("production_model", device = "auto")

# Quick prediction pipeline
QuickPredict(
  model_path = "production_model",
  bulk_data = "new_bulk_data.txt",
  output_file = "predictions.txt"
)
```

## Troubleshooting

### Common Issues and Solutions

| Issue | Cause | Solution |
|-------|-------|----------|
| Out of memory | Dataset too large | Reduce batch_size, use CPU |
| Poor accuracy | Insufficient training | Increase num_steps, n_samples |
| Predictions sum ≠ 1 | Numerical issues | Automatic normalization applied |
| Slow training | No GPU | Install CUDA, use GPU |
| Gene mismatch | Different gene sets | Ensure same gene names in reference and bulk |

### Memory Management

```{r memory-management}
# Clear GPU memory after training
if (torch::cuda_is_available()) {
  torch::cuda_empty_cache()
}

# Force garbage collection
gc()
```

## Reproducibility

### Setting Seeds

```{r reproducibility}
# For fully reproducible results
set.seed(42)
torch::torch_manual_seed(42)

# All TorchDecon functions support seed parameter
simulation <- SimulateBulk(seurat_obj, n_samples = 1000, seed = 42)
ensemble <- CreateTorchDeconEnsemble(n_features = 5000, n_classes = 10, seed = 42)
ensemble <- TrainModel(ensemble, processed_data, seed = 42)
```

## Best Practices Summary

1. **Data Quality**: Use high-quality scRNA-seq reference with accurate annotations
2. **Sample Size**: Generate 2000-5000 simulated samples for training
3. **GPU Usage**: Use GPU when available for faster training
4. **Validation**: Use early stopping with validation split
5. **Reproducibility**: Always set seeds for reproducible results
6. **Gene Filtering**: Start with default var_cutoff (0.1), adjust if needed
7. **Model Selection**: Use ensemble (default) for robust predictions
8. **Evaluation**: Always evaluate on held-out data if ground truth available

---

**Package Author**: Zaoqu Liu

**Contact**: liuzaoqu@163.com

**GitHub**: https://github.com/Zaoqu-Liu/TorchDecon