---
title: "Advanced Usage"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
vignette: >
  %\VignetteIndexEntry{Advanced Usage}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE,
  eval = FALSE
)
```

## Introduction

This vignette covers advanced features of MOFSR including parallel computing, custom pipelines, and integration with other packages.

```{r load}
library(MOFSR)

set.seed(42)

# Generate test data
n_samples <- 60
true_clusters <- rep(1:3, each = 20)

# Simulate one omics layer with cluster structure.
#
# n        Number of samples; must equal length(clusters).
# p        Number of features.
# clusters Cluster label of each sample.
#
# Returns a p x n numeric matrix (features in rows, samples in columns) with
# row names F1..Fp and column names S1..Sn.
generate_omics <- function(n, p, clusters) {
  stopifnot(length(clusters) == n)

  # Translate labels into row indices of `centers`, so labels need not be 1..K
  cluster_ids <- sort(unique(clusters))
  n_clusters <- length(cluster_ids)
  center_row <- match(clusters, cluster_ids)

  centers <- matrix(rnorm(n_clusters * p, sd = 2), n_clusters, p)

  # One column of noise per sample; column-major filling keeps the draws in
  # sample-by-sample order and keeps the result a matrix even when p == 1
  noise <- matrix(rnorm(n * p, sd = 1), nrow = p, ncol = n)
  data <- t(centers[center_row, , drop = FALSE]) + noise

  dimnames(data) <- list(
    paste0("F", seq_len(p)),
    paste0("S", seq_len(n))
  )
  data
}

data_list <- list(
  mRNA = generate_omics(n_samples, 500, true_clusters),
  miRNA = generate_omics(n_samples, 200, true_clusters),
  protein = generate_omics(n_samples, 100, true_clusters)
)
```

---

## Parallel Computing

MOFSR supports parallel computing via the `future` framework.

### Setup Parallel Backend

```{r parallel-setup, eval=FALSE}
# Setup parallel processing (4 workers)
setup_parallel(workers = 4)

# Check status
cat("Parallel workers:", future::nbrOfWorkers(), "\n")
```

### Parallel Consensus Clustering

```{r parallel-cc, eval=FALSE}
# Run consensus clustering in parallel
cc_result <- parallel_consensus_cluster(
  data_list$mRNA,
  max_k = 6,
  n_reps = 100,
  n_cores = 4
)
```

### Parallel Feature Selection

```{r parallel-fs, eval=FALSE}
# Bootstrap feature selection with parallel processing
features <- parallel_bootstrap_features(
  data_list$mRNA,
  labels = true_clusters,
  n_bootstrap = 100,
  n_cores = 4
)
```

### Cleanup

```{r parallel-stop, eval=FALSE}
# Always stop parallel workers when done
stop_parallel()
```

---

## Data Preprocessing Pipeline

### Complete Preprocessing Workflow

```{r preprocessing}
# Step 1: Check sample alignment
alignment <- check_sample_alignment(data_list)
cat("All samples aligned:", alignment$aligned, "\n")

# Step 2: Quality control summary
qc <- qc_summary(data_list)
print(qc)

# Step 3: Normalize data
data_norm <- normalize_omics(data_list, method = "zscore")

# Step 4: Filter low-variance features
# NOTE(review): if "zscore" scales every feature to unit variance, a variance
# filter applied afterwards removes nothing -- confirm the intended order
data_filtered <- filter_low_variance(data_norm, min_var = 0.01)
cat("\nFeatures after filtering:\n")
vapply(data_filtered, nrow, integer(1))
```

### Handling Missing Values

```{r missing-values}
# Simulate missing values
data_with_na <- data_list
data_with_na$mRNA[sample(length(data_with_na$mRNA), 50)] <- NA

# Check missing values
cat("Missing values in mRNA:", sum(is.na(data_with_na$mRNA)), "\n")

# Impute using KNN
data_imputed <- handle_missing(data_with_na, method = "knn", k = 5)
cat("After imputation:", sum(is.na(data_imputed$mRNA)), "\n")
```

### Batch Effect Correction

```{r batch-correction}
# Simulate batch labels
# Alternate the two batches so each contains samples from every simulated
# cluster; two contiguous blocks of 30 would put cluster 1 entirely in Batch1
# and cluster 3 entirely in Batch2, confounding batch with subtype
batch <- rep(c("Batch1", "Batch2"), length.out = n_samples)

# Correct batch effects
data_corrected <- correct_batch(data_list, batch = batch, method = "mean_center")
```

---

## Custom Analysis Pipelines

### Multi-Algorithm Comparison

```{r multi-algo}
# Define algorithms to compare
algorithms <- c("SNF", "RGCCA", "CPCA", "MOFA")

# Run all algorithms; a failing algorithm is reported and recorded as NULL
results <- lapply(algorithms, function(alg) {
  tryCatch(
    run_integration(data_list, algorithm = alg, n_clusters = 3),
    error = function(e) {
      message(sprintf("Algorithm %s failed: %s", alg, conditionMessage(e)))
      NULL
    }
  )
})
names(results) <- algorithms

# Remove failed algorithms
results <- Filter(Negate(is.null), results)

# Compare results
if (length(results) >= 2) {
  ari_matrix <- compare_clusterings(results)
  print(round(ari_matrix, 3))
}
```

### Optimal K Selection

```{r optimal-k}
# Run consensus clustering
cc_result <- consensus_cluster(data_list$mRNA, maxK = 6, reps = 50, seed = 42)

# Calculate PAC for each K
pac_values <- calc_pac(cc_result)

# Find optimal K (minimum PAC)
optimal_k <- pac_values$K[which.min(pac_values$PAC)]
cat("Optimal K by PAC:", optimal_k, "\n")
```

### Ensemble Clustering

```{r ensemble}
# Run multiple algorithms
alg_results <- list(
  SNF = run_snf(data_list, n_clusters = 3),
  RGCCA = run_rgcca(data_list, n_clusters = 3),
  CPCA = run_cpca(data_list, n_clusters = 3)
)

# Cluster IDs are arbitrary within each algorithm (cluster 1 of one method may
# be cluster 3 of another), so raw IDs cannot be voted on directly. Relabel
# each solution with the reference label it overlaps most. The mapping is
# greedy, so two clusters may map onto the same reference label.
align_labels <- function(labels, reference) {
  overlap <- table(labels, reference)
  ref_ids <- sort(unique(reference))
  relabelled <- ref_ids[apply(overlap, 1, which.max)]
  relabelled[match(labels, sort(unique(labels)))]
}

# Create ensemble by majority voting on the aligned labels
reference <- alg_results[[1]]$Cluster
all_clusters <- sapply(alg_results, function(x) {
  align_labels(x$Cluster, reference)
})

# Majority vote (mode); ties go to the label seen first
get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

ensemble_clusters <- apply(all_clusters, 1, get_mode)

cat("Ensemble clustering results:\n")
table(ensemble_clusters)
```

---

## Feature Selection

### Bootstrap-Based Selection

```{r feature-selection}
# Feature selection with bootstrap
selected <- FeatureSelectionWithBootstrap(
  data_list$mRNA,
  labels = true_clusters,
  n_bootstrap = 50,
  top_n = 100
)

cat("Selected features:", length(selected$features), "\n")
cat("Top 10 features:", head(selected$features, 10), "\n")
```

### MAD-Based Filtering

```{r mad-filter}
# Filter by Median Absolute Deviation
data_mad <- filter_by_mad(data_list, top_n = 500)

cat("Features after MAD filtering:\n")
vapply(data_mad, nrow, integer(1))
```

---

## Classification

### Train Subtype Classifier

```{r classifier}
# Split data into training and test sets
train_idx <- sample(n_samples, 40)
test_idx <- setdiff(seq_len(n_samples), train_idx)

train_data <- data_list$mRNA[, train_idx]
test_data <- data_list$mRNA[, test_idx]
train_labels <- true_clusters[train_idx]
test_labels <- true_clusters[test_idx]
```

```{r classifier-run, eval=FALSE}
# Train classifier (requires optional dependencies)
init(classifier = TRUE)

model <- RunClassifier(
  train_data = t(train_data),
  train_labels = train_labels,
  test_data = t(test_data),
  method = "RF"
)

# Evaluate
accuracy <- mean(model$predictions == test_labels)
cat("Test accuracy:", round(accuracy, 3), "\n")
```

### Ensemble Classifier

```{r ensemble-classifier, eval=FALSE}
# Run ensemble of classifiers
ensemble_result <- RunEnsemble(
  train_data = t(train_data),
  train_labels = train_labels,
  test_data = t(test_data),
  methods = c("RF", "SVM", "XGBoost")
)
```

---

## Low-Level API Access

### Direct Algorithm Functions

```{r low-level}
# Direct access to SNF components
W1 <- snf_affinity_matrix(data_list$mRNA, K = 15)
W2 <- snf_affinity_matrix(data_list$miRNA, K = 15)
W3 <- snf_affinity_matrix(data_list$protein, K = 15)

# Custom fusion
W_fused <- snf_fuse(list(W1, W2, W3), K = 15, t = 20)

# Spectral clustering on fused network
clusters <- spectral_clustering(W_fused, n_clusters = 3)
```

### Factor Analysis Components

```{r factor-analysis}
# Run factor analysis
fa_result <- multi_view_factor_analysis(data_list, n_factors = 5, max_iter = 100)

# Access components
cat("Factor dimensions:", dim(fa_result$Z), "\n")
cat("Variance explained:", round(fa_result$variance_explained, 3), "\n")

# Use factors for clustering
# Several random starts guard against k-means settling in a poor local optimum
factor_clusters <- kmeans(t(fa_result$Z), centers = 3, nstart = 25)$cluster
```

---

## Integration with Bioconductor

### Gene Set Analysis

```{r gsva, eval=FALSE}
# Initialize GSVA dependencies
init(gsva = TRUE)

# Run GSVA
# NOTE(review): `gene_sets` is not defined anywhere in this vignette -- confirm
# that MOFSR exports it, or define it before this call
gsva_result <- RunGSVA(
  expr = data_list$mRNA,
  gene_sets = gene_sets,  # Built-in gene sets
  method = "gsva"
)
```

### Survival Analysis

```{r survival, eval=FALSE}
# Initialize survival dependencies
init(survival = TRUE)

# Simulate survival data
time <- rexp(n_samples, rate = 0.1)
event <- sample(0:1, n_samples, replace = TRUE)

# Plot survival curves by cluster
plot_survival(time, event, results$SNF,
              title = "Overall Survival",
              risk_table = TRUE)
```

---

## Performance Tips

### Memory Management

```{r memory-tips, eval=FALSE}
# For large datasets, process in chunks
chunk_size <- 1000

# Extract rows start..end of `data` (clamped to the last row) and process them
process_chunk <- function(data, start, end) {
  # drop = FALSE keeps a one-row chunk as a matrix
  chunk <- data[start:min(end, nrow(data)), , drop = FALSE]
  # Process chunk... (placeholder: returns the chunk unchanged)
  result <- chunk
  result
}

# Walk over the features in blocks of `chunk_size` rows
chunk_starts <- seq(1, nrow(data_list$mRNA), by = chunk_size)
chunk_results <- lapply(chunk_starts, function(start) {
  process_chunk(data_list$mRNA, start, start + chunk_size - 1)
})

# Use sparse matrices when appropriate
library(Matrix)
sparse_data <- Matrix(data_list$mRNA, sparse = TRUE)
```

### Algorithm Selection by Data Size

| Samples | Features | Recommended |
|:--------|:---------|:------------|
| < 100 | < 1000 | Any algorithm |
| 100-500 | 1000-10000 | SNF, RGCCA, IntNMF |
| > 500 | > 10000 | SNF, CIMLR (with feature selection) |

### Parallelization Strategy

```{r parallel-strategy, eval=FALSE}
# For bootstrap/consensus: parallelize across iterations
# detectCores() may return NA, and a single-core machine would yield 0 workers
n_workers <- max(1L, min(parallel::detectCores() - 1L, 8L), na.rm = TRUE)
setup_parallel(workers = n_workers)

# For multiple algorithms: parallelize across algorithms
# mclapply() relies on forking; on Windows mc.cores must be 1
mc_cores <- if (.Platform$OS.type == "windows") {
  1L
} else {
  min(n_workers, length(algorithms))
}
results <- parallel::mclapply(algorithms, function(alg) {
  run_integration(data_list, algorithm = alg, n_clusters = 3)
}, mc.cores = mc_cores)
names(results) <- algorithms
```

---

## Troubleshooting

### Common Issues

1. **Memory errors**: Reduce feature count with `filter_low_variance()` or `filter_by_mad()`
2. **Slow convergence**: Reduce `max_iter` to cap the runtime (the solution may be less refined) or use faster algorithms (SNF, RGCCA)
3. **Inconsistent results**: Set `seed` parameter for reproducibility
4. **Missing dependencies**: Run `init()` with the matching flag (e.g. `init(survival = TRUE)`) to install optional packages

### Debug Mode

```{r debug, eval=FALSE}
# Enable verbose output
options(MOFSR.verbose = TRUE)

# Trace function calls
trace("run_snf", tracer = quote(cat("Running SNF...\n")))

# Remove the tracer again once debugging is finished
untrace("run_snf")
```

---

## Session Info

```{r session, eval=TRUE}
# eval=TRUE overrides the document-wide eval = FALSE so the output is rendered
sessionInfo()
```