--- title: "Benchmark Analysis" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > %\VignetteIndexEntry{Benchmark Analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE, eval = FALSE ) ``` ## Introduction This vignette demonstrates benchmarking of MOFSR algorithms on simulated multi-omics data with known ground truth. ## Simulation Framework We simulate multi-omics data with controlled cluster structure to evaluate: - **Clustering accuracy** (Adjusted Rand Index) - **Computational efficiency** (Runtime) - **Robustness** (Stability across runs) ```{r simulate-data} library(MOFSR) set.seed(42) # Simulation parameters n_samples <- 100 n_clusters <- 3 cluster_sizes <- rep(n_samples / n_clusters, n_clusters) # True cluster assignments true_clusters <- rep(1:n_clusters, times = cluster_sizes) # Generate multi-omics data with cluster structure generate_multiomics <- function(n, n_features, clusters, noise_level = 1) { K <- length(unique(clusters)) lapply(n_features, function(p) { # Cluster centers centers <- matrix(rnorm(K * p, sd = 2), K, p) # Generate samples data <- t(sapply(clusters, function(k) { centers[k, ] + rnorm(p, sd = noise_level) })) colnames(data) <- paste0("F", seq_len(p)) rownames(data) <- paste0("S", seq_len(n)) t(data) # Return features x samples }) } # Create datasets with different characteristics data_list <- generate_multiomics( n = n_samples, n_features = c(500, 300, 200), # mRNA, miRNA, methylation clusters = true_clusters, noise_level = 0.8 ) names(data_list) <- c("mRNA", "miRNA", "methylation") cat("Data dimensions:\n") sapply(data_list, dim) ``` --- ## Algorithm Benchmark ### Define Algorithms ```{r define-algos} # Algorithms to benchmark algorithms <- c( "SNF", # Similarity Network Fusion "RGCCA", # Regularized Generalized CCA "CPCA", # Consensus PCA "CIMLR", # Cancer Integration via Multi-kernel Learning "IntNMF", # Integrative NMF "MOFA", # Multi-Omics Factor Analysis "MCIA", # Multiple Co-Inertia Analysis "BCC", # Bayesian Consensus Clustering "PINSPlus", # Perturbation Clustering "NEMO" # Neighborhood-based Multi-omics ) ``` ### Run Benchmark ```{r run-benchmark} # Function to benchmark single algorithm benchmark_algorithm <- function(alg, data, n_clusters, true_labels) { start_time <- Sys.time() result <- tryCatch({ run_integration(data, algorithm = alg, n_clusters = n_clusters) }, error = function(e) { return(NULL) }) end_time <- Sys.time() runtime <- as.numeric(difftime(end_time, start_time, units = "secs")) if (is.null(result)) { return(data.frame( Algorithm = alg, ARI = NA, Runtime = runtime, Status = "Failed" )) } # Calculate ARI ari <- .adjusted_rand_index(result$Cluster, true_labels) data.frame( Algorithm = alg, ARI = round(ari, 4), Runtime = round(runtime, 2), Status = "Success" ) } # Run benchmark cat("Running benchmark...\n\n") benchmark_results <- do.call(rbind, lapply(algorithms, function(alg) { cat(sprintf("Testing %s... ", alg)) res <- benchmark_algorithm(alg, data_list, n_clusters, true_clusters) cat(sprintf("ARI = %.3f, Time = %.2fs\n", res$ARI, res$Runtime)) res })) # Display results print(benchmark_results) ``` ### Summary Statistics ```{r summary-stats} # Filter successful runs successful <- benchmark_results[benchmark_results$Status == "Success", ] # Summary cat("\n=== Benchmark Summary ===\n") cat(sprintf("Algorithms tested: %d\n", nrow(benchmark_results))) cat(sprintf("Successful: %d\n", nrow(successful))) cat(sprintf("Mean ARI: %.4f\n", mean(successful$ARI, na.rm = TRUE))) cat(sprintf("Best ARI: %.4f (%s)\n", max(successful$ARI, na.rm = TRUE), successful$Algorithm[which.max(successful$ARI)])) cat(sprintf("Fastest: %.2fs (%s)\n", min(successful$Runtime, na.rm = TRUE), successful$Algorithm[which.min(successful$Runtime)])) ``` --- ## Visualization ### ARI Comparison ```{r ari-plot, fig.cap="Clustering accuracy (ARI) across algorithms"} if (requireNamespace("ggplot2", quietly = TRUE)) { library(ggplot2) successful$Algorithm <- factor(successful$Algorithm, levels = successful$Algorithm[order(successful$ARI)]) ggplot(successful, aes(x = Algorithm, y = ARI, fill = ARI)) + geom_col(width = 0.7) + geom_text(aes(label = sprintf("%.3f", ARI)), hjust = -0.1, size = 3) + scale_fill_gradient(low = "#FEE0D2", high = "#CB181D") + coord_flip() + ylim(0, 1.1) + labs(title = "Clustering Accuracy (ARI)", subtitle = sprintf("n=%d samples, K=%d clusters", n_samples, n_clusters), x = "", y = "Adjusted Rand Index") + theme_minimal() + theme(legend.position = "none", plot.title = element_text(face = "bold")) } ``` ### Runtime Comparison ```{r runtime-plot, fig.cap="Computational runtime across algorithms"} if (requireNamespace("ggplot2", quietly = TRUE)) { successful$Algorithm <- factor(successful$Algorithm, levels = successful$Algorithm[order(-successful$Runtime)]) ggplot(successful, aes(x = Algorithm, y = Runtime, fill = Runtime)) + geom_col(width = 0.7) + geom_text(aes(label = sprintf("%.1fs", Runtime)), hjust = -0.1, size = 3) + scale_fill_gradient(low = "#DEEBF7", high = "#2171B5") + coord_flip() + labs(title = "Computational Runtime", x = "", y = "Time (seconds)") + theme_minimal() + theme(legend.position = "none", plot.title = element_text(face = "bold")) } ``` ### Trade-off Analysis ```{r tradeoff-plot, fig.cap="Accuracy vs. runtime trade-off"} if (requireNamespace("ggplot2", quietly = TRUE)) { ggplot(successful, aes(x = Runtime, y = ARI, label = Algorithm)) + geom_point(size = 4, color = "#E41A1C") + geom_text(vjust = -1, size = 3) + labs(title = "Accuracy vs. Runtime Trade-off", x = "Runtime (seconds)", y = "Adjusted Rand Index") + theme_minimal() + theme(plot.title = element_text(face = "bold")) } ``` --- ## Noise Sensitivity Analysis Test algorithm robustness under different noise levels. ```{r noise-analysis} noise_levels <- c(0.5, 1.0, 1.5, 2.0) test_algorithms <- c("SNF", "RGCCA", "CIMLR") noise_results <- do.call(rbind, lapply(noise_levels, function(noise) { # Generate data with specific noise level noisy_data <- generate_multiomics( n = n_samples, n_features = c(500, 300, 200), clusters = true_clusters, noise_level = noise ) names(noisy_data) <- names(data_list) do.call(rbind, lapply(test_algorithms, function(alg) { res <- benchmark_algorithm(alg, noisy_data, n_clusters, true_clusters) res$Noise <- noise res })) })) print(noise_results[, c("Algorithm", "Noise", "ARI")]) ``` ```{r noise-plot, fig.cap="Algorithm robustness to noise"} if (requireNamespace("ggplot2", quietly = TRUE)) { noise_df <- noise_results[!is.na(noise_results$ARI), ] ggplot(noise_df, aes(x = Noise, y = ARI, color = Algorithm, group = Algorithm)) + geom_line(linewidth = 1) + geom_point(size = 3) + scale_color_brewer(palette = "Set1") + labs(title = "Noise Sensitivity Analysis", x = "Noise Level (SD)", y = "Adjusted Rand Index") + theme_minimal() + theme(plot.title = element_text(face = "bold"), legend.position = "bottom") } ``` --- ## Sample Size Scaling Evaluate performance with increasing sample sizes. ```{r scaling-analysis} sample_sizes <- c(50, 100, 150) test_alg <- "SNF" scaling_results <- do.call(rbind, lapply(sample_sizes, function(n) { # Adjust cluster assignments clusters_n <- rep(1:3, each = n/3) # Generate data scaled_data <- generate_multiomics( n = n, n_features = c(500, 300, 200), clusters = clusters_n, noise_level = 0.8 ) names(scaled_data) <- names(data_list) res <- benchmark_algorithm(test_alg, scaled_data, 3, clusters_n) res$SampleSize <- n res })) print(scaling_results[, c("Algorithm", "SampleSize", "ARI", "Runtime")]) ``` --- ## Recommendations Based on benchmark results: | Scenario | Recommended Algorithm | Rationale | |:---------|:---------------------|:----------| | **Best accuracy** | Top performer from ARI | Highest clustering agreement with truth | | **Fast analysis** | SNF, RGCCA | Low computational cost | | **Noisy data** | CIMLR, PINSPlus | Robust to noise | | **Large samples** | SNF | Scales well with sample size | | **Feature selection** | CIMLR, SGCCA | Built-in feature ranking | ### Algorithm Selection Flowchart ``` Start │ ┌─────────────┴─────────────┐ │ Need feature selection? │ └─────────────┬─────────────┘ ┌───────┴───────┐ Yes No │ │ ┌─────┴─────┐ ┌─────┴─────┐ │ CIMLR │ │ n > 500? │ │ SGCCA │ └─────┬─────┘ └───────────┘ ┌─────┴─────┐ Yes No │ │ ┌─────┴───┐ ┌────┴────┐ │ SNF │ │ MOFA │ │ IntNMF │ │ RGCCA │ └─────────┘ │ MCIA │ └─────────┘ ``` --- ## Session Info ```{r session} sessionInfo() ```