--- title: "Performance Benchmarks" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 fig_caption: true vignette: > %\VignetteIndexEntry{Performance Benchmarks} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE ) ``` ## Introduction This vignette demonstrates darwin's performance characteristics and provides benchmarks for different problem sizes and configurations. Understanding these benchmarks helps users make informed decisions about parameter settings for their specific use cases. ```{r load} library(darwin) library(ggplot2) set.seed(42) ``` ## Performance Architecture darwin achieves high performance through several design choices: 1. **Vectorized R operations** for matrix computations 2. **C++ implementations** (via RcppArmadillo) for critical operations 3. **Parallel processing** support via the `future` package 4. **Efficient memory management** using sparse representations where appropriate ### C++ vs R Performance ```{r cpp-benchmark, fig.cap="Performance comparison between R and C++ implementations."} # Create test data n_ct <- 10 n_genes <- 100 test_data <- matrix(runif(n_ct * n_genes), nrow = n_ct) # Compare implementations n_iter <- 50 times_r <- numeric(n_iter) times_cpp <- numeric(n_iter) for (i in 1:n_iter) { t1 <- system.time({ compute_correlation(test_data, use_cpp = FALSE) }) times_r[i] <- t1["elapsed"] t2 <- system.time({ compute_correlation(test_data, use_cpp = TRUE) }) times_cpp[i] <- t2["elapsed"] } df_bench <- data.frame( Time = c(times_r, times_cpp) * 1000, Implementation = rep(c("R (vectorized)", "C++ (RcppArmadillo)"), each = n_iter) ) ggplot(df_bench, aes(x = Implementation, y = Time, fill = Implementation)) + geom_boxplot(alpha = 0.7) + scale_fill_manual(values = c("#e74c3c", "#3498db")) + labs( title = "R vs C++ Implementation Performance", subtitle = paste("Computing correlation for", n_ct, "× ", n_genes, "matrix"), x = "", y = "Time (milliseconds)" ) + theme_minimal(base_size = 12) + theme(legend.position = "none") ``` ## Scaling with Problem Size ### Number of Genes ```{r scale-genes, fig.cap="Performance scaling with number of genes."} gene_sizes <- c(100, 200, 500, 1000, 2000) n_ct <- 8 results <- list() for (n_genes in gene_sizes) { test_data <- matrix(runif(n_ct * n_genes), nrow = n_ct) # Time correlation computation t <- system.time({ for (j in 1:20) { compute_correlation(test_data, use_cpp = TRUE) } }) results[[length(results) + 1]] <- data.frame( n_genes = n_genes, time = t["elapsed"] / 20 * 1000 ) } df_scale_genes <- do.call(rbind, results) ggplot(df_scale_genes, aes(x = n_genes, y = time)) + geom_line(color = "#3498db", linewidth = 1) + geom_point(color = "#3498db", size = 3) + scale_x_log10() + scale_y_log10() + labs( title = "Performance Scaling with Gene Count", subtitle = paste(n_ct, "cell types"), x = "Number of Genes (log scale)", y = "Time per iteration (ms, log scale)" ) + theme_minimal(base_size = 12) ``` ### Number of Cell Types ```{r scale-celltypes, fig.cap="Performance scaling with number of cell types."} ct_sizes <- c(3, 5, 10, 20, 30) n_genes <- 500 results_ct <- list() for (n_ct in ct_sizes) { test_data <- matrix(runif(n_ct * n_genes), nrow = n_ct) t <- system.time({ for (j in 1:20) { compute_correlation(test_data, use_cpp = TRUE) } }) results_ct[[length(results_ct) + 1]] <- data.frame( n_ct = n_ct, time = t["elapsed"] / 20 * 1000 ) } df_scale_ct <- do.call(rbind, results_ct) ggplot(df_scale_ct, aes(x = n_ct, y = time)) + geom_line(color = "#e74c3c", linewidth = 1) + geom_point(color = "#e74c3c", size = 3) + labs( title = "Performance Scaling with Cell Type Count", subtitle = paste(n_genes, "genes"), x = "Number of Cell Types", y = "Time per iteration (ms)" ) + theme_minimal(base_size = 12) ``` ## Optimization Benchmarks ### Complete Workflow Timing ```{r workflow-benchmark, fig.cap="Time breakdown for a complete optimization workflow."} # Setup n_ct <- 8 n_genes <- 1000 reference <- matrix(abs(rnorm(n_ct * n_genes, 2)), nrow = n_ct) rownames(reference) <- paste0("CT", 1:n_ct) colnames(reference) <- paste0("Gene", 1:n_genes) # Benchmark components timings <- list() # Initialization t_init <- system.time({ dw <- darwin(reference) }) timings$Initialization <- t_init["elapsed"] # Optimization (short run for benchmark) t_opt <- system.time({ dw$optimize(ngen = 20, pop_size = 50, verbose = FALSE, parallel = FALSE) }) timings$Optimization <- t_opt["elapsed"] # Selection t_sel <- system.time({ dw$select(weights = c(-1, 1)) }) timings$Selection <- t_sel["elapsed"] # Create timing data frame df_timing <- data.frame( Step = names(timings), Time = unlist(timings) ) df_timing$Percent <- df_timing$Time / sum(df_timing$Time) * 100 ggplot(df_timing, aes(x = reorder(Step, Time), y = Time, fill = Step)) + geom_bar(stat = "identity", alpha = 0.8) + geom_text(aes(label = paste0(round(Percent, 1), "%")), hjust = -0.1) + coord_flip() + scale_fill_brewer(palette = "Set2") + labs( title = "Workflow Time Breakdown", subtitle = paste(n_genes, "genes,", n_ct, "cell types, 20 generations"), x = "", y = "Time (seconds)" ) + theme_minimal(base_size = 12) + theme(legend.position = "none") + expand_limits(y = max(df_timing$Time) * 1.2) ``` ### Scaling with Generations ```{r scale-generations, fig.cap="Linear scaling with number of generations."} gen_sizes <- c(10, 25, 50, 100) results_gen <- list() for (ngen in gen_sizes) { t <- system.time({ dw <- darwin(reference) dw$optimize(ngen = ngen, pop_size = 50, verbose = FALSE, parallel = FALSE) }) results_gen[[length(results_gen) + 1]] <- data.frame( generations = ngen, time = t["elapsed"] ) } df_scale_gen <- do.call(rbind, results_gen) # Fit linear model lm_fit <- lm(time ~ generations, data = df_scale_gen) ggplot(df_scale_gen, aes(x = generations, y = time)) + geom_line(color = "#3498db", linewidth = 1) + geom_point(color = "#3498db", size = 3) + geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "gray50") + labs( title = "Scaling with Number of Generations", subtitle = paste("~", round(coef(lm_fit)[2], 3), "seconds per generation"), x = "Number of Generations", y = "Total Time (seconds)" ) + theme_minimal(base_size = 12) ``` ### Scaling with Population Size ```{r scale-pop, fig.cap="Performance scaling with population size."} pop_sizes <- c(30, 50, 80, 100, 150) results_pop <- list() for (pop in pop_sizes) { t <- system.time({ dw <- darwin(reference) dw$optimize(ngen = 30, pop_size = pop, verbose = FALSE, parallel = FALSE) }) results_pop[[length(results_pop) + 1]] <- data.frame( pop_size = pop, time = t["elapsed"] ) } df_scale_pop <- do.call(rbind, results_pop) ggplot(df_scale_pop, aes(x = pop_size, y = time)) + geom_line(color = "#e74c3c", linewidth = 1) + geom_point(color = "#e74c3c", size = 3) + labs( title = "Scaling with Population Size", subtitle = "30 generations", x = "Population Size", y = "Total Time (seconds)" ) + theme_minimal(base_size = 12) ``` ## Objective Function Comparison ```{r obj-comparison, fig.cap="Comparison of objective function computation times."} test_data <- matrix(runif(10 * 500), nrow = 10) n_iter <- 100 obj_times <- data.frame( Objective = character(), Time = numeric(), stringsAsFactors = FALSE ) # Correlation t <- system.time({ for (i in 1:n_iter) compute_correlation(test_data, use_cpp = TRUE) }) obj_times <- rbind(obj_times, data.frame(Objective = "Correlation", Time = t["elapsed"] / n_iter * 1000)) # Distance t <- system.time({ for (i in 1:n_iter) compute_distance(test_data, use_cpp = TRUE) }) obj_times <- rbind(obj_times, data.frame(Objective = "Distance", Time = t["elapsed"] / n_iter * 1000)) # Condition t <- system.time({ for (i in 1:n_iter) compute_condition(test_data, use_cpp = TRUE) }) obj_times <- rbind(obj_times, data.frame(Objective = "Condition", Time = t["elapsed"] / n_iter * 1000)) ggplot(obj_times, aes(x = reorder(Objective, Time), y = Time, fill = Objective)) + geom_bar(stat = "identity", alpha = 0.8) + coord_flip() + scale_fill_brewer(palette = "Set2") + labs( title = "Objective Function Computation Time", subtitle = "10 cell types × 500 genes, C++ implementation", x = "", y = "Time per call (ms)" ) + theme_minimal(base_size = 12) + theme(legend.position = "none") ``` ## Memory Usage darwin is designed to be memory-efficient: ```{r memory, eval=FALSE} # Memory usage estimation (not run in vignette) library(pryr) # Reference data n_ct <- 10 n_genes <- 5000 reference <- matrix(rnorm(n_ct * n_genes), nrow = n_ct) # Memory for reference cat("Reference matrix:", object_size(reference) / 1e6, "MB\n") # Memory for darwin object dw <- darwin(reference) cat("darwin object (initialized):", object_size(dw) / 1e6, "MB\n") # After optimization dw$optimize(ngen = 50, verbose = FALSE) cat("darwin object (optimized):", object_size(dw) / 1e6, "MB\n") ``` ## Recommendations Based on these benchmarks: | Problem Size | Recommended Settings | |-------------|---------------------| | Small (<500 genes) | `ngen = 100`, `pop_size = 100` | | Medium (500-2000 genes) | `ngen = 80`, `pop_size = 80` | | Large (>2000 genes) | `ngen = 50`, `pop_size = 50`, enable parallel | ### Performance Tips 1. **Use C++ implementations**: Enabled by default (`use_cpp = TRUE`) 2. **Enable parallelization**: For large problems, set `parallel = TRUE` 3. **Pre-filter genes**: Use `use_highly_variable = TRUE` with Seurat objects 4. **Reduce population gradually**: Start with smaller populations for exploration ## Session Info ```{r session} sessionInfo() ```