---
title: "Performance Benchmark"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Performance Benchmark}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  message = FALSE,
  warning = FALSE
)
```

## Introduction

This vignette benchmarks the computational performance of SCENT methods, helping users choose the appropriate method for their dataset size.

```{r load}
library(SCENT)
library(ggplot2)
data(net13Jun12.m)
```

```{r helpers}
# Helpers shared by every benchmark chunk below, so that all chunks generate
# data and measure time in exactly the same way.

# Build a gene-by-cell matrix of Poisson(5) counts with `gene_ids` as row
# names. The number of rows follows `gene_ids`, so the row names always fit.
simulate_counts <- function(n_cells, gene_ids) {
  n_rows <- length(gene_ids)
  counts <- matrix(rpois(n_rows * n_cells, 5), nrow = n_rows)
  rownames(counts) <- gene_ids
  counts
}

# Time CompCCAT and the two SR steps (DoIntegPPI, then CompSRana) on one
# expression matrix. Returns a named numeric vector of elapsed (wall-clock)
# seconds with elements "ccat", "integ" and "sr".
run_benchmark <- function(exp_mat, ppi) {
  ccat_time <- system.time(CompCCAT(exp_mat, ppi))[["elapsed"]]
  integ_time <- system.time(integ <- DoIntegPPI(exp_mat, ppi))[["elapsed"]]
  sr_time <- system.time(CompSRana(integ))[["elapsed"]]
  c(ccat = ccat_time, integ = integ_time, sr = sr_time)
}

# Elapsed seconds of the whole SR pipeline (DoIntegPPI + CompSRana).
sr_total <- function(timings) {
  timings[["integ"]] + timings[["sr"]]
}

# Print the timings of one benchmark run.
report_benchmark <- function(n_cells, timings) {
  cat(n_cells, "cells benchmark:\n")
  cat(" CCAT:", round(timings[["ccat"]], 3), "seconds\n")
  cat(" DoIntegPPI:", round(timings[["integ"]], 3), "seconds\n")
  cat(" CompSRana:", round(timings[["sr"]], 3), "seconds\n")
  cat(" Total SR pipeline:", round(sr_total(timings), 3), "seconds\n")
}

# Format durations as seconds below one minute and as minutes otherwise.
format_duration <- function(seconds) {
  ifelse(
    seconds < 60,
    paste0(round(seconds, 1), "s"),
    paste0(round(seconds / 60, 1), "min")
  )
}
```

## Performance Comparison

### Small Dataset (50 cells)

```{r bench-small}
set.seed(42)
n_genes <- 5500

# Gene identifiers are taken from the network itself
gene_ids <- head(rownames(net13Jun12.m), n_genes)

# Create test data
exp_50 <- simulate_counts(50, gene_ids)

# Benchmark
bench_50 <- run_benchmark(exp_50, net13Jun12.m)
report_benchmark(50, bench_50)
```

### Medium Dataset (200 cells)

```{r bench-med}
exp_200 <- simulate_counts(200, gene_ids)

bench_200 <- run_benchmark(exp_200, net13Jun12.m)
report_benchmark(200, bench_200)
```

### Performance Summary

```{r summary-table}
bench_df <- data.frame(
  Cells = c(50, 200),
  CCAT = c(bench_50[["ccat"]], bench_200[["ccat"]]),
  SR_Total = c(sr_total(bench_50), sr_total(bench_200))
)

# How many times longer the SR pipeline takes than CCAT. A CCAT timing of
# zero (run faster than the timer resolution) yields NA instead of Inf/NaN.
bench_df$SR_CCAT_Ratio <- ifelse(
  bench_df$CCAT > 0,
  round(bench_df$SR_Total / bench_df$CCAT, 1),
  NA_real_
)

knitr::kable(
  bench_df,
  col.names = c("Cells", "CCAT (s)", "SR Total (s)", "SR/CCAT Ratio"),
  caption = "Performance Comparison",
  digits = 3
)
```

## Scaling Analysis

```{r scaling, fig.height=5}
# Test different cell numbers
cell_counts <- c(20, 50, 100, 200)

scaling_runs <- lapply(cell_counts, function(n_cells) {
  run_benchmark(simulate_counts(n_cells, gene_ids), net13Jun12.m)
})

ccat_times <- vapply(scaling_runs, function(tm) tm[["ccat"]], numeric(1))
# SR time covers the full pipeline (DoIntegPPI + CompSRana), matching the
# "SR Total" column of the summary table.
sr_times <- vapply(scaling_runs, sr_total, numeric(1))

scaling_df <- data.frame(
  Cells = rep(cell_counts, 2),
  Time = c(ccat_times, sr_times),
  Method = rep(c("CCAT", "SR"), each = length(cell_counts))
)

ggplot(scaling_df, aes(x = Cells, y = Time, color = Method)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1) +
  # Colours are matched to methods by name rather than by position
  scale_color_manual(values = c(CCAT = "#3498db", SR = "#e74c3c")) +
  labs(
    title = "Computational Scaling",
    subtitle = "Time vs Number of Cells",
    x = "Number of Cells",
    y = "Time (seconds)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  )
```

## Extrapolated Performance

Based on the scaling analysis, we can estimate performance for larger datasets:

```{r extrapolate}
# Linear extrapolation for estimation: average seconds per cell over the
# measured sizes, multiplied by the target number of cells
sr_per_cell <- mean(sr_times / cell_counts)
ccat_per_cell <- mean(ccat_times / cell_counts)

large_cells <- c(500, 1000, 2000, 5000, 10000)

est_df <- data.frame(
  Cells = large_cells,
  CCAT_est = large_cells * ccat_per_cell,
  SR_est = large_cells * sr_per_cell
)

est_df$CCAT_fmt <- format_duration(est_df$CCAT_est)
est_df$SR_fmt <- format_duration(est_df$SR_est)

knitr::kable(
  est_df[, c("Cells", "CCAT_fmt", "SR_fmt")],
  col.names = c("Cells", "CCAT (estimated)", "SR (estimated)"),
  caption = "Estimated Performance for Large Datasets"
)
```

## Recommendations

### Dataset Size Guidelines

| Dataset Size | Recommended Method | Reasoning |
|--------------|-------------------|-----------|
| < 500 cells | Either | Both methods are fast |
| 500-2000 cells | SR preferred | Still manageable, more accurate |
| 2000-10000 cells | CCAT for screening | SR only on interesting subsets |
| > 10000 cells | CCAT | SR would be too slow |

### Workflow for Large Datasets

For very large datasets (>5000 cells):

1. **Screen with CCAT** (fast)
2. **Identify interesting populations** based on CCAT scores
3. **Apply SR to subsets** for validation

```{r workflow-example, eval=FALSE}
# Example workflow for large dataset

# 1. Quick CCAT screening
ccat_all <- CompCCAT(large_exp_matrix, net13Jun12.m)

# 2. Identify high-potency cells (top 10%)
high_potency_idx <- which(ccat_all > quantile(ccat_all, 0.9))

# 3. Detailed SR analysis on subset
#    (drop = FALSE keeps a matrix even if a single cell is selected)
exp_subset <- large_exp_matrix[, high_potency_idx, drop = FALSE]
integ_subset <- DoIntegPPI(exp_subset, net13Jun12.m)
sr_subset <- CompSRana(integ_subset)
```

## Memory Usage

```{r memory}
# Approximate memory for different sizes, counting 8 bytes per stored value
# and a network of genes x genes entries
mem_cells <- c(100, 500, 1000, 5000)
mem_genes <- 5000
bytes_per_value <- 8
bytes_per_mb <- 1e6

mem_df <- data.frame(
  Cells = mem_cells,
  Genes = rep(mem_genes, length(mem_cells)),
  Expression_MB = mem_cells * mem_genes * bytes_per_value / bytes_per_mb,
  Network_MB = rep(
    mem_genes * mem_genes * bytes_per_value / bytes_per_mb,
    length(mem_cells)
  )
)
mem_df$Total_MB <- mem_df$Expression_MB + mem_df$Network_MB

knitr::kable(
  mem_df,
  col.names = c(
    "Cells", "Genes", "Expression (MB)", "Network (MB)", "Total (MB)"
  ),
  caption = "Approximate Memory Requirements",
  digits = 1
)
```

## Session Info

```{r session}
sessionInfo()
```