Performance Benchmark

Introduction

This vignette benchmarks the computational performance of SCENT methods, helping users choose the appropriate method for their dataset size.

library(SCENT)
library(ggplot2)

data(net13Jun12.m)

Performance Comparison

Small Dataset (50 cells)

set.seed(42)
n_genes <- 5500

# Simulated counts: n_genes rows by 50 columns (one column per cell), drawn
# from a Poisson distribution with mean 5. Rows are labelled with the first
# n_genes row names of the network matrix.
exp_50 <- matrix(rpois(n_genes * 50, lambda = 5), nrow = n_genes)
rownames(exp_50) <- head(rownames(net13Jun12.m), n = n_genes)

# Elapsed seconds for each step; the results are assigned inside the timer so
# that later steps can reuse them.
t_ccat_50 <- system.time(
  ccat_50 <- CompCCAT(exp_50, net13Jun12.m)
)["elapsed"]

t_integ_50 <- system.time(
  integ_50 <- DoIntegPPI(exp_50, net13Jun12.m)
)["elapsed"]

t_sr_50 <- system.time(
  sr_50 <- CompSRana(integ_50)
)["elapsed"]

# The SR pipeline is DoIntegPPI followed by CompSRana.
t_pipeline_50 <- t_integ_50 + t_sr_50

cat("50 cells benchmark:\n")
#> 50 cells benchmark:
cat("  CCAT:", round(t_ccat_50, 3), "seconds\n")
#>   CCAT: 0.144 seconds
cat("  DoIntegPPI:", round(t_integ_50, 3), "seconds\n")
#>   DoIntegPPI: 0.66 seconds
cat("  CompSRana:", round(t_sr_50, 3), "seconds\n")
#>   CompSRana: 1.182 seconds
cat("  Total SR pipeline:", round(t_pipeline_50, 3), "seconds\n")
#>   Total SR pipeline: 1.842 seconds

Medium Dataset (200 cells)

# Simulated counts for 200 cells: n_genes rows by 200 columns, Poisson with
# mean 5, rows labelled with the first n_genes network row names.
exp_200 <- matrix(rpois(n_genes * 200, lambda = 5), nrow = n_genes)
rownames(exp_200) <- head(rownames(net13Jun12.m), n = n_genes)

# Elapsed seconds for each step; results are assigned inside the timer so the
# SR step can consume the integration output.
t_ccat_200 <- system.time(
  ccat_200 <- CompCCAT(exp_200, net13Jun12.m)
)["elapsed"]

t_integ_200 <- system.time(
  integ_200 <- DoIntegPPI(exp_200, net13Jun12.m)
)["elapsed"]

t_sr_200 <- system.time(
  sr_200 <- CompSRana(integ_200)
)["elapsed"]

# The SR pipeline is DoIntegPPI followed by CompSRana.
t_pipeline_200 <- t_integ_200 + t_sr_200

cat("200 cells benchmark:\n")
#> 200 cells benchmark:
cat("  CCAT:", round(t_ccat_200, 3), "seconds\n")
#>   CCAT: 0.155 seconds
cat("  DoIntegPPI:", round(t_integ_200, 3), "seconds\n")
#>   DoIntegPPI: 0.653 seconds
cat("  CompSRana:", round(t_sr_200, 3), "seconds\n")
#>   CompSRana: 2.666 seconds
cat("  Total SR pipeline:", round(t_pipeline_200, 3), "seconds\n")
#>   Total SR pipeline: 3.319 seconds

Performance Summary

# Gather the elapsed times from the two benchmarks: CCAT alone versus the
# whole SR pipeline (DoIntegPPI time plus CompSRana time).
ccat_elapsed <- c(t_ccat_50, t_ccat_200)
sr_elapsed <- c(t_integ_50 + t_sr_50, t_integ_200 + t_sr_200)

bench_df <- data.frame(
  Cells = c(50, 200),
  CCAT = ccat_elapsed,
  SR_Total = sr_elapsed
)

# Despite its name, this column is the SR / CCAT time ratio, i.e. how many
# times longer the SR pipeline took than CCAT.
bench_df[["Speedup"]] <- round(
  bench_df[["SR_Total"]] / bench_df[["CCAT"]],
  digits = 1
)

knitr::kable(
  bench_df,
  digits = 3,
  caption = "Performance Comparison",
  col.names = c("Cells", "CCAT (s)", "SR Total (s)", "SR/CCAT Ratio")
)
Performance Comparison
Cells CCAT (s) SR Total (s) SR/CCAT Ratio
50 0.144 1.842 12.8
200 0.155 3.319 21.4

Scaling Analysis

# Time CCAT and the SR step over a range of dataset sizes.
cell_counts <- c(20, 50, 100, 200)
ccat_times <- numeric(length(cell_counts))
sr_times <- numeric(length(cell_counts))

# The row labels are the same for every dataset size, so look them up once.
gene_ids <- head(rownames(net13Jun12.m), n_genes)

for (i in seq_along(cell_counts)) {
  n <- cell_counts[i]
  exp_test <- matrix(rpois(n_genes * n, 5), nrow = n_genes)
  rownames(exp_test) <- gene_ids

  # Select the elapsed time by name rather than by position.
  ccat_times[i] <- system.time(CompCCAT(exp_test, net13Jun12.m))[["elapsed"]]

  # DoIntegPPI runs outside the timer, so sr_times covers CompSRana only.
  integ_test <- DoIntegPPI(exp_test, net13Jun12.m)
  sr_times[i] <- system.time(CompSRana(integ_test))[["elapsed"]]
}

# Long format: one row per (method, dataset size) pair.
scaling_df <- data.frame(
  Cells = rep(cell_counts, 2),
  Time = c(ccat_times, sr_times),
  Method = rep(c("CCAT", "SR"), each = length(cell_counts))
)

# Name the colours so each method keeps its colour regardless of the order in
# which the Method levels happen to be sorted.
method_colours <- c(CCAT = "#3498db", SR = "#e74c3c")

ggplot(scaling_df, aes(x = Cells, y = Time, color = Method)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = method_colours) +
  labs(
    title = "Computational Scaling",
    subtitle = "Time vs Number of Cells",
    x = "Number of Cells",
    y = "Time (seconds)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  )

Extrapolated Performance

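Based on the scaling analysis, we can roughly estimate performance for larger datasets. The estimates assume that run time grows in proportion to the number of cells (using the average time per cell measured above), and the SR figures cover only the CompSRana step, not DoIntegPPI: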

# Render a duration in seconds as "<x>s" below one minute, otherwise as
# "<x>min"; both forms are rounded to one decimal place.
format_duration <- function(secs) {
  ifelse(
    secs < 60,
    paste0(round(secs, 1), "s"),
    paste0(round(secs / 60, 1), "min")
  )
}

# Average seconds per cell over the measured sizes, scaled up proportionally
# (no fixed-cost term) to the larger cell counts.
sr_per_cell <- mean(sr_times / cell_counts)
ccat_per_cell <- mean(ccat_times / cell_counts)

large_cells <- c(500, 1000, 2000, 5000, 10000)
est_sr <- large_cells * sr_per_cell
est_ccat <- large_cells * ccat_per_cell

est_df <- data.frame(
  Cells = large_cells,
  CCAT_est = est_ccat,
  SR_est = est_sr
)

# Human-readable versions of the two estimate columns.
est_df[["CCAT_fmt"]] <- format_duration(est_df[["CCAT_est"]])
est_df[["SR_fmt"]] <- format_duration(est_df[["SR_est"]])

knitr::kable(
  est_df[, c("Cells", "CCAT_fmt", "SR_fmt")],
  caption = "Estimated Performance for Large Datasets",
  col.names = c("Cells", "CCAT (estimated)", "SR (estimated)")
)
Estimated Performance for Large Datasets
Cells CCAT (estimated) SR (estimated)
500 1.5s 14.4s
1000 3s 28.7s
2000 6.1s 57.4s
5000 15.2s 2.4min
10000 30.5s 4.8min

Recommendations

Dataset Size Guidelines

Dataset Size Recommended Method Reasoning
< 500 cells Either Both methods are fast
500-2000 cells SR preferred Still manageable, more accurate
2000-10000 cells CCAT for screening SR only on interesting subsets
> 10000 cells CCAT SR would be too slow

Workflow for Large Datasets

For very large datasets (>5000 cells):

  1. Screen with CCAT (fast)
  2. Identify interesting populations based on CCAT scores
  3. Apply SR to subsets for validation
# Example workflow for large dataset
# `large_exp_matrix` is a placeholder for your own expression matrix, laid out
# like the benchmark matrices above (genes in rows, cells in columns).

# 1. Quick CCAT screening
ccat_all <- CompCCAT(large_exp_matrix, net13Jun12.m)

# 2. Identify high-potency cells (top 10%)
high_potency_idx <- which(ccat_all > quantile(ccat_all, probs = 0.9))

# 3. Detailed SR analysis on subset
# drop = FALSE keeps the subset a matrix (with its gene row names) even when
# only a single cell is selected; otherwise it would collapse to a vector.
exp_subset <- large_exp_matrix[, high_potency_idx, drop = FALSE]
integ_subset <- DoIntegPPI(exp_subset, net13Jun12.m)
sr_subset <- CompSRana(integ_subset)

Memory Usage

# Back-of-the-envelope memory footprint at 8 bytes per stored value: a
# genes x cells expression matrix plus a genes x genes network matrix.
mem_cells <- c(100, 500, 1000, 5000)
mem_genes <- 5000
bytes_per_value <- 8
bytes_per_mb <- 1e6

network_mb <- mem_genes * mem_genes * bytes_per_value / bytes_per_mb

mem_df <- data.frame(
  Cells = mem_cells,
  Genes = rep(mem_genes, length(mem_cells)),
  Expression_MB = mem_cells * mem_genes * bytes_per_value / bytes_per_mb,
  Network_MB = rep(network_mb, length(mem_cells))
)

mem_df[["Total_MB"]] <- mem_df[["Expression_MB"]] + mem_df[["Network_MB"]]

knitr::kable(
  mem_df,
  digits = 1,
  caption = "Approximate Memory Requirements",
  col.names = c(
    "Cells", "Genes", "Expression (MB)", "Network (MB)", "Total (MB)"
  )
)
Approximate Memory Requirements
Cells Genes Expression (MB) Network (MB) Total (MB)
100 5000 4 200 204
500 5000 20 200 220
1000 5000 40 200 240
5000 5000 200 200 400

Session Info

# Report the R version, platform and package versions used for this run.
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] Matrix_1.7-5   ggplot2_4.0.3  SCENT_2.0.0    rmarkdown_2.31
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6       jsonlite_2.0.0     dplyr_1.2.1        compiler_4.6.0    
#>  [5] tidyselect_1.2.1   Rcpp_1.1.1-1.1     jquerylib_0.1.4    scales_1.4.0      
#>  [9] yaml_2.3.12        fastmap_1.2.0      lattice_0.22-9     R6_2.6.1          
#> [13] labeling_0.4.3     generics_0.1.4     igraph_2.3.2       knitr_1.51        
#> [17] tibble_3.3.1       maketools_1.3.2    bslib_0.11.0       pillar_1.11.1     
#> [21] RColorBrewer_1.1-3 rlang_1.2.0        cachem_1.1.0       xfun_0.59         
#> [25] sass_0.4.10        sys_3.4.3          S7_0.2.2           otel_0.2.0        
#> [29] cli_3.6.6          withr_3.0.3        magrittr_2.0.5     digest_0.6.39     
#> [33] grid_4.6.0         lifecycle_1.0.5    vctrs_0.7.3        evaluate_1.0.5    
#> [37] glue_1.8.1         farver_2.1.2       buildtools_1.0.0   tools_4.6.0       
#> [41] pkgconfig_2.0.3    htmltools_0.5.9