Performance Benchmark

Introduction

This vignette benchmarks the run time of the two SCENT workflows, CCAT (`CompCCAT`) and SR (`DoIntegPPI` followed by `CompSRana`), to help users choose the appropriate method for their dataset size.

library(SCENT)
library(ggplot2)

data(net13Jun12.m)

Performance Comparison

Small Dataset (50 cells)

# Fix the RNG seed so the simulated counts are reproducible.
set.seed(42)
n_genes <- 5500

# Simulate a count matrix (Poisson, mean 5) with one row per gene and 50
# columns, labelling the rows with the first `n_genes` row names of the
# network matrix.
exp_50 <- matrix(
  rpois(n_genes * 50, lambda = 5),
  nrow = n_genes,
  dimnames = list(head(rownames(net13Jun12.m), n_genes), NULL)
)

# Time each step separately, keeping only the wall-clock ("elapsed") seconds.
# The results are assigned inside the timed expression so that CompSRana can
# consume the DoIntegPPI output afterwards.
t_ccat_50 <- system.time(
  ccat_50 <- CompCCAT(exp_50, net13Jun12.m)
)["elapsed"]

t_integ_50 <- system.time(
  integ_50 <- DoIntegPPI(exp_50, net13Jun12.m)
)["elapsed"]

t_sr_50 <- system.time(
  sr_50 <- CompSRana(integ_50)
)["elapsed"]

# Report the timings; the SR pipeline total is DoIntegPPI plus CompSRana.
cat("50 cells benchmark:\n")
#> 50 cells benchmark:
cat("  CCAT: ", round(t_ccat_50, digits = 3), " seconds\n", sep = "")
#>   CCAT: 0.144 seconds
cat("  DoIntegPPI: ", round(t_integ_50, digits = 3), " seconds\n", sep = "")
#>   DoIntegPPI: 0.66 seconds
cat("  CompSRana: ", round(t_sr_50, digits = 3), " seconds\n", sep = "")
#>   CompSRana: 1.182 seconds
cat(
  "  Total SR pipeline: ", round(t_integ_50 + t_sr_50, digits = 3),
  " seconds\n",
  sep = ""
)
#>   Total SR pipeline: 1.842 seconds

Medium Dataset (200 cells)

# Same simulation as the 50-cell case but with 200 columns; the random draws
# continue from the RNG stream seeded earlier in the vignette.
exp_200 <- matrix(
  rpois(n_genes * 200, lambda = 5),
  nrow = n_genes,
  dimnames = list(head(rownames(net13Jun12.m), n_genes), NULL)
)

# Wall-clock ("elapsed") seconds for each step, timed separately.
t_ccat_200 <- system.time(
  ccat_200 <- CompCCAT(exp_200, net13Jun12.m)
)["elapsed"]

t_integ_200 <- system.time(
  integ_200 <- DoIntegPPI(exp_200, net13Jun12.m)
)["elapsed"]

t_sr_200 <- system.time(
  sr_200 <- CompSRana(integ_200)
)["elapsed"]

# Report the timings; the SR pipeline total is DoIntegPPI plus CompSRana.
cat("200 cells benchmark:\n")
#> 200 cells benchmark:
cat("  CCAT: ", round(t_ccat_200, digits = 3), " seconds\n", sep = "")
#>   CCAT: 0.155 seconds
cat("  DoIntegPPI: ", round(t_integ_200, digits = 3), " seconds\n", sep = "")
#>   DoIntegPPI: 0.653 seconds
cat("  CompSRana: ", round(t_sr_200, digits = 3), " seconds\n", sep = "")
#>   CompSRana: 2.666 seconds
cat(
  "  Total SR pipeline: ", round(t_integ_200 + t_sr_200, digits = 3),
  " seconds\n",
  sep = ""
)
#>   Total SR pipeline: 3.319 seconds

Performance Summary

# Collect the elapsed times measured above into one row per dataset size.
# SR_Total adds DoIntegPPI and CompSRana because CompSRana consumes the
# DoIntegPPI output, so both steps are paid for on the SR route.
bench_df <- data.frame(
  Cells = c(50, 200),
  CCAT = c(t_ccat_50, t_ccat_200),
  SR_Total = c(t_integ_50 + t_sr_50, t_integ_200 + t_sr_200)
)

# Ratio of SR pipeline time to CCAT time (values above 1 mean SR is slower).
# Named for what it holds: it is a cost ratio, not a speedup.
bench_df$SR_CCAT_Ratio <- round(bench_df$SR_Total / bench_df$CCAT, 1)

# Column headers are supplied explicitly, so the rendered table does not
# depend on the data frame's internal column names.
knitr::kable(
  bench_df,
  col.names = c("Cells", "CCAT (s)", "SR Total (s)", "SR/CCAT Ratio"),
  caption = "Performance Comparison",
  digits = 3
)

Performance Comparison
Cells	CCAT (s)	SR Total (s)	SR/CCAT Ratio
50	0.144	1.842	12.8
200	0.155	3.319	21.4

Scaling Analysis

# Cell counts to benchmark, with one preallocated timing slot per count.
cell_counts <- c(20, 50, 100, 200)
ccat_times <- numeric(length(cell_counts))
sr_times <- numeric(length(cell_counts))

for (i in seq_along(cell_counts)) {
  n <- cell_counts[i]

  # Fresh simulated counts (Poisson, mean 5) with `n` columns.
  exp_test <- matrix(
    rpois(n_genes * n, lambda = 5),
    nrow = n_genes,
    dimnames = list(head(rownames(net13Jun12.m), n_genes), NULL)
  )

  ccat_times[i] <- system.time(CompCCAT(exp_test, net13Jun12.m))["elapsed"]

  # DoIntegPPI runs outside the timer, so `sr_times` covers CompSRana only.
  integ_test <- DoIntegPPI(exp_test, net13Jun12.m)
  sr_times[i] <- system.time(CompSRana(integ_test))["elapsed"]
}

# Long format for plotting: the CCAT rows first, then the SR rows.
scaling_df <- data.frame(
  Cells = c(cell_counts, cell_counts),
  Time = c(ccat_times, sr_times),
  Method = rep(c("CCAT", "SR"), each = length(cell_counts))
)

# Run time against cell count, one coloured line per method.
ggplot(scaling_df, aes(x = Cells, y = Time, colour = Method)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1) +
  scale_colour_manual(values = c(CCAT = "#3498db", SR = "#e74c3c")) +
  ggtitle("Computational Scaling", subtitle = "Time vs Number of Cells") +
  xlab("Number of Cells") +
  ylab("Time (seconds)") +
  theme_minimal() +
  # Centre the title block and move the legend above the panel.
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  )

Extrapolated Performance

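Based on the scaling analysis, we can estimate performance for larger datasets. The estimates multiply the average measured time per cell by the target cell count, so they assume run time is directly proportional to the number of cells; treat them as rough guides rather than measurements: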

# Proportional extrapolation: average the observed seconds-per-cell over the
# scaling runs, then multiply by each target cell count. This assumes time
# grows in direct proportion to the number of cells (no fixed overhead term).
ccat_per_cell <- mean(ccat_times / cell_counts)
sr_per_cell <- mean(sr_times / cell_counts)

large_cells <- c(500, 1000, 2000, 5000, 10000)
est_ccat <- ccat_per_cell * large_cells
est_sr <- sr_per_cell * large_cells

est_df <- data.frame(Cells = large_cells, CCAT_est = est_ccat, SR_est = est_sr)

# Human-readable labels: seconds below one minute, minutes otherwise, both
# rounded to one decimal place.
est_df[c("CCAT_fmt", "SR_fmt")] <- lapply(
  est_df[c("CCAT_est", "SR_est")],
  function(secs) {
    ifelse(
      secs < 60,
      paste0(round(secs, 1), "s"),
      paste0(round(secs / 60, 1), "min")
    )
  }
)

# Show only the formatted columns in the rendered table.
knitr::kable(
  est_df[c("Cells", "CCAT_fmt", "SR_fmt")],
  col.names = c("Cells", "CCAT (estimated)", "SR (estimated)"),
  caption = "Estimated Performance for Large Datasets"
)

Estimated Performance for Large Datasets
Cells	CCAT (estimated)	SR (estimated)
500	1.5s	14.4s
1000	3s	28.7s
2000	6.1s	57.4s
5000	15.2s	2.4min
10000	30.5s	4.8min

Recommendations

Dataset Size Guidelines

Dataset Size	Recommended Method	Reasoning
< 500 cells	Either	Both methods are fast
500-2000 cells	SR preferred	Still manageable, more accurate
2000-10000 cells	CCAT for screening	SR only on interesting subsets
> 10000 cells	CCAT	SR would be too slow

Workflow for Large Datasets

For very large datasets (>5000 cells):

1. Screen with CCAT (fast)
2. Identify interesting populations based on CCAT scores
3. Apply SR to subsets for validation

# Example workflow for large dataset
# `large_exp_matrix` stands for the user's own expression matrix; it is not
# created anywhere in this vignette.

# 1. Quick CCAT screening
ccat_all <- CompCCAT(large_exp_matrix, net13Jun12.m)

# 2. Identify high-potency cells (top 10%), i.e. those whose CCAT score lies
#    above the 90th percentile
high_potency_idx <- which(ccat_all > quantile(ccat_all, 0.9))

# 3. Detailed SR analysis on subset
# drop = FALSE keeps the subset a matrix even when only one cell is selected;
# without it a single column would collapse to a plain vector.
exp_subset <- large_exp_matrix[, high_potency_idx, drop = FALSE]
integ_subset <- DoIntegPPI(exp_subset, net13Jun12.m)
sr_subset <- CompSRana(integ_subset)

Memory Usage

# Back-of-the-envelope memory estimate for several dataset sizes. Every matrix
# entry is counted as 8 bytes (dense storage assumed) and sizes are reported
# in MB (1e6 bytes).
mem_df <- data.frame(
  Cells = c(100, 500, 1000, 5000),
  Genes = rep(5000, times = 4)
)

# Expression matrix: one entry per gene-cell pair.
mem_df$Expression_MB <- mem_df$Cells * mem_df$Genes * 8 / 1e6

# Network matrix: one entry per gene-gene pair, independent of the cell count.
mem_df$Network_MB <- mem_df$Genes * mem_df$Genes * 8 / 1e6

mem_df$Total_MB <- mem_df$Expression_MB + mem_df$Network_MB

# Render the estimate as a table, rounding every size to one decimal place.
knitr::kable(
  mem_df,
  digits = 1,
  caption = "Approximate Memory Requirements",
  col.names = c(
    "Cells", "Genes", "Expression (MB)", "Network (MB)", "Total (MB)"
  )
)

Approximate Memory Requirements
Cells	Genes	Expression (MB)	Network (MB)	Total (MB)
100	5000	4	200	204
500	5000	20	200	220
1000	5000	40	200	240
5000	5000	200	200	400

Session Info

# Record the R version, platform, and package versions used for this render,
# since the timings above depend on the environment they were measured in.
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] Matrix_1.7-5   ggplot2_4.0.3  SCENT_2.0.0    rmarkdown_2.31
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6       jsonlite_2.0.0     dplyr_1.2.1        compiler_4.6.0    
#>  [5] tidyselect_1.2.1   Rcpp_1.1.1-1.1     jquerylib_0.1.4    scales_1.4.0      
#>  [9] yaml_2.3.12        fastmap_1.2.0      lattice_0.22-9     R6_2.6.1          
#> [13] labeling_0.4.3     generics_0.1.4     igraph_2.3.2       knitr_1.51        
#> [17] tibble_3.3.1       maketools_1.3.2    bslib_0.11.0       pillar_1.11.1     
#> [21] RColorBrewer_1.1-3 rlang_1.2.0        cachem_1.1.0       xfun_0.59         
#> [25] sass_0.4.10        sys_3.4.3          S7_0.2.2           otel_0.2.0        
#> [29] cli_3.6.6          withr_3.0.3        magrittr_2.0.5     digest_0.6.39     
#> [33] grid_4.6.0         lifecycle_1.0.5    vctrs_0.7.3        evaluate_1.0.5    
#> [37] glue_1.8.1         farver_2.1.2       buildtools_1.0.0   tools_4.6.0       
#> [41] pkgconfig_2.0.3    htmltools_0.5.9