This vignette benchmarks the computational performance of SCENT methods, helping users choose the appropriate method for their dataset size.
# Fix the RNG seed so the simulated counts are the same on every run.
set.seed(42)
n_genes <- 5500

# Simulated Poisson(5) count matrix with n_genes rows and 50 columns (cells);
# rows are labelled with the first n_genes row names of the network matrix.
exp_50 <- matrix(
  rpois(n_genes * 50, 5),
  nrow = n_genes,
  ncol = 50,
  dimnames = list(head(rownames(net13Jun12.m), n_genes), NULL)
)

# Elapsed (wall-clock) time of each step. The assignments inside system.time()
# keep each result available afterwards (integ_50 feeds CompSRana()).
t_ccat_50 <- system.time(
  ccat_50 <- CompCCAT(exp_50, net13Jun12.m)
)["elapsed"]
t_integ_50 <- system.time(
  integ_50 <- DoIntegPPI(exp_50, net13Jun12.m)
)["elapsed"]
t_sr_50 <- system.time(
  sr_50 <- CompSRana(integ_50)
)["elapsed"]

# Report the timings; the SR pipeline total is DoIntegPPI plus CompSRana.
cat("50 cells benchmark:\n")
#> 50 cells benchmark:
cat(" CCAT:", round(t_ccat_50, 3), "seconds\n")
#> CCAT: 0.144 seconds
cat(" DoIntegPPI:", round(t_integ_50, 3), "seconds\n")
#> DoIntegPPI: 0.66 seconds
cat(" CompSRana:", round(t_sr_50, 3), "seconds\n")
#> CompSRana: 1.182 seconds
cat(" Total SR pipeline:", round(t_integ_50 + t_sr_50, 3), "seconds\n")
#> Total SR pipeline: 1.842 seconds

# Same benchmark with 200 simulated cells: a Poisson(5) count matrix with
# n_genes rows, labelled with the first n_genes row names of the network.
exp_200 <- matrix(rpois(n_genes * 200, 5), nrow = n_genes)
rownames(exp_200) <- head(rownames(net13Jun12.m), n_genes)

# Element 3 of system.time()'s result is the elapsed (wall-clock) time.
t_ccat_200 <- system.time({
  ccat_200 <- CompCCAT(exp_200, net13Jun12.m)
})[3]
t_integ_200 <- system.time({
  integ_200 <- DoIntegPPI(exp_200, net13Jun12.m)
})[3]
t_sr_200 <- system.time({
  sr_200 <- CompSRana(integ_200)
})[3]

# Report the timings; the SR pipeline total is DoIntegPPI plus CompSRana.
cat("200 cells benchmark:\n")
#> 200 cells benchmark:
cat(" CCAT:", round(t_ccat_200, 3), "seconds\n")
#> CCAT: 0.155 seconds
cat(" DoIntegPPI:", round(t_integ_200, 3), "seconds\n")
#> DoIntegPPI: 0.653 seconds
cat(" CompSRana:", round(t_sr_200, 3), "seconds\n")
#> CompSRana: 2.666 seconds
cat(" Total SR pipeline:", round(t_integ_200 + t_sr_200, 3), "seconds\n")
#> Total SR pipeline: 3.319 seconds

# Collect the 50- and 200-cell timings into one comparison table.
# SR_Total is the full SR pipeline: DoIntegPPI plus CompSRana.
bench_df <- data.frame(
  Cells = c(50, 200),
  CCAT = c(t_ccat_50, t_ccat_200),
  SR_Total = c(t_integ_50 + t_sr_50, t_integ_200 + t_sr_200)
)
# Despite its name, `Speedup` is SR time divided by CCAT time, i.e. how many
# times longer the SR pipeline takes; it is displayed as "SR/CCAT Ratio".
bench_df$Speedup <- round(bench_df$SR_Total / bench_df$CCAT, 1)
knitr::kable(
  bench_df,
  col.names = c("Cells", "CCAT (s)", "SR Total (s)", "SR/CCAT Ratio"),
  caption = "Performance Comparison",
  digits = 3
)

| Cells | CCAT (s) | SR Total (s) | SR/CCAT Ratio |
|---|---|---|---|
| 50 | 0.144 | 1.842 | 12.8 |
| 200 | 0.155 | 3.319 | 21.4 |
# Time both methods over a range of simulated dataset sizes.
cell_counts <- c(20, 50, 100, 200)
n_sizes <- length(cell_counts)
ccat_times <- numeric(n_sizes)
sr_times <- numeric(n_sizes)

# The row labels are the same for every simulated matrix, so look them up once.
gene_ids <- head(rownames(net13Jun12.m), n_genes)

for (idx in seq_len(n_sizes)) {
  n_cells <- cell_counts[idx]
  exp_test <- matrix(
    rpois(n_genes * n_cells, 5),
    nrow = n_genes,
    dimnames = list(gene_ids, NULL)
  )
  ccat_times[idx] <- system.time(CompCCAT(exp_test, net13Jun12.m))["elapsed"]
  # DoIntegPPI() runs outside the timer, so sr_times covers CompSRana() only.
  integ_test <- DoIntegPPI(exp_test, net13Jun12.m)
  sr_times[idx] <- system.time(CompSRana(integ_test))["elapsed"]
}

# Long format for plotting: one row per (cell count, method) pair.
scaling_df <- rbind(
  data.frame(Cells = cell_counts, Time = ccat_times, Method = "CCAT"),
  data.frame(Cells = cell_counts, Time = sr_times, Method = "SR")
)
# Plot run time against cell count, one line per method.
ggplot(scaling_df, aes(x = Cells, y = Time, color = Method)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1) +
  # Name the colours so each method keeps its colour regardless of the order
  # in which the Method values are sorted.
  scale_color_manual(values = c(CCAT = "#3498db", SR = "#e74c3c")) +
  labs(
    title = "Computational Scaling",
    subtitle = "Time vs Number of Cells",
    x = "Number of Cells",
    y = "Time (seconds)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  )

Based on the scaling analysis, we can estimate performance for larger datasets:
# Extrapolate to larger datasets by scaling the mean observed time per cell.
# This treats run time as directly proportional to the number of cells, with
# no fixed per-run cost.
# NOTE(review): the 50- vs 200-cell CCAT timings reported earlier (0.144 s vs
# 0.155 s) suggest a large fixed cost, so these figures may overestimate;
# a fit with an intercept may be more realistic -- confirm.
sr_per_cell <- mean(sr_times / cell_counts)
ccat_per_cell <- mean(ccat_times / cell_counts)

large_cells <- c(500, 1000, 2000, 5000, 10000)
est_sr <- sr_per_cell * large_cells
est_ccat <- ccat_per_cell * large_cells

est_df <- data.frame(
  Cells = large_cells,
  CCAT_est = est_ccat,
  SR_est = est_sr
)

# Show durations under a minute in seconds and longer ones in minutes,
# both rounded to one decimal place.
fmt_duration <- function(secs) {
  ifelse(
    secs < 60,
    paste0(round(secs, 1), "s"),
    paste0(round(secs / 60, 1), "min")
  )
}
est_df$CCAT_fmt <- fmt_duration(est_df$CCAT_est)
est_df$SR_fmt <- fmt_duration(est_df$SR_est)
# Render only the human-readable (formatted) estimate columns.
knitr::kable(
  est_df[, c("Cells", "CCAT_fmt", "SR_fmt")],
  col.names = c("Cells", "CCAT (estimated)", "SR (estimated)"),
  caption = "Estimated Performance for Large Datasets"
)

| Cells | CCAT (estimated) | SR (estimated) |
|---|---|---|
| 500 | 1.5s | 14.4s |
| 1000 | 3s | 28.7s |
| 2000 | 6.1s | 57.4s |
| 5000 | 15.2s | 2.4min |
| 10000 | 30.5s | 4.8min |
| Dataset Size | Recommended Method | Reasoning |
|---|---|---|
| < 500 cells | Either | Both methods are fast |
| 500-2000 cells | SR preferred | Still manageable, more accurate |
| 2000-10000 cells | CCAT for screening | SR only on interesting subsets |
| > 10000 cells | CCAT | SR would be too slow |
For very large datasets (>5000 cells):
# Example workflow for large dataset
# 1. Quick CCAT screening
ccat_all <- CompCCAT(large_exp_matrix, net13Jun12.m)
# 2. Identify high-potency cells (top 10%)
high_potency_idx <- which(ccat_all > quantile(ccat_all, 0.9))
# 3. Detailed SR analysis on subset
# drop = FALSE keeps the subset a matrix even when only one cell is selected;
# without it a single column would collapse to a plain vector.
exp_subset <- large_exp_matrix[, high_potency_idx, drop = FALSE]
integ_subset <- DoIntegPPI(exp_subset, net13Jun12.m)
sr_subset <- CompSRana(integ_subset)

# Approximate memory for different sizes
# Back-of-the-envelope memory estimate: number of matrix entries times 8 bytes
# per value, reported in MB (1e6 bytes). Presumably this models dense
# double-precision storage -- sparse representations would need less.
mem_cells <- c(100, 500, 1000, 5000)
mem_genes <- 5000
bytes_per_value <- 8

mem_df <- data.frame(
  Cells = mem_cells,
  Genes = rep(mem_genes, length(mem_cells)),
  # Expression matrix: genes x cells entries.
  Expression_MB = mem_cells * mem_genes * bytes_per_value / 1e6,
  # Network matrix: genes x genes entries, independent of the cell count.
  Network_MB = rep(
    mem_genes * mem_genes * bytes_per_value / 1e6,
    length(mem_cells)
  )
)
mem_df$Total_MB <- mem_df$Expression_MB + mem_df$Network_MB
knitr::kable(
  mem_df,
  col.names = c("Cells", "Genes", "Expression (MB)", "Network (MB)", "Total (MB)"),
  caption = "Approximate Memory Requirements",
  digits = 1
)

| Cells | Genes | Expression (MB) | Network (MB) | Total (MB) |
|---|---|---|---|---|
| 100 | 5000 | 4 | 200 | 204 |
| 500 | 5000 | 20 | 200 | 220 |
| 1000 | 5000 | 40 | 200 | 240 |
| 5000 | 5000 | 200 | 200 | 400 |
# Print the R version, platform and package versions used for this run.
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] Matrix_1.7-5 ggplot2_4.0.3 SCENT_2.0.0 rmarkdown_2.31
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.6 jsonlite_2.0.0 dplyr_1.2.1 compiler_4.6.0
#> [5] tidyselect_1.2.1 Rcpp_1.1.1-1.1 jquerylib_0.1.4 scales_1.4.0
#> [9] yaml_2.3.12 fastmap_1.2.0 lattice_0.22-9 R6_2.6.1
#> [13] labeling_0.4.3 generics_0.1.4 igraph_2.3.2 knitr_1.51
#> [17] tibble_3.3.1 maketools_1.3.2 bslib_0.11.0 pillar_1.11.1
#> [21] RColorBrewer_1.1-3 rlang_1.2.0 cachem_1.1.0 xfun_0.59
#> [25] sass_0.4.10 sys_3.4.3 S7_0.2.2 otel_0.2.0
#> [29] cli_3.6.6 withr_3.0.3 magrittr_2.0.5 digest_0.6.39
#> [33] grid_4.6.0 lifecycle_1.0.5 vctrs_0.7.3 evaluate_1.0.5
#> [37] glue_1.8.1 farver_2.1.2 buildtools_1.0.0 tools_4.6.0
#> [41] pkgconfig_2.0.3 htmltools_0.5.9