---
title: "Performance Benchmark"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Performance Benchmark}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Defaults applied to every code chunk in this vignette.
knitr::opts_chunk$set(
  # Show source and output in a single block, with output lines prefixed.
  collapse = TRUE,
  comment = "#>",
  # Keep messages and warnings out of the rendered document.
  message = FALSE,
  warning = FALSE,
  # Default figure dimensions.
  fig.width = 7,
  fig.height = 5
)
```

## Introduction
 
This vignette benchmarks the computational performance of SCENT methods,
helping users choose the appropriate method for their dataset size.

```{r load}
# Attach the packages used throughout the vignette.
library(SCENT)
library(ggplot2)

# Load the net13Jun12.m object; it is passed as the second argument to every
# benchmarked call below and supplies the gene identifiers for the test data.
data("net13Jun12.m")
```

## Performance Comparison

### Small Dataset (50 cells)

```{r bench-small}
# Seed the RNG so the simulated counts are reproducible.
set.seed(42)
n_genes <- 5500

# Simulated expression matrix: n_genes rows by 50 columns of Poisson(5)
# counts, with rows labelled by the first n_genes row names of the network.
exp_50 <- matrix(
  rpois(n = n_genes * 50, lambda = 5),
  nrow = n_genes,
  dimnames = list(head(rownames(net13Jun12.m), n_genes), NULL)
)

# Elapsed (wall-clock) seconds for each step. The results are assigned inside
# system.time() so the SR step can reuse the integration output.
t_ccat_50 <- system.time(
  ccat_50 <- CompCCAT(exp_50, net13Jun12.m)
)["elapsed"]

t_integ_50 <- system.time(
  integ_50 <- DoIntegPPI(exp_50, net13Jun12.m)
)["elapsed"]

t_sr_50 <- system.time(
  sr_50 <- CompSRana(integ_50)
)["elapsed"]

# Print the timings; "Total SR pipeline" is integration plus SR computation.
cat(
  "50 cells benchmark:\n",
  "  CCAT: ", round(t_ccat_50, 3), " seconds\n",
  "  DoIntegPPI: ", round(t_integ_50, 3), " seconds\n",
  "  CompSRana: ", round(t_sr_50, 3), " seconds\n",
  "  Total SR pipeline: ", round(t_integ_50 + t_sr_50, 3), " seconds\n",
  sep = ""
)
```

### Medium Dataset (200 cells)

```{r bench-med}
# Simulated expression matrix: n_genes rows by 200 columns of Poisson(5)
# counts, labelled with the first n_genes row names of the network.
exp_200 <- matrix(rpois(n_genes * 200, lambda = 5), nrow = n_genes, ncol = 200)
rownames(exp_200) <- head(rownames(net13Jun12.m), n = n_genes)

# Elapsed (wall-clock) seconds for each step of both methods.
t_ccat_200 <- system.time(
  ccat_200 <- CompCCAT(exp_200, net13Jun12.m)
)["elapsed"]
t_integ_200 <- system.time(
  integ_200 <- DoIntegPPI(exp_200, net13Jun12.m)
)["elapsed"]
t_sr_200 <- system.time(
  sr_200 <- CompSRana(integ_200)
)["elapsed"]

# Print one "<label> <seconds> seconds" line per timing.
report_seconds <- function(label, secs) {
  cat(label, round(secs, 3), "seconds\n")
}

cat("200 cells benchmark:\n")
report_seconds("  CCAT:", t_ccat_200)
report_seconds("  DoIntegPPI:", t_integ_200)
report_seconds("  CompSRana:", t_sr_200)
report_seconds("  Total SR pipeline:", t_integ_200 + t_sr_200)
```

### Performance Summary

```{r summary-table}
# Collect the elapsed seconds measured in the 50- and 200-cell benchmarks.
# unname() drops any names carried by the timing values so the data frame
# holds plain numeric columns.
bench_df <- data.frame(
  Cells = c(50, 200),
  CCAT = unname(c(t_ccat_50, t_ccat_200)),
  SR_Total = unname(c(t_integ_50 + t_sr_50, t_integ_200 + t_sr_200))
)

# SR pipeline time divided by CCAT time (values above 1 mean SR is slower).
# The column name matches the "SR/CCAT Ratio" header used in the table.
# A CCAT time of 0 (below timer resolution) would give Inf/NaN, so report NA.
bench_df$SR_CCAT_Ratio <- ifelse(
  bench_df$CCAT > 0,
  round(bench_df$SR_Total / bench_df$CCAT, 1),
  NA_real_
)

knitr::kable(
  bench_df,
  col.names = c("Cells", "CCAT (s)", "SR Total (s)", "SR/CCAT Ratio"),
  caption = "Performance Comparison",
  digits = 3
)
```

## Scaling Analysis

```{r scaling, fig.height=5}
# Time both methods on simulated data sets of increasing size.
cell_counts <- c(20, 50, 100, 200)
n_sizes <- length(cell_counts)
ccat_times <- numeric(n_sizes)
sr_times <- numeric(n_sizes)

# Row labels are the same for every simulated matrix.
gene_ids <- head(rownames(net13Jun12.m), n_genes)

for (i in seq_len(n_sizes)) {
  n <- cell_counts[i]
  exp_test <- matrix(rpois(n_genes * n, 5), nrow = n_genes)
  rownames(exp_test) <- gene_ids

  ccat_times[i] <- system.time(CompCCAT(exp_test, net13Jun12.m))["elapsed"]

  # The integration step runs outside system.time(), so the SR timing below
  # covers CompSRana() only.
  integ_test <- DoIntegPPI(exp_test, net13Jun12.m)
  sr_times[i] <- system.time(CompSRana(integ_test))["elapsed"]
}

# Long format: one row per (method, cell count) pair, CCAT rows first.
scaling_df <- rbind(
  data.frame(Cells = cell_counts, Time = ccat_times, Method = "CCAT"),
  data.frame(Cells = cell_counts, Time = sr_times, Method = "SR")
)

method_colours <- c(CCAT = "#3498db", SR = "#e74c3c")

ggplot(scaling_df, aes(Cells, Time, colour = Method)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1) +
  scale_colour_manual(values = method_colours) +
  ggtitle("Computational Scaling", subtitle = "Time vs Number of Cells") +
  xlab("Number of Cells") +
  ylab("Time (seconds)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "top"
  )
```

## Extrapolated Performance

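Based on the scaling analysis, we can estimate performance for larger datasets. The estimates assume that run time grows linearly with the number of cells, so treat them as rough guides: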

```{r extrapolate}
# Average seconds per cell over the measured sizes. Multiplying by a cell
# count gives a straight-line estimate through the origin.
ccat_per_cell <- mean(ccat_times / cell_counts)
sr_per_cell <- mean(sr_times / cell_counts)

large_cells <- c(500, 1000, 2000, 5000, 10000)
est_ccat <- ccat_per_cell * large_cells
est_sr <- sr_per_cell * large_cells

# Format seconds as "<x>s" below one minute and as "<x>min" otherwise,
# rounded to one decimal place.
format_duration <- function(secs) {
  ifelse(
    secs < 60,
    paste0(round(secs, 1), "s"),
    paste0(round(secs / 60, 1), "min")
  )
}

est_df <- data.frame(
  Cells = large_cells,
  CCAT_est = est_ccat,
  SR_est = est_sr
)
est_df$CCAT_fmt <- format_duration(est_df$CCAT_est)
est_df$SR_fmt <- format_duration(est_df$SR_est)

# Only the formatted columns are shown in the rendered table.
shown_cols <- c("Cells", "CCAT_fmt", "SR_fmt")
knitr::kable(
  est_df[, shown_cols],
  col.names = c("Cells", "CCAT (estimated)", "SR (estimated)"),
  caption = "Estimated Performance for Large Datasets"
)
```

## Recommendations

### Dataset Size Guidelines

| Dataset Size | Recommended Method | Reasoning |
|--------------|-------------------|-----------|
| < 500 cells | Either | Both methods are fast |
| 500-2000 cells | SR preferred | Still manageable, more accurate |
| 2000-10000 cells | CCAT for screening | SR only on interesting subsets |
| > 10000 cells | CCAT | SR would be too slow |

### Workflow for Large Datasets

For very large datasets (>5000 cells):

1. **Screen with CCAT** (fast)
2. **Identify interesting populations** based on CCAT scores
3. **Apply SR to subsets** for validation

```{r workflow-example, eval=FALSE}
# Example workflow for large dataset.
# `large_exp_matrix` stands in for the user's own expression matrix; cells
# are selected by column below, so cells are expected in columns.

# 1. Quick CCAT screening
ccat_all <- CompCCAT(large_exp_matrix, net13Jun12.m)

# 2. Identify high-potency cells (top 10%)
high_potency_idx <- which(ccat_all > quantile(ccat_all, probs = 0.9))

# 3. Detailed SR analysis on subset
# drop = FALSE keeps the subset a matrix even if only one cell is selected;
# without it, a single-column result collapses to a plain vector.
exp_subset <- large_exp_matrix[, high_potency_idx, drop = FALSE]
integ_subset <- DoIntegPPI(exp_subset, net13Jun12.m)
sr_subset <- CompSRana(integ_subset)
```

## Memory Usage

```{r memory}
# Back-of-the-envelope memory estimate: element count times 8 bytes,
# expressed in MB (1e6 bytes).
bytes_per_value <- 8
bytes_per_mb <- 1e6
mem_cells <- c(100, 500, 1000, 5000)
mem_genes <- 5000

mem_df <- data.frame(
  Cells = mem_cells,
  Genes = rep(mem_genes, length(mem_cells)),
  # Expression matrix: cells x genes values.
  Expression_MB = mem_cells * mem_genes * bytes_per_value / bytes_per_mb,
  # Network: genes x genes values, independent of the number of cells.
  Network_MB = rep(
    mem_genes * mem_genes * bytes_per_value / bytes_per_mb,
    length(mem_cells)
  )
)

mem_df$Total_MB <- mem_df$Expression_MB + mem_df$Network_MB

knitr::kable(
  mem_df,
  col.names = c(
    "Cells", "Genes", "Expression (MB)", "Network (MB)", "Total (MB)"
  ),
  caption = "Approximate Memory Requirements",
  digits = 1
)
```

## Session Info

```{r session}
# Record the R version and package versions used to build this vignette.
utils::sessionInfo()
```