---
title: "Performance Benchmarking"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Performance Benchmarking}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  # Output formatting: merge source and output, prefix output lines with "#>"
  collapse = TRUE,
  comment = "#>",
  # Keep package messages and warnings out of the rendered vignette
  message = FALSE,
  warning = FALSE,
  # Default figure geometry (individual chunks may override it)
  fig.width = 7,
  fig.height = 5,
  fig.align = "center"
)
```

## Overview

This vignette benchmarks MAGICR performance across different data sizes and parameter configurations, helping users optimize their analysis workflow.

## Setup

```{r}
library(MAGICR)
library(Matrix)

# Helper function for timing
#
# Evaluates `expr` exactly once, prints the elapsed wall-clock time and
# returns the value together with the timing.
#
# Arguments:
#   expr  The code to time, passed unevaluated (e.g. `benchmark(magic(x))`).
#   name  Label printed in front of the elapsed time.
#
# Returns (invisibly) a list with
#   result  the value produced by `expr`
#   time    elapsed seconds as a plain numeric
benchmark <- function(expr, name = "Operation") {
  start <- Sys.time()
  # `expr` is a lazily evaluated promise, so it runs here, inside the timed
  # window. force() evaluates it once; eval() would additionally re-evaluate
  # the *value* whenever that value is itself a call, symbol or expression.
  result <- force(expr)
  elapsed <- as.numeric(difftime(Sys.time(), start, units = "secs"))
  cat(sprintf("%s: %.2f seconds\n", name, elapsed))
  invisible(list(result = result, time = elapsed))
}
```

## Data Size Impact

### Generating Test Data

```{r}
# Generate sparse test matrices of varying sizes
#
# Builds a cells-by-genes count matrix from Poisson draws and then zeroes
# out a random fraction of the entries.
#
# Arguments:
#   n_cells   Number of rows (named "Cell1", "Cell2", ...).
#   n_genes   Number of columns (named "Gene1", "Gene2", ...).
#   sparsity  Probability that an entry is forced to zero.
#   seed      Seed passed to set.seed() so repeated calls give identical
#             data; use NULL to leave the RNG state untouched.
#   lambda    Rate of the Poisson distribution the counts are drawn from.
#
# Returns a numeric matrix with n_cells rows and n_genes columns.
generate_test_data <- function(n_cells, n_genes, sparsity = 0.9,
                               seed = 42, lambda = 2) {
  # Note: seeding resets the global RNG stream, as the vignette relies on
  # getting the same matrix for the same dimensions.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  n_values <- n_cells * n_genes
  data <- matrix(rpois(n_values, lambda = lambda),
                 nrow = n_cells, ncol = n_genes)

  # Drop-out mask: each entry is zeroed independently with prob. `sparsity`
  data[runif(n_values) < sparsity] <- 0

  colnames(data) <- paste0("Gene", seq_len(n_genes))
  rownames(data) <- paste0("Cell", seq_len(n_cells))
  data
}

# Test datasets: one c(n_cells, n_genes) pair per dataset label
sizes <- list()
sizes$small <- c(100, 500)
sizes$medium <- c(500, 1000)
sizes$large <- c(1000, 2000)
```

### Benchmarking Different Sizes

```{r}
# Elapsed time in seconds, keyed by dataset label
results <- list()

for (size_name in names(sizes)) {
  # Each entry of `sizes` is c(n_cells, n_genes)
  dims <- sizes[[size_name]]
  n_cells <- dims[1]
  n_genes <- dims[2]

  header <- sprintf("\n=== %s dataset: %d cells x %d genes ===\n",
                    size_name, n_cells, n_genes)
  cat(header)

  test_data <- generate_test_data(n_cells, n_genes)

  # Time a single MAGIC run on this dataset
  label <- sprintf("MAGIC (%s)", size_name)
  bench <- benchmark(
    magic(test_data, t = 3, verbose = FALSE),
    name = label
  )

  results[[size_name]] <- bench$time
}
```

### Visualization

```{r fig.width=6, fig.height=4}
# Bar chart of the elapsed seconds, one bar per dataset label
runtime_secs <- unlist(results)
bar_colors <- c("#3498db", "#e74c3c", "#2ecc71")

barplot(
  runtime_secs,
  names.arg = names(results),
  col = bar_colors,
  main = "MAGIC Runtime by Dataset Size",
  ylab = "Time (seconds)",
  xlab = "Dataset Size"
)
```

## Solver Comparison

### Exact vs Approximate

```{r}
# Medium-sized test data, shared by both solver runs
test_data <- generate_test_data(500, 1000)

cat("=== Solver Comparison ===\n\n")

# Run with solver = "exact"
exact_bench <- benchmark(
  magic(
    test_data,
    t = 3,
    solver = "exact",
    verbose = FALSE
  ),
  name = "Exact solver"
)

# Run with solver = "approximate" on 50 principal components
approx_bench <- benchmark(
  magic(
    test_data,
    t = 3,
    solver = "approximate",
    npca = 50,
    verbose = FALSE
  ),
  name = "Approximate solver"
)

# Ratio of the two elapsed times
speedup <- exact_bench$time / approx_bench$time
cat(sprintf("\nSpeedup: %.1fx\n", speedup))
```

### Accuracy Comparison

```{r fig.width=8, fig.height=4}
# Compare the matrices returned by the two solver runs
exact_result <- as.matrix(exact_bench$result)
approx_result <- as.matrix(approx_bench$result)

# Flatten once; the vectors are reused for the correlation and the scatter
exact_vec <- as.vector(exact_result)
approx_vec <- as.vector(approx_result)

# Correlation between methods
cor_val <- cor(exact_vec, approx_vec)
cat(sprintf("Correlation between exact and approximate: %.4f\n", cor_val))

# Visualization: two panels side by side; keep the old layout to restore it
old_par <- par(mfrow = c(1, 2))

# Scatter plot of a random subset of entries. Taking the first 5000 values
# would show only the leading columns of the matrix and would pad with NA
# when the matrix holds fewer than 5000 entries.
set.seed(42)  # fixed seed so the plotted subset is reproducible
n_show <- min(5000, length(exact_vec))
shown <- sample.int(length(exact_vec), n_show)
plot(exact_vec[shown],
     approx_vec[shown],
     pch = 16, col = adjustcolor("#3498db", 0.3),
     xlab = "Exact Solver", ylab = "Approximate Solver",
     main = sprintf("Solver Agreement (r = %.3f)", cor_val))
abline(0, 1, col = "red", lwd = 2)

# Difference distribution (named so that it does not mask base::diff)
solver_diff <- exact_vec - approx_vec
hist(solver_diff, breaks = 50,
     main = "Difference Distribution",
     xlab = "Exact - Approximate",
     col = "#e74c3c", border = "white")
abline(v = 0, col = "black", lwd = 2, lty = 2)

par(old_par)
```

## Parameter Impact

### Effect of t (Diffusion Time)

```{r fig.width=8, fig.height=4}
test_data <- generate_test_data(200, 500)

t_values <- c(1, 2, 3, 5, 10)

# Elapsed seconds of one MAGIC run per diffusion time
t_times <- vapply(t_values, function(t_val) {
  started <- Sys.time()
  magic(test_data, t = t_val, verbose = FALSE)
  as.numeric(difftime(Sys.time(), started, units = "secs"))
}, numeric(1))

# Runtimes expressed as multiples of the first (t = 1) run
relative_times <- t_times / t_times[1]

par(mfrow = c(1, 2))

# Left panel: absolute runtime
plot(
  t_values, t_times,
  type = "b", pch = 16, col = "#3498db",
  xlab = "Diffusion Time (t)", ylab = "Runtime (seconds)",
  main = "Runtime vs Diffusion Time"
)

# Right panel: relative increase
plot(
  t_values, relative_times,
  type = "b", pch = 16, col = "#e74c3c",
  xlab = "Diffusion Time (t)", ylab = "Relative Runtime",
  main = "Relative Runtime Increase"
)
```

### Effect of knn

```{r fig.width=6, fig.height=4}
knn_values <- c(3, 5, 10, 15, 20)

# Elapsed seconds of one MAGIC run per knn setting
knn_times <- vapply(knn_values, function(k) {
  started <- Sys.time()
  magic(test_data, t = 3, knn = k, verbose = FALSE)
  as.numeric(difftime(Sys.time(), started, units = "secs"))
}, numeric(1))

plot(
  knn_values, knn_times,
  type = "b", pch = 16, col = "#2ecc71",
  xlab = "k-Nearest Neighbors", ylab = "Runtime (seconds)",
  main = "Runtime vs knn Parameter"
)
```

### Effect of npca

```{r fig.width=6, fig.height=4}
npca_values <- c(20, 50, 100, 150)

# Elapsed seconds of one MAGIC run per npca setting
npca_times <- vapply(npca_values, function(n_comp) {
  started <- Sys.time()
  magic(test_data, t = 3, npca = n_comp, verbose = FALSE)
  as.numeric(difftime(Sys.time(), started, units = "secs"))
}, numeric(1))

plot(
  npca_values, npca_times,
  type = "b", pch = 16, col = "#9b59b6",
  xlab = "Number of PCA Components", ylab = "Runtime (seconds)",
  main = "Runtime vs npca Parameter"
)
```

## Memory Usage

### Sparse vs Dense Input

```{r}
# Same data twice: a base matrix and its sparse Matrix counterpart
test_dense <- generate_test_data(300, 800)
test_sparse <- Matrix::Matrix(test_dense, sparse = TRUE)

# In-memory footprint of each representation, in bytes
bytes_per_mb <- 1024^2
dense_bytes <- as.numeric(object.size(test_dense))
sparse_bytes <- as.numeric(object.size(test_sparse))

cat("=== Memory Comparison ===\n")
cat(sprintf("Dense matrix size: %.2f MB\n", dense_bytes / bytes_per_mb))
cat(sprintf("Sparse matrix size: %.2f MB\n", sparse_bytes / bytes_per_mb))
cat(sprintf("Compression ratio: %.1fx\n\n", dense_bytes / sparse_bytes))

# Time MAGIC on each representation
dense_bench <- benchmark(
  magic(
    test_dense,
    t = 3,
    verbose = FALSE
  ),
  name = "Dense input"
)

sparse_bench <- benchmark(
  magic(
    test_sparse,
    t = 3,
    verbose = FALSE
  ),
  name = "Sparse input"
)
```

## Recommendations

Based on the benchmarking results above, we recommend the following settings:

### Small Datasets (<1,000 cells)

```{r eval=FALSE}
# Exact solver, all other parameters left at their defaults
result <- magic(
  data,
  t = 3,
  solver = "exact"
)
```

### Medium Datasets (1,000-10,000 cells)

```{r eval=FALSE}
# Approximate solver becomes worth considering at this size
result <- magic(
  data,
  t = 3,
  solver = "approximate",
  npca = 100
)
```

### Large Datasets (>10,000 cells)

```{r eval=FALSE}
# Enable parallel processing: a multisession plan with four workers
library(future)
plan(multisession, workers = 4)

# Approximate solver with a reduced number of PCA components
result <- magic(
  data,
  t = 3,
  solver = "approximate",
  npca = 50,
  knn = 5
)
```

### Memory-Constrained Environments

```{r eval=FALSE}
# To keep memory use down:
#   - pass the data as a sparse matrix
#   - lower npca
#   - impute only the genes of interest
result <- magic(
  data,
  genes = important_genes,
  solver = "approximate",
  npca = 30
)
```

## Summary Table

| Dataset Size | Recommended Solver | npca | Expected Time |
|-------------|-------------------|------|---------------|
| <500 cells | exact | 100 | <5 sec |
| 500-2000 cells | exact/approximate | 100 | 5-30 sec |
| 2000-10000 cells | approximate | 50-100 | 30 sec - 5 min |
| >10000 cells | approximate + parallel | 30-50 | 5-30 min |

## Session Info

```{r}
# Print the R version, platform and attached packages used to build this
# vignette, so the timings above can be put in context.
sessionInfo()
```