---
title: "Performance Optimization"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Performance Optimization}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  message = FALSE,
  warning = FALSE
)
```

## Introduction

CytoSPACER is designed for high performance through C++ implementations and parallel processing. This vignette provides guidance on optimizing performance for large datasets.

## Performance Architecture

CytoSPACER uses several strategies for optimal performance:

1. **C++ Backend**: Core algorithms implemented in C++ via Rcpp
2. **Parallel Processing**: Multi-core support via the `future` framework
3. **Chunked Processing**: Memory-efficient handling of large datasets
4. **Sparse Matrix Support**: Efficient handling of sparse data

```{r load-library}
library(CytoSPACER)
```

## Benchmarking

### C++ vs R Implementation

The C++ implementations provide significant speedups:

```{r benchmark-cpp}
# Create test data
set.seed(42)
n_genes <- 500
n_cells <- 1000
n_spots <- 200

sc_data <- matrix(rnorm(n_genes * n_cells), nrow = n_genes, ncol = n_cells)
st_data <- matrix(rnorm(n_genes * n_spots), nrow = n_genes, ncol = n_spots)
rownames(sc_data) <- rownames(st_data) <- paste0("Gene", seq_len(n_genes))
colnames(sc_data) <- paste0("Cell", seq_len(n_cells))
colnames(st_data) <- paste0("Spot", seq_len(n_spots))

# Benchmark correlation computation
time_cpp <- system.time({
  cost_cpp <- compute_cost_matrix(sc_data, st_data, method = "pearson", use_cpp = TRUE)
})

time_r <- system.time({
  cost_r <- compute_cost_matrix(sc_data, st_data, method = "pearson", use_cpp = FALSE)
})

cat("C++ implementation:", round(time_cpp["elapsed"], 3), "seconds\n")
cat("R implementation:", round(time_r["elapsed"], 3), "seconds\n")
cat("Speedup:", round(time_r["elapsed"] / time_cpp["elapsed"], 1), "x\n")
```

### LAP Solver Performance

```{r benchmark-lap}
# Create cost matrices of different sizes
sizes <- c(100, 200, 500)
times <- numeric(length(sizes))

for (i in seq_along(sizes)) {
  n <- sizes[i]
  cost <- matrix(runif(n * n), nrow = n, ncol = n)
  
  times[i] <- system.time({
    assignment <- solve_lap(cost)
  })["elapsed"]
}

# Display results
results_df <- data.frame(
  Size = paste0(sizes, "x", sizes),
  Time_sec = round(times, 4)
)
print(results_df)
```

## Memory Optimization

### Sparse Matrix Handling

CytoSPACER efficiently handles sparse matrices:

```{r sparse-handling}
library(Matrix)

# Create sparse data (typical for scRNA-seq)
sparse_data <- rsparsematrix(1000, 5000, density = 0.1)
rownames(sparse_data) <- paste0("Gene", 1:1000)
colnames(sparse_data) <- paste0("Cell", 1:5000)

# Check memory usage
dense_size <- object.size(as.matrix(sparse_data))
sparse_size <- object.size(sparse_data)

cat("Dense matrix size:", format(dense_size, units = "MB"), "\n")
cat("Sparse matrix size:", format(sparse_size, units = "MB"), "\n")
cat("Memory savings:", round((1 - as.numeric(sparse_size)/as.numeric(dense_size)) * 100), "%\n")
```

### Downsampling

For very large scRNA-seq datasets, downsampling reduces memory and computation:

```{r downsampling}
# Create large count matrix
large_counts <- matrix(rpois(500 * 10000, lambda = 10), nrow = 500, ncol = 10000)
rownames(large_counts) <- paste0("Gene", 1:500)
colnames(large_counts) <- paste0("Cell", 1:10000)

# Downsample to target count
time_downsample <- system.time({
  downsampled <- downsample_expression(large_counts, target_count = 1500, seed = 42)
})

cat("Downsampling time:", round(time_downsample["elapsed"], 3), "seconds\n")
cat("Original total counts:", sum(large_counts[,1]), "\n")
cat("Downsampled total counts:", sum(downsampled[,1]), "\n")
```

## Parallel Processing

### Setting Up Workers

```{r parallel-setup}
library(future)

# Check available cores
n_cores <- parallel::detectCores()
cat("Available cores:", n_cores, "\n")

# Recommended: use n_cores - 1 for parallel processing
n_workers <- max(1, n_cores - 1)
cat("Recommended workers:", n_workers, "\n")
```

### Parallel LAP Solving

```{r parallel-lap}
# Create multiple cost matrices
cost_list <- lapply(1:4, function(i) {
  matrix(runif(200 * 200), nrow = 200, ncol = 200)
})

# Sequential processing
time_seq <- system.time({
  results_seq <- lapply(cost_list, solve_lap)
})

# Parallel processing
time_par <- system.time({
  results_par <- solve_lap_parallel(cost_list, n_workers = 2, progress = FALSE)
})

cat("Sequential time:", round(time_seq["elapsed"], 3), "seconds\n")
cat("Parallel time:", round(time_par["elapsed"], 3), "seconds\n")
```

### Using Future Plans

```{r future-plans, eval=FALSE}
library(future)

# Multisession (recommended for most cases)
plan(multisession, workers = 4)

# Multicore (Unix/macOS only, more efficient)
plan(multicore, workers = 4)

# Sequential (for debugging)
plan(sequential)

# Run CytoSPACER with custom plan
results <- run_cytospace(
  sc_data = sc_data,
  cell_types = cell_types,
  st_data = st_data,
  coordinates = coordinates,
  n_workers = NULL  # Uses current plan
)

# Reset to default
plan(sequential)
```

## Chunked Processing

For very large datasets, CytoSPACER uses chunked processing:

```{r chunked-processing, eval=FALSE}
# Large dataset example
results <- run_cytospace(
  sc_data = sc_data,
  cell_types = cell_types,
  st_data = st_data,
  coordinates = coordinates,
  chunk_size = 5000,  # Process 5000 cells at a time
  n_workers = 4
)
```

### How Chunking Works

1. **Partition**: Cells are divided into chunks of `chunk_size`
2. **Subproblem Creation**: Each chunk creates a subproblem with relevant spots
3. **Parallel Solving**: Subproblems are solved independently
4. **Aggregation**: Results are combined into final assignment

```{r chunking-demo}
# Demonstrate partitioning
n_cells <- 15000
chunk_size <- 5000

n_chunks <- ceiling(n_cells / chunk_size)
cat("Total cells:", n_cells, "\n")
cat("Chunk size:", chunk_size, "\n")
cat("Number of chunks:", n_chunks, "\n")
```

## Optimization Guidelines

### Dataset Size Recommendations

| Dataset Size | Recommended Settings |
|--------------|---------------------|
| < 10,000 cells | Default settings |
| 10,000 - 50,000 cells | `n_workers = 4`, `chunk_size = 10000` |
| 50,000 - 200,000 cells | `n_workers = 8`, `chunk_size = 5000`, `downsample = TRUE` |
| > 200,000 cells | Subsample reference, `n_workers = max`, `chunk_size = 5000` |

### Memory Guidelines

```{r memory-guidelines}
# Estimate memory requirements
estimate_memory <- function(n_cells, n_spots, n_genes) {
  # Cost matrix: n_cells x n_spots (double precision)
  cost_matrix_mb <- (n_cells * n_spots * 8) / (1024^2)
  
  # Expression matrices
  sc_matrix_mb <- (n_genes * n_cells * 8) / (1024^2)
  st_matrix_mb <- (n_genes * n_spots * 8) / (1024^2)
  
  total_mb <- cost_matrix_mb + sc_matrix_mb + st_matrix_mb
  
  cat("Estimated memory requirements:\n")
  cat("  Cost matrix:", round(cost_matrix_mb), "MB\n")
  cat("  scRNA-seq matrix:", round(sc_matrix_mb), "MB\n")
  cat("  ST matrix:", round(st_matrix_mb), "MB\n")
  cat("  Total (approx):", round(total_mb), "MB\n")
}

# Example: 50,000 cells, 5,000 spots, 20,000 genes
estimate_memory(50000, 5000, 20000)
```

### Best Practices

1. **Use sparse matrices** when possible (typical scRNA-seq data is >90% zeros)

2. **Downsample scRNA-seq** to ~1500 counts per cell before analysis

3. **Subset genes** to highly variable genes if memory is limited

4. **Monitor memory** during analysis:
```{r monitor-memory}
# Check current memory usage
gc_info <- gc()
mem_used <- sum(gc_info[, 2])
cat("Current memory used:", round(mem_used), "MB\n")
```

5. **Use appropriate chunk size**:
   - Larger chunks = faster but more memory
   - Smaller chunks = slower but less memory

## Profiling

### Time Profiling

```{r time-profiling}
# Profile individual steps
set.seed(42)
n_cells <- 50
n_spots <- 100

sc_test <- matrix(rpois(100 * n_cells, 10), nrow = 100)
st_test <- matrix(rpois(100 * n_spots, 50), nrow = 100)
rownames(sc_test) <- rownames(st_test) <- paste0("Gene", 1:100)
colnames(sc_test) <- paste0("Cell", 1:n_cells)
colnames(st_test) <- paste0("Spot", 1:n_spots)

# Normalization
time_norm <- system.time({
  sc_norm <- normalize_expression(sc_test)
  st_norm <- normalize_expression(st_test)
})

# Cost matrix
time_cost <- system.time({
  cost <- compute_cost_matrix(sc_norm, st_norm, method = "pearson")
})

# LAP solving (rows must be <= cols)
time_lap <- system.time({
  assignment <- solve_lap(cost)
})

cat("Timing breakdown:\n")
cat("  Normalization:", round(time_norm["elapsed"], 4), "s\n")
cat("  Cost matrix:", round(time_cost["elapsed"], 4), "s\n")
cat("  LAP solving:", round(time_lap["elapsed"], 4), "s\n")
```

## Troubleshooting Performance Issues

### Issue: Out of Memory

```{r oom-solutions, eval=FALSE}
# Solution 1: Reduce chunk size
results <- run_cytospace(..., chunk_size = 2000)

# Solution 2: Downsample more aggressively
results <- run_cytospace(..., downsample_target = 1000)

# Solution 3: Subset to highly variable genes
hvg <- VariableFeatures(seurat_obj)[1:2000]
sc_subset <- sc_data[hvg, ]
st_subset <- st_data[hvg, ]
```

### Issue: Slow Performance

```{r slow-solutions, eval=FALSE}
# Solution 1: Increase workers
results <- run_cytospace(..., n_workers = 8)

# Solution 2: Use larger chunks (if memory allows)
results <- run_cytospace(..., chunk_size = 20000)

# Solution 3: Use C++ implementations (default)
cost <- compute_cost_matrix(..., use_cpp = TRUE)
```

## Session Info

```{r session-info}
sessionInfo()
```