---
title: "Advanced Usage"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: 
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
    fig_caption: true
vignette: >
  %\VignetteIndexEntry{Advanced Usage}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE
)
```

## Introduction

This vignette covers advanced features of darwin for power users, including custom objective functions, fixed gene count mode, parallel computing, and integration with other tools.

```{r load}
library(darwin)
library(ggplot2)
set.seed(42)
```

## Prepare Data

```{r data}
# Create reference matrix
n_ct <- 5
n_genes <- 400
reference <- matrix(abs(rnorm(n_ct * n_genes, 2, 1)), nrow = n_ct, ncol = n_genes)
rownames(reference) <- paste0("CellType", 1:n_ct)
colnames(reference) <- paste0("Gene", 1:n_genes)

# Add markers
for (i in 1:n_ct) {
  reference[i, ((i-1)*20+1):(i*20)] <- reference[i, ((i-1)*20+1):(i*20)] + 4
}
```

## Custom Objective Functions

darwin allows you to define custom objective functions for specialized applications.

### Requirements

A valid objective function must:

1. Accept a single argument: data matrix (cell types × genes)
2. Return a single numeric value
3. Be consistent (same input → same output)

### Example: Variance Objective

```{r custom-variance}
# Custom objective: maximize total variance
variance_objective <- function(data) {
  sum(apply(data, 2, var))
}

# Test the function
test_data <- reference[, 1:50]
cat("Variance objective value:", variance_objective(test_data), "\n")
```

### Example: Marker Score Objective

```{r custom-marker}
# Custom objective: maximize marker specificity
# Higher when genes are specific to individual cell types
marker_score <- function(data) {
  # Max expression / mean expression ratio
  col_max <- apply(data, 2, max)
  col_mean <- colMeans(data)
  col_mean[col_mean == 0] <- 1e-10
  sum(col_max / col_mean)
}

cat("Marker score value:", marker_score(test_data), "\n")
```

### Using Custom Objectives

```{r use-custom, fig.cap="Pareto front with custom objectives: correlation vs variance."}
dw_custom <- darwin(reference)
dw_custom$optimize(
  ngen = 60,
  objectives = c("correlation", variance_objective),
  weights = c(-1, 1),  # Minimize correlation, maximize variance
  verbose = FALSE,
  parallel = FALSE
)

# The second objective column will show variance values
fitness <- dw_custom$get_fitness()
colnames(fitness) <- c("Correlation", "Variance")

ggplot(as.data.frame(fitness), aes(x = Correlation, y = Variance)) +
  geom_point(color = "#3498db", size = 3, alpha = 0.7) +
  geom_line(color = "gray60", alpha = 0.5) +
  labs(
    title = "Custom Objectives: Correlation vs Variance",
    subtitle = "Minimize correlation, maximize variance",
    x = "Correlation (lower is better)",
    y = "Variance (higher is better)"
  ) +
  theme_minimal(base_size = 12)
```

### Three Objectives

```{r three-obj}
dw_three <- darwin(reference)
dw_three$optimize(
  ngen = 60,
  objectives = c("correlation", "distance", "condition"),
  weights = c(-1, 1, -1),  # Minimize corr/cond, maximize distance
  verbose = FALSE,
  parallel = FALSE
)

fitness3 <- dw_three$get_fitness()
cat("Three-objective optimization:\n")
cat("  Solutions:", nrow(fitness3), "\n")
cat("  Correlation range:", round(range(fitness3[,1]), 2), "\n")
cat("  Distance range:", round(range(fitness3[,2]), 2), "\n")
cat("  Condition range:", round(range(fitness3[,3]), 2), "\n")
```

## Fixed Gene Count Mode

For applications requiring a specific number of marker genes, use fixed mode:

```{r fixed-mode, fig.cap="Fixed mode ensures all solutions have exactly the specified number of genes."}
dw_fixed <- darwin(reference)
dw_fixed$optimize(
  ngen = 60,
  mode = "fixed",
  n_features = 50,  # Select exactly 50 genes
  objectives = c("correlation", "distance"),
  weights = c(-1, 1),
  verbose = FALSE,
  parallel = FALSE
)

# Verify all solutions have 50 genes
pareto <- dw_fixed$get_pareto()
gene_counts <- sapply(pareto, sum)
cat("Gene counts in fixed mode:", unique(gene_counts), "\n")

# Compare gene count distributions
df_compare <- rbind(
  data.frame(
    mode = "Standard",
    n_genes = sapply(dw_custom$get_pareto(), sum)
  ),
  data.frame(
    mode = "Fixed (50)",
    n_genes = gene_counts
  )
)

ggplot(df_compare, aes(x = n_genes, fill = mode)) +
  geom_histogram(bins = 20, alpha = 0.7, position = "identity") +
  scale_fill_manual(values = c("#3498db", "#e74c3c")) +
  labs(
    title = "Gene Count Distribution: Standard vs Fixed Mode",
    x = "Number of Selected Genes",
    y = "Frequency"
  ) +
  theme_minimal(base_size = 12)
```

## Selection Strategies

darwin provides flexible selection from the Pareto front:

### 1. Weighted Selection

```{r select-weighted}
dw <- darwin(reference)
dw$optimize(ngen = 50, verbose = FALSE, parallel = FALSE)

# Emphasize minimizing correlation
dw$select(weights = c(-2, 1))
cat("Emphasize correlation - genes:", sum(dw$get_selection()), "\n")

# Emphasize maximizing distance
dw$select(weights = c(-1, 2))
cat("Emphasize distance - genes:", sum(dw$get_selection()), "\n")
```

### 2. Index-Based Selection

```{r select-index}
# Select by direct index
dw$select(index = 1)
cat("Solution 1 - genes:", sum(dw$get_selection()), "\n")

# Select by objective rank
# (objective_index, rank) - rank 1 = best for that objective
dw$select(index = c(1, 1))  # Best correlation
cat("Best correlation - genes:", sum(dw$get_selection()), "\n")

dw$select(index = c(2, -1))  # Best distance (last rank)
cat("Best distance - genes:", sum(dw$get_selection()), "\n")
```

### 3. Target Value Selection

```{r select-target}
# Select solution closest to target correlation value
fitness <- dw$get_fitness()
target_corr <- median(fitness$correlation)
dw$select(close_to = c(1, target_corr))
cat("Target correlation", round(target_corr, 2), "- genes:", sum(dw$get_selection()), "\n")
```

## Parallel Computing

darwin supports parallel computation for faster optimization:

```{r parallel, eval=FALSE}
# Enable parallel computing
options(darwin.parallel = TRUE)

# Or specify in optimize()
dw$optimize(
  ngen = 100,
  parallel = TRUE,  # Uses all available cores - 1
  verbose = TRUE
)

# Disable parallel computing
options(darwin.parallel = FALSE)
```

### Performance Comparison

```{r benchmark, eval=FALSE}
# Benchmark (example, not run)
library(microbenchmark)

# Large dataset
large_ref <- matrix(rnorm(50 * 2000), nrow = 50)

microbenchmark(
  serial = {
    dw <- darwin(large_ref)
    dw$optimize(ngen = 10, parallel = FALSE, verbose = FALSE)
  },
  parallel = {
    dw <- darwin(large_ref)
    dw$optimize(ngen = 10, parallel = TRUE, verbose = FALSE)
  },
  times = 3
)
```

## Parameter Tuning

### Population Size

Larger populations provide more diversity but slower convergence:

```{r pop-size, fig.cap="Effect of population size on Pareto front diversity."}
results <- list()
for (pop in c(30, 60, 100)) {
  dw_temp <- darwin(reference)
  dw_temp$optimize(
    ngen = 40,
    pop_size = pop,
    verbose = FALSE,
    parallel = FALSE
  )
  results[[as.character(pop)]] <- data.frame(
    dw_temp$get_fitness(),
    pop_size = factor(pop)
  )
}

df_pop <- do.call(rbind, results)

ggplot(df_pop, aes(x = correlation, y = distance, color = pop_size)) +
  geom_point(alpha = 0.6) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Effect of Population Size",
    x = "Correlation",
    y = "Distance",
    color = "Population\nSize"
  ) +
  theme_minimal(base_size = 12)
```

### Mutation Probability

Higher mutation enables more exploration:

```{r mutation, fig.cap="Effect of mutation probability on solution diversity."}
results_mut <- list()
for (mut in c(0.001, 0.01, 0.05)) {
  dw_temp <- darwin(reference)
  dw_temp$optimize(
    ngen = 40,
    mutation_prob = mut,
    verbose = FALSE,
    parallel = FALSE
  )
  
  pareto <- dw_temp$get_pareto()
  gene_counts <- sapply(pareto, sum)
  
  results_mut[[as.character(mut)]] <- data.frame(
    n_genes = gene_counts,
    mutation = factor(mut)
  )
}

df_mut <- do.call(rbind, results_mut)

ggplot(df_mut, aes(x = n_genes, fill = mutation)) +
  geom_density(alpha = 0.5) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Effect of Mutation Probability on Gene Count",
    x = "Number of Selected Genes",
    y = "Density",
    fill = "Mutation\nProbability"
  ) +
  theme_minimal(base_size = 12)
```

## Save and Load

darwin objects can be saved and loaded for reproducibility:

```{r save-load}
# Save
temp_file <- tempfile(fileext = ".rds")
dw$save(temp_file)

# Load
dw_loaded <- readRDS(temp_file)

# Verify
cat("Original Pareto size:", length(dw$get_pareto()), "\n")
cat("Loaded Pareto size:", length(dw_loaded$get_pareto()), "\n")

# Clean up
unlink(temp_file)
```

## Integration Examples

### With Seurat

```{r seurat-integration, eval=FALSE}
library(Seurat)

# From Seurat object with annotations
dw <- darwin(
  seurat_obj,
  celltype_key = "cell_type",
  assay = "RNA",
  layer = "data",
  use_highly_variable = TRUE
)

# Optimize and select
dw$optimize(ngen = 100)
dw$select()

# Get marker genes
markers <- dw$get_genes()
```

### Export to Other Deconvolution Tools

```{r export, eval=FALSE}
# Get selected genes for use with other tools
genes <- dw$get_genes()
selection <- dw$get_selection()

# Create signature matrix for CIBERSORT
signature_matrix <- reference[, selection]
write.csv(signature_matrix, "signature_matrix.csv")

# Or as gene list for MuSiC
gene_list <- genes
```

## Troubleshooting

### Common Issues

1. **Slow optimization**: Reduce `ngen`, `pop_size`, or enable `parallel = TRUE`
2. **Poor convergence**: Increase `ngen` or `pop_size`
3. **All solutions identical**: Increase `mutation_prob`
4. **Memory issues**: Use gene pre-selection with `use_highly_variable = TRUE`

### Diagnostic Checks

```{r diagnostics}
# Check optimization quality
fitness <- dw$get_fitness()

cat("Diagnostic Summary:\n")
cat("  Pareto front size:", nrow(fitness), "\n")
cat("  Correlation range:", round(diff(range(fitness[,1])), 3), "\n")
cat("  Distance range:", round(diff(range(fitness[,2])), 3), "\n")
cat("  Gene count range:", range(sapply(dw$get_pareto(), sum)), "\n")
```

## Session Info

```{r session}
sessionInfo()
```