---
title: "Visualization Guide"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: 
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
    fig_caption: true
vignette: >
  %\VignetteIndexEntry{Visualization Guide}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Global knitr chunk defaults applied to the chunks of this vignette.
knitr::opts_chunk$set(
  collapse = TRUE,   # show source and its output in one block
  comment = "#>",    # prefix for printed output lines
  fig.width = 7,     # default figure size, in inches
  fig.height = 5,
  warning = FALSE,   # keep warnings out of the rendered document
  message = FALSE    # keep messages out of the rendered document
)
```

## Introduction

This vignette demonstrates various visualization techniques for analyzing darwin optimization results. Effective visualization is crucial for understanding the trade-offs in multi-objective optimization and selecting appropriate solutions.

## Setup

```{r load-packages}
# darwin supplies the optimizer object used throughout this vignette;
# ggplot2 draws the custom figures.
library(darwin)
library(ggplot2)

# Fix the RNG seed so the simulated reference matrix below is reproducible
set.seed(42)
```

## Prepare Example Data

```{r create-data}
# Create reference expression matrix: cell types in rows, genes in columns
n_celltypes <- 6
n_genes <- 300
n_markers <- 15  # marker genes per cell type

# Every cell type needs its own, non-overlapping run of marker columns
stopifnot(n_celltypes * n_markers <= n_genes)

# Baseline expression: absolute values of normal draws centred on 2
reference <- matrix(
  abs(rnorm(n_celltypes * n_genes, mean = 2)),
  nrow = n_celltypes,
  ncol = n_genes
)
rownames(reference) <- c("B_cells", "T_cells", "NK_cells", "Monocytes", "Dendritic", "Neutrophils")
colnames(reference) <- paste0("Gene", seq_len(n_genes))

# Add cell-type specific markers: cell type i gets a boost of 3 to 6 on its
# own consecutive block of n_markers genes (genes 1-15 for the first cell
# type, 16-30 for the second, and so on)
for (i in seq_len(n_celltypes)) {
  markers <- (i - 1) * n_markers + seq_len(n_markers)
  reference[i, markers] <- reference[i, markers] + runif(n_markers, 3, 6)
}

# Run optimization on the simulated reference matrix.
# One weight is supplied per objective: the negative weight lines up with
# "correlation" and the positive one with "distance" -- presumably marking
# minimise vs. maximise (TODO confirm against the darwin documentation).
dw <- darwin(reference)
dw$optimize(
  ngen = 80,
  pop_size = 80,
  objectives = c("correlation", "distance"),
  weights = c(-1, 1),
  verbose = FALSE,
  parallel = FALSE
)
```

## Pareto Front Visualization

### Basic Pareto Plot

```{r pareto-basic, fig.cap="Basic Pareto front visualization showing the trade-off between objectives."}
# Plot the optimization result using the plot method's default arguments
dw$plot()
```

### Customized Pareto Plot

```{r pareto-custom, fig.cap="Customized Pareto front with a different highlighted solution."}
# Highlight solution with best distance.
# point_size / highlight_size: presumably the marker sizes of the regular and
# the highlighted solutions -- confirm in the plot method's documentation.
dw$plot(
  index = c(2, -1),  # Objective 2, last rank (highest distance)
  point_size = 4,
  highlight_size = 7
)
```

### Manual Pareto Plot with ggplot2

```{r pareto-ggplot, fig.cap="Fully customized Pareto front visualization."}
# Get fitness data (one row of objective values per solution) and the
# Pareto set (assumed to hold one gene-selection vector per solution)
fitness <- dw$get_fitness()
pareto <- dw$get_pareto()

# Number of selected genes per solution. vapply() always returns a numeric
# vector, whereas sapply() changes its return type on empty or ragged input.
n_genes_per_solution <- vapply(pareto, sum, numeric(1))

# Create data frame; seq_len() stays empty for zero rows, unlike 1:nrow()
df <- data.frame(
  correlation = fitness$correlation,
  distance = fitness$distance,
  n_genes = n_genes_per_solution,
  solution_id = seq_len(nrow(fitness))
)

# Custom plot: point size and colour both encode the gene count, and a
# gray line joins the solutions to trace the front
ggplot(df, aes(x = correlation, y = distance)) +
  geom_point(aes(size = n_genes, color = n_genes), alpha = 0.7) +
  geom_line(color = "gray50", alpha = 0.5) +
  scale_color_viridis_c(option = "plasma") +
  scale_size_continuous(range = c(2, 8)) +
  labs(
    title = "Pareto Front Analysis",
    subtitle = paste(nrow(df), "Pareto-optimal solutions"),
    x = "Correlation (lower is better)",
    y = "Distance (higher is better)",
    color = "Number of\nGenes",
    size = "Number of\nGenes"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "right"
  )
```

## Gene Selection Analysis

### Gene Count Distribution

```{r gene-count, fig.cap="Distribution of selected gene counts across Pareto-optimal solutions."}
# Histogram of how many genes each Pareto-optimal solution selects, with the
# median gene count marked by a dashed line and a text label.
median_genes <- median(n_genes_per_solution)
gene_counts <- data.frame(n_genes = n_genes_per_solution)

# Both median annotations, kept together so they are added as one unit
median_marker <- list(
  geom_vline(
    xintercept = median_genes,
    color = "#e74c3c", linetype = "dashed", linewidth = 1
  ),
  # y = Inf with vjust pins the label just below the top of the panel
  annotate(
    "text",
    x = median_genes + 5, y = Inf,
    label = paste("Median:", median_genes),
    vjust = 2, color = "#e74c3c"
  )
)

ggplot(gene_counts, aes(x = n_genes)) +
  geom_histogram(bins = 20, fill = "#3498db", color = "white", alpha = 0.8) +
  median_marker +
  labs(
    title = "Distribution of Selected Gene Counts",
    subtitle = "Across all Pareto-optimal solutions",
    x = "Number of Selected Genes",
    y = "Frequency"
  ) +
  theme_minimal(base_size = 12)
```

### Fitness vs Gene Count

```{r fitness-vs-genes, fig.cap="Relationship between number of genes and objective values."}
library(ggplot2)

# Pair the per-solution gene counts with one objective's values and a label
objective_frame <- function(values, label) {
  data.frame(
    n_genes = n_genes_per_solution,
    value = values,
    objective = label
  )
}

# Long format for faceting: one stacked block of rows per objective
df_long <- rbind(
  objective_frame(fitness$correlation, "Correlation"),
  objective_frame(fitness$distance, "Distance")
)

# One panel per objective, each with its own y scale and a loess trend
ggplot(df_long, aes(x = n_genes, y = value)) +
  geom_point(color = "#3498db", alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE, color = "#e74c3c") +
  facet_wrap(~objective, scales = "free_y") +
  labs(
    title = "Objectives vs Number of Selected Genes",
    x = "Number of Selected Genes",
    y = "Objective Value"
  ) +
  theme_minimal(base_size = 12)
```

## Expression Profile Visualization

### Heatmap of Selected Genes

```{r heatmap, fig.cap="Expression heatmap of selected marker genes."}
# Select a solution and subset the reference to its genes.
# drop = FALSE keeps a matrix even when only one gene is selected, so the
# column-wise steps below keep working.
dw$select(weights = c(-1, 1))
selection <- dw$get_selection()
selected_data <- reference[, selection, drop = FALSE]

# For visualization, show top 50 most variable genes.
# head() simply returns everything when fewer than 50 genes are selected.
gene_vars <- apply(selected_data, 2, var)
top_genes <- head(names(sort(gene_vars, decreasing = TRUE)), 50)
plot_data <- selected_data[, top_genes, drop = FALSE]

# Scale for visualization: scale() standardises columns, so transposing first
# z-scores each cell type (a row of plot_data) across the displayed genes
plot_data_scaled <- t(scale(t(plot_data)))

# Convert to long format. expand.grid() varies CellType fastest, matching the
# column-major order in which as.vector() unrolls the matrix.
df_heat <- expand.grid(
  CellType = rownames(plot_data_scaled),
  Gene = colnames(plot_data_scaled)
)
df_heat$Expression <- as.vector(plot_data_scaled)

# Values outside [-3, 3] are squished to the nearest limit instead of being
# dropped from the colour scale
ggplot(df_heat, aes(x = Gene, y = CellType, fill = Expression)) +
  geom_tile() +
  scale_fill_gradient2(low = "#3498db", mid = "white", high = "#e74c3c", 
                       midpoint = 0, limits = c(-3, 3), oob = scales::squish) +
  labs(
    title = "Expression Heatmap of Selected Genes",
    subtitle = paste("Top", length(top_genes), "most variable genes"),
    x = "Genes",
    y = "Cell Type",
    fill = "Scaled\nExpression"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
```

### Cell Type Similarity

```{r similarity, fig.cap="Cell type similarity based on selected genes."}
# Pairwise correlation between cell types, computed across the selected genes
# (cor() correlates columns, hence the transpose)
celltype_cor <- cor(t(selected_data))

# Long format: one row per (CT1, CT2) pair, in the same column-major order in
# which the matrix is unrolled
cor_long <- expand.grid(
  CT1 = rownames(celltype_cor),
  CT2 = colnames(celltype_cor)
)
cor_long[["Correlation"]] <- as.vector(celltype_cor)

# Labelled tile plot with square cells and a diverging scale centred on zero
ggplot(cor_long, aes(x = CT1, y = CT2, fill = Correlation)) +
  geom_tile() +
  geom_text(aes(label = round(Correlation, 2)), size = 3) +
  scale_fill_gradient2(
    low = "#3498db", mid = "white", high = "#e74c3c",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(
    title = "Cell Type Correlation Matrix",
    subtitle = "Based on selected marker genes",
    x = "", y = ""
  ) +
  theme_minimal(base_size = 12) +
  coord_fixed() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

## Solution Comparison

### Compare Different Selection Methods

```{r compare-methods, fig.cap="Comparing solutions from different selection methods."}
# Get multiple solutions. Each selection is read back right after select()
# is called, before the next call picks a different solution.
dw$select(weights = c(-1, 1))
sol_weighted <- dw$get_selection()

dw$select(index = 1)
sol_first <- dw$get_selection()

dw$select(index = c(1, 1))  # Best correlation
sol_best_corr <- dw$get_selection()

dw$select(index = c(2, -1))  # Best distance
sol_best_dist <- dw$get_selection()

# Selections keyed by the label shown in the comparison table
selections <- list(
  "Weighted" = sol_weighted,
  "First" = sol_first,
  "Best Correlation" = sol_best_corr,
  "Best Distance" = sol_best_dist
)

# Apply a metric to the reference restricted to one selection.
# drop = FALSE keeps a matrix even if a method selects a single gene.
score_selection <- function(selection, metric) {
  metric(reference[, selection, drop = FALSE])
}

# Compare. USE.NAMES = FALSE keeps the list names from turning into
# data-frame row names, which would show up as an extra table column.
comparison <- data.frame(
  Method = names(selections),
  N_Genes = vapply(selections, sum, numeric(1), USE.NAMES = FALSE),
  Correlation = vapply(
    selections, score_selection, numeric(1),
    metric = compute_correlation, USE.NAMES = FALSE
  ),
  Distance = vapply(
    selections, score_selection, numeric(1),
    metric = compute_distance, USE.NAMES = FALSE
  )
)

knitr::kable(comparison, digits = 2, caption = "Comparison of different selection methods")
```

```{r compare-plot, fig.cap="Visual comparison of selection methods on the Pareto front."}
# Objective values and label of each compared selection method
df_comp <- comparison[, c("Correlation", "Distance", "Method")]

# The full Pareto front is drawn in muted gray as a backdrop; the compared
# solutions are overlaid as larger points coloured by method
backdrop_gray <- "gray70"

ggplot(df, aes(x = correlation, y = distance)) +
  geom_point(color = backdrop_gray, alpha = 0.5, size = 2) +
  geom_line(color = backdrop_gray, alpha = 0.5) +
  geom_point(
    data = df_comp,
    mapping = aes(x = Correlation, y = Distance, color = Method),
    size = 5
  ) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Solution Comparison on Pareto Front",
    x = "Correlation",
    y = "Distance"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")
```

## Advanced: 3D Pareto Front

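With three objectives, the Pareto front can be explored in 3D, for example with an interactive plotly scatter plot. The chunk below is shown for illustration only and is not evaluated when the vignette is built: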

```{r 3d-example, eval=FALSE}
# Example with 3 objectives (not run: this chunk is not evaluated when the
# vignette is built)
dw3 <- darwin(reference)
dw3$optimize(
  ngen = 50,
  objectives = c("correlation", "distance", "condition"),
  weights = c(-1, 1, -1),  # one weight per objective, in the same order
  verbose = FALSE
)

# Would use plotly for 3D visualization; left commented out so the vignette
# does not depend on plotly. Each objective is mapped to one axis.
# library(plotly)
# fitness3 <- dw3$get_fitness()
# plot_ly(fitness3, x = ~correlation, y = ~distance, z = ~condition,
#         type = "scatter3d", mode = "markers")
```

## Summary

Effective visualization of darwin results helps in:

1. **Understanding trade-offs** between objectives
2. **Comparing** different selection strategies
3. **Validating** that selected genes provide good cell type separation
4. **Communicating** results to collaborators

## Session Info

```{r session}
# Record the R version and the attached/loaded packages used for this build
sessionInfo()
```