---
title: "SpaGER: Visualization and Analysis"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: 
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
vignette: >
  %\VignetteIndexEntry{SpaGER: Visualization and Analysis}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  message = FALSE,
  warning = FALSE
)
```

## Introduction

This vignette demonstrates visualization techniques for SpaGER predictions and analyses, helping you interpret and present your results effectively.
  
```{r load}
library(SpaGER)
set.seed(42)
```

## Simulated Dataset

We create a more realistic simulated dataset with biological structure:

```{r simulate}
# Parameters
n_rna <- 500
n_spatial <- 200
n_shared <- 80
n_predict <- 40

# Create cell type structure (3 cell types)
cell_types <- rep(1:3, length.out = n_rna)
type_means <- c(3, 5, 7)

# scRNA-seq with cell type-specific expression
rna_base <- matrix(0, nrow = n_rna, ncol = n_shared + n_predict)
for (i in 1:n_rna) {
  rna_base[i, ] <- rnorm(n_shared + n_predict, mean = type_means[cell_types[i]], sd = 1)
}
rna_base <- abs(rna_base)
colnames(rna_base) <- c(paste0("Shared", 1:n_shared), paste0("Novel", 1:n_predict))

# Spatial data - mixture of cell types
spatial_types <- sample(1:3, n_spatial, replace = TRUE, prob = c(0.5, 0.3, 0.2))
spatial_base <- matrix(0, nrow = n_spatial, ncol = n_shared)
for (i in 1:n_spatial) {
  spatial_base[i, ] <- rnorm(n_shared, mean = type_means[spatial_types[i]], sd = 1.2)
}
spatial_base <- abs(spatial_base)
colnames(spatial_base) <- paste0("Shared", 1:n_shared)

# Add spatial coordinates
coords <- data.frame(
  x = runif(n_spatial, 0, 100),
  y = runif(n_spatial, 0, 100)
)
rownames(coords) <- paste0("Spot", 1:n_spatial)
rownames(spatial_base) <- rownames(coords)

cat("Data created:\n")
cat("  scRNA-seq:", n_rna, "cells,", ncol(rna_base), "genes\n")
cat("  Spatial:", n_spatial, "spots,", ncol(spatial_base), "genes\n")
```

## Run SpaGE Prediction

```{r predict}
predicted <- SpaGE(
  spatial_data = as.data.frame(spatial_base),
  rna_data = as.data.frame(rna_base),
  n_pv = 30,
  n_neighbors = 50,
  verbose = FALSE
)

cat("Predicted", ncol(predicted), "novel genes\n")
```

## Visualizing Principal Vector Selection

```{r pv_vis, fig.width=8, fig.height=4}
# Get PV information
similarities <- attr(predicted, "similarities")
n_pv_used <- attr(predicted, "n_pv_used")

par(mfrow = c(1, 2))

# Bar plot of similarities
cols <- ifelse(seq_along(similarities) <= n_pv_used, "#3498db", "#bdc3c7")
barplot(similarities, 
        names.arg = seq_along(similarities),
        col = cols,
        border = "white",
        main = "Principal Vector Selection",
        xlab = "PV Index",
        ylab = "Cosine Similarity",
        ylim = c(0, 1))
abline(h = 0.3, col = "#e74c3c", lty = 2, lwd = 2)
legend("topright", 
       legend = c("Selected", "Rejected", "Threshold (0.3)"),
       fill = c("#3498db", "#bdc3c7", NA),
       border = c("white", "white", NA),
       lty = c(NA, NA, 2),
       col = c(NA, NA, "#e74c3c"))

# Cumulative variance
cum_var <- cumsum(similarities^2) / sum(similarities^2)
plot(cum_var, type = "b", pch = 19, col = "#2ecc71",
     main = "Cumulative Variance Captured",
     xlab = "Number of PVs",
     ylab = "Cumulative Proportion",
     xlim = c(1, length(similarities)))
abline(v = n_pv_used, col = "#e74c3c", lty = 2)
abline(h = cum_var[n_pv_used], col = "#e74c3c", lty = 2)
text(n_pv_used + 1, cum_var[n_pv_used], 
     sprintf("%.1f%%", cum_var[n_pv_used] * 100), pos = 4)

par(mfrow = c(1, 1))
```

## Spatial Expression Patterns

```{r spatial_vis, fig.width=8, fig.height=6}
# Function to plot spatial expression
plot_spatial <- function(coords, values, title, col_palette = NULL) {
  if (is.null(col_palette)) {
    col_palette <- colorRampPalette(c("#313695", "#4575b4", "#74add1", 
                                       "#abd9e9", "#ffffbf", "#fee090",
                                       "#fdae61", "#f46d43", "#d73027"))(100)
  }
  
  # Normalize values to [0, 1]
  val_norm <- (values - min(values)) / (max(values) - min(values) + 1e-10)
  cols <- col_palette[ceiling(val_norm * 99) + 1]
  
  plot(coords$x, coords$y, pch = 19, cex = 1.5,
       col = cols, xlab = "X", ylab = "Y", main = title,
       asp = 1)
}

# Plot predicted genes
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))

for (i in 1:4) {
  gene <- colnames(predicted)[i]
  plot_spatial(coords, predicted[, gene], paste("Predicted:", gene))
}

par(mfrow = c(1, 1))
```

## Cross-Validation Results

```{r cv_analysis, fig.width=8, fig.height=5}
# Run CV
cv_results <- SpaGE_cv(
  spatial_data = as.data.frame(spatial_base),
  rna_data = as.data.frame(rna_base[, paste0("Shared", 1:n_shared)]),
  n_pv = 20,
  genes = paste0("Shared", 1:20),
  verbose = FALSE
)

par(mfrow = c(1, 2))

# Histogram
hist(cv_results$correlation, breaks = 15, col = "#3498db", border = "white",
     main = "Cross-Validation Performance",
     xlab = "Spearman Correlation",
     xlim = c(-0.2, 1))
abline(v = median(cv_results$correlation), col = "#e74c3c", lwd = 2, lty = 2)
abline(v = mean(cv_results$correlation), col = "#2ecc71", lwd = 2, lty = 2)
legend("topleft", 
       legend = c(paste("Median:", round(median(cv_results$correlation), 3)),
                  paste("Mean:", round(mean(cv_results$correlation), 3))),
       col = c("#e74c3c", "#2ecc71"), lty = 2, lwd = 2)

# Per-gene plot
cv_ordered <- cv_results[order(cv_results$correlation, decreasing = TRUE), ]
barplot(cv_ordered$correlation, 
        names.arg = substr(cv_ordered$gene, 7, 10),
        las = 2,
        col = ifelse(cv_ordered$correlation > 0.3, "#2ecc71", "#e74c3c"),
        border = "white",
        main = "Per-Gene Correlation",
        ylab = "Spearman Correlation")
abline(h = 0.3, lty = 2)

par(mfrow = c(1, 1))
```

## Measured vs Predicted Scatter Plots

```{r scatter, fig.width=8, fig.height=4}
# For CV genes, compare measured vs predicted
par(mfrow = c(1, 3))

for (i in 1:3) {
  gene <- cv_results$gene[i]
  
  # Get measured values
  measured <- spatial_base[, gene]
  
  # Run prediction for this gene
  spatial_minus <- spatial_base[, setdiff(colnames(spatial_base), gene)]
  pred_single <- SpaGE(
    spatial_data = as.data.frame(spatial_minus),
    rna_data = as.data.frame(rna_base[, paste0("Shared", 1:n_shared)]),
    n_pv = 20,
    genes_to_predict = gene,
    verbose = FALSE
  )
  
  predicted_vals <- pred_single[, gene]
  cor_val <- cor(measured, predicted_vals, method = "spearman")
  
  plot(measured, predicted_vals, pch = 19, col = adjustcolor("#3498db", 0.5),
       xlab = "Measured", ylab = "Predicted",
       main = sprintf("%s (r = %.3f)", gene, cor_val))
  abline(lm(predicted_vals ~ measured), col = "#e74c3c", lwd = 2)
  
  # Add identity line
  abline(0, 1, lty = 2, col = "gray50")
}

par(mfrow = c(1, 1))
```

## Expression Distribution Comparison

```{r distribution, fig.width=8, fig.height=4}
par(mfrow = c(1, 2))

# Density plot for predicted genes
gene1 <- colnames(predicted)[1]
gene2 <- colnames(predicted)[2]

# Use density estimation
d1 <- density(predicted[, gene1])
d2 <- density(predicted[, gene2])

plot(d1, col = "#3498db", lwd = 2, 
     main = "Predicted Expression Distribution",
     xlab = "Expression Level",
     xlim = range(c(d1$x, d2$x)),
     ylim = range(c(d1$y, d2$y)))
lines(d2, col = "#e74c3c", lwd = 2)
legend("topright", legend = c(gene1, gene2), 
       col = c("#3498db", "#e74c3c"), lwd = 2)

# Box plot
boxplot(predicted[, 1:10], 
        col = "#3498db",
        border = "#2c3e50",
        main = "Predicted Expression (First 10 Genes)",
        xlab = "Gene",
        ylab = "Expression",
        las = 2,
        names = substr(colnames(predicted)[1:10], 1, 8))

par(mfrow = c(1, 1))
```

## Correlation Heatmap

```{r heatmap, fig.width=7, fig.height=6}
# Compute correlation matrix for predicted genes
pred_subset <- predicted[, 1:15]
cor_mat <- cor(pred_subset, method = "spearman")

# Simple heatmap
heatmap(cor_mat, 
        col = colorRampPalette(c("#3498db", "white", "#e74c3c"))(50),
        scale = "none",
        main = "Gene-Gene Correlation (Predicted)",
        margins = c(8, 8))
```

## Summary Statistics

```{r summary}
cat("=== SpaGE Prediction Summary ===\n\n")

cat("Input Data:\n")
cat(sprintf("  Spatial spots: %d\n", nrow(spatial_base)))
cat(sprintf("  scRNA-seq cells: %d\n", nrow(rna_base)))
cat(sprintf("  Shared genes: %d\n", n_shared))
cat(sprintf("  Novel genes predicted: %d\n", ncol(predicted)))

cat("\nPrincipal Vectors:\n")
cat(sprintf("  Requested: %d\n", attr(predicted, "n_pv")))
cat(sprintf("  Used (sim > 0.3): %d\n", attr(predicted, "n_pv_used")))
cat(sprintf("  Mean similarity: %.3f\n", mean(attr(predicted, "similarities"))))

cat("\nCross-Validation:\n")
cat(sprintf("  Genes tested: %d\n", nrow(cv_results)))
cat(sprintf("  Mean correlation: %.3f\n", mean(cv_results$correlation)))
cat(sprintf("  Median correlation: %.3f\n", median(cv_results$correlation)))
cat(sprintf("  Genes with r > 0.3: %d (%.1f%%)\n", 
            sum(cv_results$correlation > 0.3),
            100 * mean(cv_results$correlation > 0.3)))

cat("\nPredicted Expression:\n")
cat(sprintf("  Mean: %.3f\n", mean(as.matrix(predicted))))
cat(sprintf("  SD: %.3f\n", sd(as.matrix(predicted))))
cat(sprintf("  Range: [%.3f, %.3f]\n", min(predicted), max(predicted)))
```

## Exporting Results

```{r export, eval=FALSE}
# Export predicted expression
write.csv(predicted, "spage_predictions.csv")

# Export CV results
write.csv(cv_results, "spage_cv_results.csv")

# Export for visualization in other tools
saveRDS(list(
  predictions = predicted,
  coordinates = coords,
  cv_results = cv_results,
  metadata = list(
    n_pv = attr(predicted, "n_pv"),
    n_pv_used = attr(predicted, "n_pv_used"),
    similarities = attr(predicted, "similarities")
  )
), "spage_results.rds")
```

## Session Information

```{r session}
sessionInfo()
```