---
title: "Quick Start Guide"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output: 
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 2
vignette: >
  %\VignetteIndexEntry{Quick Start Guide}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Default chunk options for every chunk that follows this one
knitr::opts_chunk$set(
  collapse = TRUE,   # show source and printed output in one block
  comment = "#>",    # prefix for printed output lines
  fig.width = 7,     # default figure width (inches)
  fig.height = 5,    # default figure height (inches)
  warning = FALSE,   # do not show warnings in the rendered document
  message = FALSE,   # do not show messages in the rendered document
  eval = FALSE       # later chunks are displayed but not run unless they override this
)
# NOTE(review): with eval = FALSE as the default, no later chunk draws random
# numbers at build time, so this seed looks redundant - confirm before removing
set.seed(42)
```

## Introduction

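This vignette demonstrates the core functionality of **scClustEval** with examples based on simulated data. The code chunks are shown but not evaluated when the vignette is built (`eval = FALSE` in the setup chunk), so run them in your own R session to reproduce the output.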

## Installation

```{r install, eval=FALSE}
# Use ONE of the two options below.

# From R-universe (recommended)
install.packages("scClustEval", repos = "https://zaoqu-liu.r-universe.dev")

# From GitHub (requires the 'remotes' package to be installed first)
remotes::install_github("Zaoqu-Liu/scClustEval")
```

## Loading the Package

```{r load}
library(scClustEval)
library(ggplot2)
```

## Creating Example Data

Let's create a synthetic dataset with known cluster structure to demonstrate the assessment workflow.

```{r create_data}
# Simulate a single-cell-like expression matrix with a known cluster structure.
# Produces `X` (cells x features, numeric) and `labels` (one cluster name per
# row of `X`); clusters are equally sized and occupy contiguous row ranges.
set.seed(42)  # fixed seed so the simulated values are reproducible
n_cells <- 600
n_features <- 50
n_clusters <- 4

# Clusters are equally sized, so the cell count must divide evenly; otherwise
# the row ranges computed below would be fractional.
stopifnot(n_cells %% n_clusters == 0)
cells_per_cluster <- n_cells %/% n_clusters

# Preallocate as double; every entry is filled in by the loop below
X <- matrix(NA_real_, nrow = n_cells, ncol = n_features)
labels <- rep(paste0("Cluster_", seq_len(n_clusters)), each = cells_per_cluster)

for (i in seq_len(n_clusters)) {
  rows <- (i - 1) * cells_per_cluster + seq_len(cells_per_cluster)

  # Each cluster has its own per-gene mean profile, centred on 2 * i
  cluster_mean <- rnorm(n_features, mean = i * 2, sd = 0.5)

  # Unit-variance noise, one row per cell (filled row by row)
  noise <- matrix(
    rnorm(cells_per_cluster * n_features, sd = 1),
    nrow = cells_per_cluster,
    byrow = TRUE
  )

  # Add the gene-wise means to every cell (column j gets cluster_mean[j])
  X[rows, ] <- sweep(noise, 2, cluster_mean, "+")
}

colnames(X) <- paste0("Gene_", seq_len(n_features))

cat("Data dimensions:", nrow(X), "cells x", ncol(X), "features\n")
cat("Clusters:", unique(labels), "\n")
```

## Basic Assessment

### Running Self-Projection

```{r assessment}
# Run the clustering assessment on the simulated data.
# The per-argument notes are this vignette's descriptions of each setting.
result <- sc_assessment(
  X = X,
  labels = labels,
  classifier = "LR",    # Logistic Regression
  penalty = "l1",       # L1 regularization (Lasso)
  test_size = 0.5,      # 50% for testing
  n_per_class = 100,    # Max 100 cells per cluster in training
  cv = 5,               # 5-fold cross-validation
  seed = 42,
  verbose = TRUE
)

# Print the summary of the returned assessment object
print(result)
```

### Understanding Results

```{r results_explain}
# Overall performance: accuracies as percentages, confusion maxima to 4 decimals
cat("\n=== Key Metrics ===\n")
test_acc <- sprintf("%.1f%%", result$accuracy * 100)
cat("Test Accuracy:", test_acc, "\n")
cv_acc <- sprintf("%.1f%%", result$cv_accuracy * 100)
cat("CV Accuracy:", cv_acc, "\n")
cat("Max R1 Confusion:", sprintf("%.4f", result$max_r1), "\n")
cat("Max R2 Confusion:", sprintf("%.4f", result$max_r2), "\n")

# Accuracy broken down by cluster, one output line per cluster name
cat("\n=== Per-Cluster Accuracy ===\n")
per_class <- result$per_class_accuracy
class_names <- names(per_class)
cat(
  sprintf("  %s: %.1f%%\n", class_names, per_class[class_names] * 100),
  sep = ""
)
```

## Visualization

### ROC Curves

```{r roc_curves, fig.height=5, fig.width=10}
# Plot ROC and Precision-Recall curves for the assessment result
# (plot_type = "both" requests the two curve types together;
# show_auc = TRUE presumably annotates the curves with their AUC - TODO confirm)
plot_roc(result, plot_type = "both", show_auc = TRUE)
```

### Confusion Matrix Heatmaps

```{r confusion_heatmaps, fig.height=5, fig.width=12}
# gridExtra supplies grid.arrange(), used at the end to lay out the panels
library(gridExtra)

# Same assessment result drawn three ways; only `normalized` and `title` differ

# Raw confusion matrix
p1 <- plot_confusion_heatmap(result, normalized = "raw", title = "Raw Counts")

# R1-normalized
p2 <- plot_confusion_heatmap(result, normalized = "R1", title = "R1 Normalized")

# R2-normalized
p3 <- plot_confusion_heatmap(result, normalized = "R2", title = "R2 Normalized")

# Show the three heatmaps side by side in a single row
grid.arrange(p1, p2, p3, ncol = 3)
```

## Simulating Over-Clustering

Now let's create an over-clustered scenario to demonstrate optimization.

```{r overclustering}
# Simulate over-clustering: split each of two true clusters into artificial
# "a" / "b" halves and leave the remaining clusters untouched.
labels_over <- labels

for (cl in c("Cluster_1", "Cluster_2")) {
  members <- which(labels == cl)

  # The first half (rounded down) becomes "<cl>a", the remainder "<cl>b".
  # Deriving the split from the actual cluster size keeps this correct if
  # n_cells or n_clusters is changed in the data-generation chunk.
  first_half <- members[seq_len(length(members) %/% 2)]
  labels_over[first_half] <- paste0(cl, "a")
  labels_over[setdiff(members, first_half)] <- paste0(cl, "b")
}

cat("Over-clustered labels:", unique(labels_over), "\n")
cat("Number of clusters:", length(unique(labels_over)), "\n")
```

### Assessment of Over-Clustered Data

```{r assess_over}
# Assess the over-clustered labels on the same expression matrix.
# Unlike the first assessment, `penalty` and `test_size` are not passed here,
# so whatever defaults sc_assessment() defines apply; `n_per_class` is 50
# instead of 100.
result_over <- sc_assessment(
  X = X,
  labels = labels_over,
  classifier = "LR",
  n_per_class = 50,
  cv = 5,
  seed = 42,
  verbose = TRUE
)

# Report the accuracy and the largest R1 confusion value
cat("\nOver-clustering accuracy:", sprintf("%.1f%%", result_over$accuracy * 100), "\n")
cat("Max R1 (indicates confusion):", sprintf("%.4f", result_over$max_r1), "\n")
```

```{r confusion_over, fig.height=5}
# Show confusion between artificial splits: R1-normalized heatmap of the
# over-clustered assessment (the "a"/"b" halves are the pairs to look at)
plot_confusion_heatmap(result_over, normalized = "R1", 
                       title = "R1 Confusion (Over-clustered)")
```

## Single Optimization Round

```{r optimize_single}
# Run a single optimization round on the over-clustered labels.
# The per-argument notes are this vignette's descriptions of each setting.
optim_round <- sc_optimize(
  X = X,
  labels = labels_over,
  classifier = "LR",
  n_iter = 3,          # 3 iterations for confusion matrix
  r1_cutoff = 0.1,     # Merge if R1 > 0.1
  r2_cutoff = 0.05,    # Or if R2 > 0.05
  seed = 42,
  verbose = TRUE
)

# Cluster counts before/after the round and the accuracy stored in the result
cat("\nClusters before:", optim_round$n_clusters_before, "\n")
cat("Clusters after:", optim_round$n_clusters_after, "\n")
cat("Accuracy:", sprintf("%.1f%%", optim_round$accuracy * 100), "\n")
```

## Full Optimization Pipeline

```{r optimize_full}
# Run the full optimization, starting again from the over-clustered labels.
# Note the R1 cutoff here (0.5) is higher than in the single round (0.1).
optim_result <- sc_optimize_all(
  X = X,
  labels = labels_over,
  min_accuracy = 0.90,   # Target 90% accuracy
  max_rounds = 10,       # upper bound on the number of rounds
  classifier = "LR",
  r1_cutoff = 0.5,       # Start with high cutoff
  r2_cutoff = 0.05,
  seed = 42,
  verbose = TRUE
)

# Print the summary of the optimization result
print(optim_result)
```

### Optimization History

```{r optim_history, fig.height=5}
# Plot optimization progress from the full optimization result
# (metric = "both" presumably shows accuracy and cluster count - TODO confirm)
plot_optimization_history(optim_result, metric = "both")
```

### Compare Before and After

```{r compare_results}
# Before/after summary: cluster counts and the final accuracy
cat("\n=== Optimization Summary ===\n")
cat("Initial clusters:", length(unique(labels_over)), "\n")
cat("Final clusters:", length(unique(optim_result$final_labels)), "\n")
cat("Final accuracy:", sprintf("%.1f%%", optim_result$final_accuracy * 100), "\n")

# Number of cells assigned to each final cluster
cat("\n=== Final Cluster Sizes ===\n")
print(table(optim_result$final_labels))
```

### Sankey Diagram

```{r sankey, fig.height=6, eval=FALSE}
# Visualize cluster reassignment from the over-clustered labels to the final ones.
# Guarded: the plot is only drawn when the optional 'ggalluvial' package is
# installed; otherwise this chunk does nothing.
if (requireNamespace("ggalluvial", quietly = TRUE)) {
  plot_cluster_sankey(
    labels_from = labels_over,
    # final labels are converted to character to match labels_over
    labels_to = as.character(optim_result$final_labels),
    title = "Cluster Optimization Flow"
  )
}
```

## Using Different Classifiers

```{r classifiers_compare}
# List available classifiers (values such as "LR" and "RF" are passed as the
# `classifier` argument elsewhere in this vignette)
get_available_classifiers()
```

```{r rf_example}
# Try Random Forest on the original (not over-clustered) labels
result_rf <- sc_assessment(
  X = X,
  labels = labels,
  classifier = "RF",
  n_per_class = 100,
  cv = 0,  # Skip CV for speed
  seed = 42,
  verbose = FALSE
)

# Report the accuracy stored in the Random Forest result
cat("Random Forest accuracy:", sprintf("%.1f%%", result_rf$accuracy * 100), "\n")
```

## Summary

This quick start guide demonstrated:

1. **Creating test data** with known cluster structure
2. **Running assessment** with `sc_assessment()`
3. **Visualizing results** with ROC curves and confusion matrices
4. **Simulating over-clustering** scenarios
5. **Optimizing clustering** with `sc_optimize_all()`
6. **Comparing classifiers**

For more advanced usage, see the other vignettes:

- [Algorithm Principles](algorithm.html) - Mathematical foundations
- [Seurat Integration](seurat-integration.html) - Working with Seurat objects
- [Visualization Guide](visualization.html) - Comprehensive plotting

---

**Author**: Zaoqu Liu (liuzaoqu@163.com)  
**Package**: scClustEval v`r packageVersion("scClustEval")`