---
title: "Quick Start Guide"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 2
vignette: >
  %\VignetteIndexEntry{Quick Start Guide}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Global chunk options applied to every chunk that follows.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE,
  eval = FALSE  # chunks are shown but not evaluated when the vignette is knit
)
set.seed(42)
```

## Introduction

This vignette demonstrates the core functionality of **scClustEval** with executable examples using simulated data.

## Installation

```{r install, eval=FALSE}
# From R-universe (recommended)
install.packages("scClustEval", repos = "https://zaoqu-liu.r-universe.dev")

# From GitHub
remotes::install_github("Zaoqu-Liu/scClustEval")
```

## Loading the Package

```{r load}
library(scClustEval)
library(ggplot2)
```

## Creating Example Data

Let's create a synthetic dataset with known cluster structure to demonstrate the assessment workflow.

```{r create_data}
# Generate synthetic single-cell data
set.seed(42)
n_cells <- 600
n_features <- 50
n_clusters <- 4

# Create expression matrix with distinct clusters
X <- matrix(nrow = n_cells, ncol = n_features)
labels <- character(n_cells)
cells_per_cluster <- n_cells / n_clusters

# The block-wise fill below assumes equally sized clusters; a fractional
# cluster size would leave rows of X unfilled (NA).
stopifnot(n_cells %% n_clusters == 0)

for (i in seq_len(n_clusters)) {
  start_idx <- (i - 1) * cells_per_cluster + 1
  end_idx <- i * cells_per_cluster

  # Each cluster has a distinct mean expression profile
  cluster_mean <- rnorm(n_features, mean = i * 2, sd = 0.5)

  # With byrow = TRUE every row receives one copy of cluster_mean plus noise
  X[start_idx:end_idx, ] <- matrix(
    rep(cluster_mean, cells_per_cluster) +
      rnorm(cells_per_cluster * n_features, sd = 1),
    nrow = cells_per_cluster,
    byrow = TRUE
  )
  labels[start_idx:end_idx] <- paste0("Cluster_", i)
}

colnames(X) <- paste0("Gene_", seq_len(n_features))
cat("Data dimensions:", nrow(X), "cells x", ncol(X), "features\n")
cat("Clusters:", unique(labels), "\n")
```

## Basic Assessment

### Running Self-Projection

```{r assessment}
# Run clustering assessment
result <- sc_assessment(
  X = X,
  labels = labels,
  classifier = "LR",   # Logistic Regression
  penalty = "l1",      # L1 regularization (Lasso)
  test_size = 0.5,     # 50% for testing
  n_per_class = 100,   # Max 100 cells per cluster in training
  cv = 5,              # 5-fold cross-validation
  seed = 42,
  verbose = TRUE
)

# View summary
print(result)
```

### Understanding Results

```{r results_explain}
# Key metrics
cat("\n=== Key Metrics ===\n")
cat("Test Accuracy:", sprintf("%.1f%%", result$accuracy * 100), "\n")
cat("CV Accuracy:", sprintf("%.1f%%", result$cv_accuracy * 100), "\n")
cat("Max R1 Confusion:", sprintf("%.4f", result$max_r1), "\n")
cat("Max R2 Confusion:", sprintf("%.4f", result$max_r2), "\n")

# Per-cluster accuracy
cat("\n=== Per-Cluster Accuracy ===\n")
for (cl in names(result$per_class_accuracy)) {
  # [[ ]] extracts the bare value for this cluster
  cat(sprintf("  %s: %.1f%%\n", cl, result$per_class_accuracy[[cl]] * 100))
}
```

## Visualization

### ROC Curves

```{r roc_curves, fig.height=5, fig.width=10}
# Plot ROC and Precision-Recall curves
plot_roc(result, plot_type = "both", show_auc = TRUE)
```

### Confusion Matrix Heatmaps

```{r confusion_heatmaps, fig.height=5, fig.width=12}
library(gridExtra)

# Raw confusion matrix
p1 <- plot_confusion_heatmap(result, normalized = "raw", title = "Raw Counts")

# R1-normalized
p2 <- plot_confusion_heatmap(result, normalized = "R1", title = "R1 Normalized")

# R2-normalized
p3 <- plot_confusion_heatmap(result, normalized = "R2", title = "R2 Normalized")

grid.arrange(p1, p2, p3, ncol = 3)
```

## Simulating Over-Clustering

Now let's create an over-clustered scenario to demonstrate optimization.

```{r overclustering}
# Split some clusters to simulate over-clustering
labels_over <- labels

# Split Cluster_1 and Cluster_2 into two halves each ("a" / "b").
# The split point is derived from the actual cluster size so the example
# keeps working if the simulated data dimensions are changed.
for (cl in c("Cluster_1", "Cluster_2")) {
  idx <- which(labels == cl)
  in_first_half <- seq_along(idx) <= length(idx) %/% 2
  labels_over[idx[in_first_half]] <- paste0(cl, "a")
  labels_over[idx[!in_first_half]] <- paste0(cl, "b")
}

cat("Over-clustered labels:", unique(labels_over), "\n")
cat("Number of clusters:", length(unique(labels_over)), "\n")
```

### Assessment of Over-Clustered Data

```{r assess_over}
# Assess the over-clustered data
result_over <- sc_assessment(
  X = X,
  labels = labels_over,
  classifier = "LR",
  n_per_class = 50,
  cv = 5,
  seed = 42,
  verbose = TRUE
)

cat(
  "\nOver-clustering accuracy:",
  sprintf("%.1f%%", result_over$accuracy * 100), "\n"
)
cat(
  "Max R1 (indicates confusion):",
  sprintf("%.4f", result_over$max_r1), "\n"
)
```

```{r confusion_over, fig.height=5}
# Show confusion between artificial splits
plot_confusion_heatmap(
  result_over,
  normalized = "R1",
  title = "R1 Confusion (Over-clustered)"
)
```

## Single Optimization Round

```{r optimize_single}
# Run single optimization round
optim_round <- sc_optimize(
  X = X,
  labels = labels_over,
  classifier = "LR",
  n_iter = 3,         # 3 iterations for confusion matrix
  r1_cutoff = 0.1,    # Merge if R1 > 0.1
  r2_cutoff = 0.05,   # Or if R2 > 0.05
  seed = 42,
  verbose = TRUE
)

cat("\nClusters before:", optim_round$n_clusters_before, "\n")
cat("Clusters after:", optim_round$n_clusters_after, "\n")
cat("Accuracy:", sprintf("%.1f%%", optim_round$accuracy * 100), "\n")
```

## Full Optimization Pipeline

```{r optimize_full}
# Run full optimization
optim_result <- sc_optimize_all(
  X = X,
  labels = labels_over,
  min_accuracy = 0.90,  # Target 90% accuracy
  max_rounds = 10,
  classifier = "LR",
  r1_cutoff = 0.5,      # Start with high cutoff
  r2_cutoff = 0.05,
  seed = 42,
  verbose = TRUE
)

# Summary
print(optim_result)
```

### Optimization History

```{r optim_history, fig.height=5}
# Plot optimization progress
plot_optimization_history(optim_result, metric = "both")
```

### Compare Before and After

```{r compare_results}
# Final cluster distribution
cat("\n=== Optimization Summary ===\n")
cat("Initial clusters:", length(unique(labels_over)), "\n")
cat("Final clusters:", length(unique(optim_result$final_labels)), "\n")
cat(
  "Final accuracy:",
  sprintf("%.1f%%", optim_result$final_accuracy * 100), "\n"
)

# Cluster mapping
cat("\n=== Final Cluster Sizes ===\n")
print(table(optim_result$final_labels))
```

### Sankey Diagram

```{r sankey, fig.height=6, eval=FALSE}
# Visualize cluster reassignment
# ggalluvial is treated as optional: the plot is skipped when it is missing
if (requireNamespace("ggalluvial", quietly = TRUE)) {
  plot_cluster_sankey(
    labels_from = labels_over,
    labels_to = as.character(optim_result$final_labels),
    title = "Cluster Optimization Flow"
  )
}
```

## Using Different Classifiers

```{r classifiers_compare}
# List available classifiers
get_available_classifiers()
```

```{r rf_example}
# Try Random Forest
result_rf <- sc_assessment(
  X = X,
  labels = labels,
  classifier = "RF",
  n_per_class = 100,
  cv = 0,  # Skip CV for speed
  seed = 42,
  verbose = FALSE
)

cat(
  "Random Forest accuracy:",
  sprintf("%.1f%%", result_rf$accuracy * 100), "\n"
)
```

## Summary

This quick start guide demonstrated:

1. **Creating test data** with known cluster structure
2. **Running assessment** with `sc_assessment()`
3. **Visualizing results** with ROC curves and confusion matrices
4. **Simulating over-clustering** scenarios
5. **Optimizing clustering** with `sc_optimize_all()`
6. **Comparing classifiers**

For more advanced usage, see the other vignettes:

- [Algorithm Principles](algorithm.html) - Mathematical foundations
- [Seurat Integration](seurat-integration.html) - Working with Seurat objects
- [Visualization Guide](visualization.html) - Comprehensive plotting

---

**Author**: Zaoqu Liu (liuzaoqu@163.com)

**Package**: scClustEval v`r packageVersion("scClustEval")`