--- title: "Velocity Model Comparison" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 2 fig_width: 8 fig_height: 6 vignette: > %\VignetteIndexEntry{Velocity Model Comparison} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, message = FALSE, warning = FALSE ) ``` ## Introduction **scVeloR** implements three velocity estimation models, each with distinct assumptions and use cases. This vignette provides a comprehensive comparison to help you choose the appropriate model for your data. ## Model Overview ### 1. Deterministic (Steady-State) Model **Assumption**: Cells are near transcriptional equilibrium. **Key equation**: $$\gamma \approx \frac{\beta u}{s}$$ **Velocity**: $$v_s = \beta u - \gamma s$$ **Pros**: - Fast computation - Requires minimal data - Works well for slowly changing processes **Cons**: - May miss transient dynamics - Assumes equilibrium ### 2. Stochastic Model **Assumption**: Accounts for transcriptional bursting and noise. **Uses**: First and second-order moments (mean, variance, covariance) **Pros**: - More robust to noise - Better for bursty transcription - Intermediate computational cost **Cons**: - Still assumes near-equilibrium - Requires sufficient cell numbers ### 3. Dynamical Model **Assumption**: Full kinetics without equilibrium constraint. **Inferred parameters**: α (transcription), β (splicing), γ (degradation), t_ (switching time) **Pros**: - Captures transient states - Infers absolute time - Most accurate for complex dynamics **Cons**: - Computationally intensive - Requires high-quality data - May overfit small datasets ## Comparison Table ```{r comparison_table, echo=FALSE} comparison_df <- data.frame( Feature = c("Speed", "Data requirement", "Parameters inferred", "Handles transient states", "Provides latent time", "Recommended dataset size", "Best for"), `Steady-State` = c("~1 min", "Low", "γ only", "No", "No", ">1,000 cells", "Exploration"), Stochastic = c("~5 min", "Medium", "γ only", "Partial", "No", ">3,000 cells", "Default choice"), Dynamical = c("~30 min", "High", "α, β, γ, t_", "Yes", "Yes", ">5,000 cells", "Publication") ) knitr::kable(comparison_df, col.names = c("Feature", "Steady-State", "Stochastic", "Dynamical")) ``` ## Visual Comparison ```{r model_comparison_visual, echo=FALSE, fig.cap="Velocity vectors from different models showing varying sensitivity to dynamics."} library(ggplot2) # Simulate data with transient dynamics set.seed(42) n <- 200 # True trajectory (circle with branch) theta <- seq(0, 1.8*pi, length.out = n) r <- 1 + 0.3 * sin(2 * theta) x <- r * cos(theta) + rnorm(n, 0, 0.08) y <- r * sin(theta) + rnorm(n, 0, 0.08) # True velocity (tangent) true_vx <- -r * sin(theta) * 0.1 true_vy <- r * cos(theta) * 0.1 # Model 1: Steady-state (captures general direction but misses nuances) ss_vx <- true_vx * 0.7 + rnorm(n, 0, 0.03) ss_vy <- true_vy * 0.7 + rnorm(n, 0, 0.03) # Model 2: Stochastic (better but still smoothed) stoch_vx <- true_vx * 0.85 + rnorm(n, 0, 0.02) stoch_vy <- true_vy * 0.85 + rnorm(n, 0, 0.02) # Model 3: Dynamical (most accurate) dyn_vx <- true_vx * 0.95 + rnorm(n, 0, 0.01) dyn_vy <- true_vy * 0.95 + rnorm(n, 0, 0.01) # Combine data df_models <- rbind( data.frame(x = x, y = y, vx = ss_vx, vy = ss_vy, model = "Steady-State"), data.frame(x = x, y = y, vx = stoch_vx, vy = stoch_vy, model = "Stochastic"), data.frame(x = x, y = y, vx = dyn_vx, vy = dyn_vy, model = "Dynamical") ) # Subsample for visualization arrow_idx <- sample(n, 60) df_arrows <- rbind( data.frame(x = x[arrow_idx], y = y[arrow_idx], vx = ss_vx[arrow_idx], vy = ss_vy[arrow_idx], model = "Steady-State"), data.frame(x = x[arrow_idx], y = y[arrow_idx], vx = stoch_vx[arrow_idx], vy = stoch_vy[arrow_idx], model = "Stochastic"), data.frame(x = x[arrow_idx], y = y[arrow_idx], vx = dyn_vx[arrow_idx], vy = dyn_vy[arrow_idx], model = "Dynamical") ) df_arrows$model <- factor(df_arrows$model, levels = c("Steady-State", "Stochastic", "Dynamical")) ggplot(df_arrows, aes(x = x, y = y)) + geom_point(color = "#2196F3", size = 1, alpha = 0.6) + geom_segment(aes(xend = x + vx, yend = y + vy), arrow = arrow(length = unit(0.1, "cm"), type = "closed"), linewidth = 0.4, color = "#F44336", alpha = 0.7) + facet_wrap(~model, ncol = 3) + labs(x = "UMAP_1", y = "UMAP_2", title = "Velocity Vectors by Model") + theme_minimal() + coord_fixed() + theme(strip.text = element_text(size = 12, face = "bold")) ``` ## When to Use Each Model ### Use Steady-State When: - **Quick exploration**: Getting a first look at your data - **Large datasets**: >50,000 cells where speed matters - **Equilibrium processes**: Homeostatic cell populations - **Limited compute resources** ```{r steady_state_usage, eval=FALSE} seurat_obj <- velocity(seurat_obj, mode = "deterministic") ``` ### Use Stochastic When: - **Default choice**: Good balance of speed and accuracy - **Noisy data**: High technical variability - **Transcriptional bursting**: Known bursty gene expression ```{r stochastic_usage, eval=FALSE} seurat_obj <- velocity(seurat_obj, mode = "stochastic") ``` ### Use Dynamical When: - **Publication-quality results**: Need the most accurate estimates - **Transient states**: Rapid differentiation, cell activation - **Latent time needed**: Want absolute temporal ordering - **Parameter interpretation**: Need kinetic rate estimates ```{r dynamical_usage, eval=FALSE} seurat_obj <- velocity(seurat_obj, mode = "dynamical", max_iter = 10) ``` ## Diagnostic Plots ### Phase Portrait Comparison ```{r phase_comparison, echo=FALSE, fig.cap="Phase portraits showing how each model fits the data."} # Simulate phase data set.seed(123) n <- 200 # True dynamics alpha <- 2 beta <- 0.5 gamma <- 0.3 t_switch <- 1.2 t <- runif(n, 0, 2.2) # Calculate u and s u <- ifelse(t < t_switch, alpha/beta * (1 - exp(-beta * t)), alpha/beta * (1 - exp(-beta * t_switch)) * exp(-beta * (t - t_switch))) s <- numeric(n) for (i in 1:n) { if (t[i] < t_switch) { s[i] <- alpha/gamma * (1 - exp(-gamma * t[i])) + alpha/(gamma - beta) * (exp(-gamma * t[i]) - exp(-beta * t[i])) } else { u_sw <- alpha/beta * (1 - exp(-beta * t_switch)) s_sw <- alpha/gamma * (1 - exp(-gamma * t_switch)) + alpha/(gamma - beta) * (exp(-gamma * t_switch) - exp(-beta * t_switch)) dt <- t[i] - t_switch s[i] <- s_sw * exp(-gamma * dt) + beta * u_sw / (gamma - beta) * (exp(-gamma * dt) - exp(-beta * dt)) } } # Add noise u_obs <- pmax(0, u + rnorm(n, 0, 0.15)) s_obs <- pmax(0, s + rnorm(n, 0, 0.15)) # Fitted lines # Steady-state: linear fit lm_fit <- lm(u_obs ~ s_obs) ss_slope <- coef(lm_fit)[2] # True trajectory for dynamical t_fit <- seq(0, 2.2, length.out = 200) u_fit <- ifelse(t_fit < t_switch, alpha/beta * (1 - exp(-beta * t_fit)), alpha/beta * (1 - exp(-beta * t_switch)) * exp(-beta * (t_fit - t_switch))) s_fit <- numeric(length(t_fit)) for (i in 1:length(t_fit)) { if (t_fit[i] < t_switch) { s_fit[i] <- alpha/gamma * (1 - exp(-gamma * t_fit[i])) + alpha/(gamma - beta) * (exp(-gamma * t_fit[i]) - exp(-beta * t_fit[i])) } else { u_sw <- alpha/beta * (1 - exp(-beta * t_switch)) s_sw <- alpha/gamma * (1 - exp(-gamma * t_switch)) + alpha/(gamma - beta) * (exp(-gamma * t_switch) - exp(-beta * t_switch)) dt <- t_fit[i] - t_switch s_fit[i] <- s_sw * exp(-gamma * dt) + beta * u_sw / (gamma - beta) * (exp(-gamma * dt) - exp(-beta * dt)) } } phase_df <- data.frame(s = s_obs, u = u_obs, t = t) fit_df <- data.frame(s = s_fit, u = u_fit) ggplot(phase_df, aes(x = s, y = u)) + geom_point(aes(color = t), size = 2, alpha = 0.7) + # Steady-state fit (linear) geom_abline(slope = ss_slope, intercept = coef(lm_fit)[1], color = "#E91E63", linewidth = 1, linetype = "dashed") + # Dynamical fit (curve) geom_path(data = fit_df, aes(x = s, y = u), color = "#4CAF50", linewidth = 1.2) + scale_color_viridis_c(option = "plasma") + labs(x = "Spliced", y = "Unspliced", title = "Phase Portrait: Model Comparison", color = "Time") + theme_minimal() + annotate("text", x = max(s_obs)*0.6, y = max(u_obs)*0.95, label = "— Steady-State (linear)", color = "#E91E63", hjust = 0) + annotate("text", x = max(s_obs)*0.6, y = max(u_obs)*0.85, label = "— Dynamical (curve)", color = "#4CAF50", hjust = 0) ``` ## R-squared Distribution ```{r r2_distribution, echo=FALSE, fig.cap="Distribution of R² values indicating model fit quality."} # Simulate R² distributions set.seed(42) n_genes <- 500 r2_ss <- rbeta(n_genes, 2, 5) * 0.6 r2_stoch <- rbeta(n_genes, 3, 4) * 0.7 r2_dyn <- rbeta(n_genes, 4, 3) * 0.8 r2_df <- rbind( data.frame(r2 = r2_ss, model = "Steady-State"), data.frame(r2 = r2_stoch, model = "Stochastic"), data.frame(r2 = r2_dyn, model = "Dynamical") ) r2_df$model <- factor(r2_df$model, levels = c("Steady-State", "Stochastic", "Dynamical")) ggplot(r2_df, aes(x = r2, fill = model)) + geom_density(alpha = 0.6) + scale_fill_manual(values = c("#E91E63", "#FF9800", "#4CAF50")) + labs(x = expression(R^2), y = "Density", title = expression(paste("Distribution of ", R^2, " Values by Model")), fill = "Model") + theme_minimal() + geom_vline(xintercept = 0.1, linetype = "dashed", color = "gray40") + annotate("text", x = 0.12, y = 3, label = "Threshold", hjust = 0, color = "gray40") ``` ## Computational Benchmarks ```{r benchmark, echo=FALSE, fig.cap="Computational time comparison across dataset sizes."} # Simulated benchmark data cells <- c(1000, 5000, 10000, 20000, 50000) time_ss <- c(5, 15, 30, 60, 150) time_stoch <- c(20, 90, 180, 360, 900) time_dyn <- c(120, 600, 1200, 2400, 6000) bench_df <- rbind( data.frame(cells = cells, time = time_ss, model = "Steady-State"), data.frame(cells = cells, time = time_stoch, model = "Stochastic"), data.frame(cells = cells, time = time_dyn, model = "Dynamical") ) ggplot(bench_df, aes(x = cells/1000, y = time/60, color = model)) + geom_line(linewidth = 1.2) + geom_point(size = 3) + scale_color_manual(values = c("Steady-State" = "#E91E63", "Stochastic" = "#FF9800", "Dynamical" = "#4CAF50")) + scale_y_log10() + labs(x = "Number of Cells (thousands)", y = "Time (minutes, log scale)", title = "Computational Time by Model", color = "Model") + theme_minimal() ``` ## Practical Workflow ### Recommended Approach ```{r workflow, eval=FALSE} library(scVeloR) # 1. Start with steady-state for quick exploration seurat_obj <- velocity(seurat_obj, mode = "deterministic") plot_velocity(seurat_obj) # 2. If results look promising, try stochastic seurat_obj <- velocity(seurat_obj, mode = "stochastic") plot_velocity(seurat_obj) # 3. For final analysis, use dynamical seurat_obj <- velocity(seurat_obj, mode = "dynamical") plot_velocity(seurat_obj) # 4. Compare results velocity_summary(seurat_obj) ``` ## Session Information ```{r session} sessionInfo() ```