--- title: "Getting Started with CellOracleR" subtitle: "Complete Workflow for In Silico Gene Perturbation Analysis" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 fig_width: 8 fig_height: 6 vignette: > %\VignetteIndexEntry{Getting Started with CellOracleR} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, fig.align = "center", message = FALSE, warning = FALSE, eval = FALSE ) ``` ## Introduction CellOracleR enables **in silico gene perturbation analysis** of single-cell RNA-seq data. By combining gene regulatory network (GRN) inference with expression data, CellOracleR predicts cell state transitions in response to transcription factor perturbations. ### Key Features - **R6 Object-Oriented Design**: Clean, intuitive API - **Seurat Integration**: Seamless workflow with Seurat V4/V5 - **High Performance**: C++ acceleration for critical operations - **Publication-Ready Visualizations**: ggplot2-based plotting functions - **Comprehensive Network Analysis**: Centrality metrics, hub detection, and more ### Workflow Overview ```{r workflow-diagram, echo=FALSE, eval=TRUE, fig.height=3, fig.width=10} library(ggplot2) steps <- data.frame( x = 1:6, label = c("Seurat\nObject", "Import\nBase GRN", "Fit\nGRN", "Simulate\nPerturbation", "Transition\nProbability", "Visualize") ) ggplot(steps, aes(x = x, y = 0)) + geom_rect(aes(xmin = x - 0.4, xmax = x + 0.4, ymin = -0.3, ymax = 0.3), fill = c("#E3F2FD", "#E3F2FD", "#FFF3E0", "#FFF3E0", "#E8F5E9", "#E8F5E9"), color = "black") + geom_text(aes(label = label), size = 3.5, fontface = "bold") + geom_segment(data = data.frame(x = 1:5 + 0.4, xend = 2:6 - 0.4), aes(x = x, xend = xend, y = 0, yend = 0), arrow = arrow(length = unit(0.2, "cm"))) + theme_void() + labs(title = "CellOracleR Analysis Workflow") + theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14)) ``` ## Installation ```{r install} # Install from R-universe (recommended) install.packages("CellOracleR", repos = "https://zaoqu-liu.r-universe.dev") # Or install from GitHub devtools::install_github("Zaoqu-Liu/CellOracleR") # For motif analysis, install Bioconductor dependencies BiocManager::install(c( "TFBSTools", "motifmatchr", "BSgenome.Hsapiens.UCSC.hg38", "JASPAR2020" )) ``` ## Load Libraries ```{r libraries} library(CellOracleR) library(Seurat) library(ggplot2) library(future) # Enable parallel processing (optional) plan(multisession, workers = 4) ``` ## Step 1: Prepare Data CellOracleR works with standard **Seurat objects**. The data should be normalized (log-transformed) but **NOT** scaled/centered. ```{r prepare_seurat} # Load your Seurat object seurat <- readRDS("path/to/seurat.rds") # Check data structure seurat ``` ### Creating Demo Data For this tutorial, we'll create a mock dataset: ```{r create_demo} set.seed(42) n_cells <- 1000 n_genes <- 500 # Simulate expression with cluster structure cluster_means <- list( c1 = c(rep(3, 50), rep(1, 450)), # Cluster 1: high expression in first 50 genes c2 = c(rep(1, 50), rep(3, 100), rep(1, 350)), # Cluster 2 c3 = c(rep(1, 150), rep(3, 100), rep(1, 250)) # Cluster 3 ) # Create expression matrix cells_per_cluster <- 333 expr_list <- list() for (i in 1:3) { expr_list[[i]] <- matrix( rpois(cells_per_cluster * n_genes, lambda = cluster_means[[i]]), nrow = n_genes ) } counts <- do.call(cbind, expr_list) rownames(counts) <- paste0("Gene", 1:n_genes) colnames(counts) <- paste0("Cell", 1:ncol(counts)) # Create Seurat object seurat <- CreateSeuratObject(counts = counts) seurat <- NormalizeData(seurat) seurat <- FindVariableFeatures(seurat) seurat <- ScaleData(seurat) seurat <- RunPCA(seurat) seurat <- FindNeighbors(seurat, dims = 1:30) seurat <- FindClusters(seurat, resolution = 0.5) seurat <- RunUMAP(seurat, dims = 1:30) ``` ## Step 2: Create Oracle Object ```{r create_oracle} # Create Oracle from Seurat oracle <- create_oracle( seurat = seurat, cluster_column = "seurat_clusters", embedding_name = "umap", verbose = TRUE ) # Print summary oracle ``` **Output:** ``` CellOracleR Oracle Object Cells: 1000 Genes: 500 Clusters: 3 (0, 1, 2) Embedding: UMAP (2D) GRN fitted: FALSE Simulation done: FALSE ``` ## Step 3: Import Base GRN The **base GRN** defines which TFs can potentially regulate which genes. Options: ### Option A: From Motif Analysis (Recommended) ```{r motif_grn} # Requires scATAC-seq peak data base_grn <- create_base_grn( peaks_df = peaks_df, # data.frame with chr, start, end, gene_short_name ref_genome = "hg38", upstream = 2000, downstream = 2000, fpr = 0.02, n_cores = 4 ) ``` ### Option B: From Pre-built Database ```{r load_grn} # Load from CSV file base_grn <- load_base_grn("tf_target_database.csv") ``` ### Option C: Custom Dictionary ```{r custom_grn} # Define regulatory relationships manually # List format: target_gene -> c(regulator1, regulator2, ...) TFdict <- list( Gene1 = c("Gene10", "Gene20", "Gene30"), Gene2 = c("Gene10", "Gene15", "Gene25"), Gene3 = c("Gene20", "Gene30", "Gene40") ) # For demo: create simple TF dictionary TFdict <- lapply(paste0("Gene", 1:100), function(g) { sample(paste0("Gene", 101:200), size = sample(3:8, 1)) }) names(TFdict) <- paste0("Gene", 1:100) ``` ### Import to Oracle ```{r import_tf} oracle$import_TF_data(TFdict = TFdict) ``` ## Step 4: Preprocessing ```{r preprocess} # Use Seurat's PCA or compute new oracle$perform_PCA( n_components = 50, use_seurat_pca = TRUE # Use existing PCA from Seurat ) # KNN imputation for noise reduction (optional but recommended) oracle$knn_imputation( k = 30, # Number of neighbors (NULL for auto) n_pca_dims = 50 # PCA dimensions for neighbor search ) ``` ## Step 5: Fit Cluster-Specific GRN This is the core step - fitting regulatory coefficients using Ridge regression: ```{r fit_grn} oracle$fit_grn_for_simulation( GRN_unit = "cluster", # Fit separate GRN per cluster alpha = 10, # Ridge regularization (1-100) bagging_number = 200, # Bootstrap iterations verbose = TRUE ) ``` **Key Parameters:** | Parameter | Description | Recommended | |-----------|-------------|-------------| | `GRN_unit` | "cluster" or "whole" | "cluster" | | `alpha` | Regularization strength | 10 | | `bagging_number` | Bootstrap iterations | 200 | ## Step 6: Simulate Perturbation Now we can simulate the effects of gene perturbations: ```{r simulate} # Define perturbation: Gene10 knockout (expression = 0) perturb_condition <- list(Gene10 = 0) # Or use helper function perturb_condition <- create_perturb_condition( genes = "Gene10", expression = 0 ) # Run simulation oracle$simulate_shift( perturb_condition = perturb_condition, n_propagation = 3 # Signal propagation steps (1-5) ) # View summary simulation_summary(oracle) ``` **Perturbation Types:** ```{r perturb_types} # Knockout (KO): Set expression to 0 perturb_ko <- list(Gene10 = 0) # Overexpression (OE): Set to maximum observed value max_expr <- max(oracle$adata$layers$imputed_count["Gene10", ]) perturb_oe <- list(Gene10 = max_expr) # Multiple perturbations perturb_multi <- list( Gene10 = 0, # KO Gene10 Gene20 = max_expr # OE Gene20 ) ``` ## Step 7: Calculate Transition Probabilities ```{r transition} # Compute transition probabilities oracle$estimate_transition_prob( k = 200, # KNN neighbors sampled_fraction = 1, # Cell sampling fraction calculate_randomized = TRUE ) # Calculate embedding shifts oracle$calculate_embedding_shift() # Calculate grid arrows for visualization oracle$calculate_grid_arrows( n_grid = 40, n_neighbors = 100, smooth = 0.5 ) ``` ## Step 8: Visualize Results ### Cluster Overview ```{r plot_cluster, eval=TRUE, echo=FALSE} # Demo visualization library(ggplot2) set.seed(42) demo_plot_data <- data.frame( UMAP_1 = c(rnorm(333, -2, 0.8), rnorm(334, 2, 0.8), rnorm(333, 0, 0.8)), UMAP_2 = c(rnorm(333, 0, 0.8), rnorm(334, 2, 0.8), rnorm(333, -2, 0.8)), cluster = factor(rep(0:2, c(333, 334, 333))) ) ggplot(demo_plot_data, aes(x = UMAP_1, y = UMAP_2, color = cluster)) + geom_point(alpha = 0.6, size = 1) + scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A")) + labs(title = "Cell Clusters", color = "Cluster") + theme_minimal() + theme(panel.grid = element_blank()) + coord_fixed() ``` ```{r plot_cluster_code} # Plot clusters plot_cluster(oracle, title = "Cell Clusters") ``` ### Simulation Flow Field The main visualization showing predicted cell state transitions: ```{r plot_flow, eval=TRUE, echo=FALSE} # Demo flow visualization grid_x <- seq(-4, 4, by = 0.5) grid_y <- seq(-4, 4, by = 0.5) grid <- expand.grid(x = grid_x, y = grid_y) # Simulate flow toward cluster 1 target <- c(2, 2) grid$dx <- (target[1] - grid$x) / sqrt((target[1] - grid$x)^2 + (target[2] - grid$y)^2 + 0.1) * 0.3 grid$dy <- (target[2] - grid$y) / sqrt((target[1] - grid$x)^2 + (target[2] - grid$y)^2 + 0.1) * 0.3 ggplot() + geom_point(data = demo_plot_data, aes(x = UMAP_1, y = UMAP_2, color = cluster), alpha = 0.3, size = 0.8) + geom_segment(data = grid, aes(x = x, y = y, xend = x + dx, yend = y + dy), arrow = arrow(length = unit(0.1, "cm")), color = "black", alpha = 0.7) + scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A")) + labs(title = "Gene10 Knockout: Predicted Flow", subtitle = "Arrows show predicted cell movement direction") + theme_minimal() + theme(panel.grid = element_blank()) + coord_fixed() ``` ```{r plot_flow_code} # Main result visualization plot_simulation_flow( oracle, scale = 30, min_mass = 0.01, arrow_color = "black", title = "Gene10 KO: Predicted Cell Transitions" ) ``` ### Gene Expression Changes ```{r plot_expression, eval=TRUE, echo=FALSE} # Demo expression plot demo_plot_data$expression <- c( rnorm(333, 3, 0.5), rnorm(334, 1, 0.3), rnorm(333, 1, 0.3) ) ggplot(demo_plot_data, aes(x = UMAP_1, y = UMAP_2, color = expression)) + geom_point(alpha = 0.7, size = 1) + scale_color_viridis_c(option = "plasma") + labs(title = "Gene10 Expression", color = "Expression") + theme_minimal() + theme(panel.grid = element_blank()) + coord_fixed() ``` ```{r plot_expression_code} # Before perturbation plot_gene_expression(oracle, "Gene10", title = "Gene10 Expression") # Simulated change plot_gene_expression(oracle, "Gene10", layer = "delta_X", title = "Gene10 Change (Delta)") ``` ## Step 9: Network Analysis For detailed network characterization: ```{r network} # Get Links object links <- oracle$get_links( cluster_name_for_GRN_unit = "0", # Cluster name alpha = 10, bagging_number = 200 ) # Filter by significance links$filter( threshold_p = 0.001, threshold_coef = 0.1 ) # Calculate network metrics links$get_network_score() # View network summary print(links) ``` ### Top Regulators ```{r top_regulators, eval=TRUE, echo=FALSE} # Demo ranking plot top_genes <- data.frame( gene = paste0("Gene", c(105, 123, 142, 167, 189, 112, 134, 156, 178, 190)), degree = c(25, 22, 20, 18, 17, 16, 15, 14, 13, 12) ) top_genes$gene <- factor(top_genes$gene, levels = rev(top_genes$gene)) ggplot(top_genes, aes(x = degree, y = gene)) + geom_col(fill = "#2196F3", color = "black") + labs(title = "Top Regulators by Out-Degree", x = "Out-Degree (Number of Targets)", y = "Transcription Factor") + theme_bw() + theme(plot.title = element_text(face = "bold")) ``` ```{r top_regulators_code} # Rank TFs by network metrics plot_scores_as_rank(links, metric = "degree_out", top_n = 20) ``` ## Saving and Loading ```{r save_load} # Save Oracle object (uses qs for fast serialization) save_oracle(oracle, "my_analysis.qs") # Load Oracle object oracle <- load_oracle("my_analysis.qs") ``` ## Advanced: Comparing Multiple Perturbations ```{r compare} # List of genes to perturb genes_to_test <- c("Gene10", "Gene20", "Gene30") # Run systematic analysis plots <- list() for (gene in genes_to_test) { # Create copy to avoid overwriting oracle_test <- oracle$clone(deep = TRUE) # Simulate knockout oracle_test$simulate_shift( perturb_condition = list(gene = 0), n_propagation = 3 ) # Calculate transitions oracle_test$estimate_transition_prob() oracle_test$calculate_embedding_shift() oracle_test$calculate_grid_arrows() # Store plot plots[[gene]] <- plot_simulation_flow(oracle_test, title = paste(gene, "KO")) } # Combine plots library(patchwork) wrap_plots(plots, ncol = 2) ``` ## Summary You've learned how to: 1. ✅ Create an Oracle object from Seurat data 2. ✅ Import regulatory network priors 3. ✅ Fit cluster-specific GRNs 4. ✅ Simulate gene perturbations 5. ✅ Calculate transition probabilities 6. ✅ Visualize predicted cell fate changes 7. ✅ Perform network analysis ## Next Steps - Read the [Algorithm Principles](algorithm-principles.html) vignette for mathematical details - Explore the [Visualization Gallery](visualization-gallery.html) for more plot types - See the [Case Study](case-study.html) for a complete biological example ## Session Info ```{r session, eval=TRUE} sessionInfo() ```