Getting Started with CellOracleR

Introduction

CellOracleR enables in silico gene perturbation analysis of single-cell RNA-seq data. By combining gene regulatory network (GRN) inference with expression data, CellOracleR predicts cell state transitions in response to transcription factor perturbations.

Key Features

  • R6 Object-Oriented Design: Clean, intuitive API
  • Seurat Integration: Seamless workflow with Seurat V4/V5
  • High Performance: C++ acceleration for critical operations
  • Publication-Ready Visualizations: ggplot2-based plotting functions
  • Comprehensive Network Analysis: Centrality metrics, hub detection, and more

Workflow Overview

Installation

# Install from R-universe (recommended)
install.packages("CellOracleR", repos = "https://zaoqu-liu.r-universe.dev")

# Or install from GitHub
devtools::install_github("Zaoqu-Liu/CellOracleR")

# For motif analysis, install Bioconductor dependencies
BiocManager::install(c(
  "TFBSTools", "motifmatchr", 
  "BSgenome.Hsapiens.UCSC.hg38", "JASPAR2020"
))

Load Libraries

library(CellOracleR)
library(Seurat)
library(ggplot2)
library(future)

# Enable parallel processing (optional)
plan(multisession, workers = 4)

Step 1: Prepare Data

CellOracleR works with standard Seurat objects. The data should be normalized (log-transformed) but NOT scaled/centered.

# Load your Seurat object
seurat <- readRDS("path/to/seurat.rds")

# Check data structure
seurat

Creating Demo Data

For this tutorial, we’ll create a mock dataset:

set.seed(42)
n_cells <- 1000
n_genes <- 500

# Simulate expression with cluster structure
cluster_means <- list(
  c1 = c(rep(3, 50), rep(1, 450)),  # Cluster 1: high expression in first 50 genes
  c2 = c(rep(1, 50), rep(3, 100), rep(1, 350)),  # Cluster 2
  c3 = c(rep(1, 150), rep(3, 100), rep(1, 250))  # Cluster 3
)

# Create expression matrix
cells_per_cluster <- 333
expr_list <- list()
for (i in 1:3) {
  expr_list[[i]] <- matrix(
    rpois(cells_per_cluster * n_genes, lambda = cluster_means[[i]]),
    nrow = n_genes
  )
}
counts <- do.call(cbind, expr_list)
rownames(counts) <- paste0("Gene", 1:n_genes)
colnames(counts) <- paste0("Cell", 1:ncol(counts))

# Create Seurat object
seurat <- CreateSeuratObject(counts = counts)
seurat <- NormalizeData(seurat)
seurat <- FindVariableFeatures(seurat)
seurat <- ScaleData(seurat)
seurat <- RunPCA(seurat)
seurat <- FindNeighbors(seurat, dims = 1:30)
seurat <- FindClusters(seurat, resolution = 0.5)
seurat <- RunUMAP(seurat, dims = 1:30)

Step 2: Create Oracle Object

# Create Oracle from Seurat
oracle <- create_oracle(
  seurat = seurat,
  cluster_column = "seurat_clusters",
  embedding_name = "umap",
  verbose = TRUE
)

# Print summary
oracle

Output:

CellOracleR Oracle Object
  Cells: 1000
  Genes: 500
  Clusters: 3 (0, 1, 2)
  Embedding: UMAP (2D)
  GRN fitted: FALSE
  Simulation done: FALSE

Step 3: Import Base GRN

The base GRN defines which TFs can potentially regulate which genes. Options:

Option B: From Pre-built Database

# Load from CSV file
base_grn <- load_base_grn("tf_target_database.csv")

Option C: Custom Dictionary

# Define regulatory relationships manually
# List format: target_gene -> c(regulator1, regulator2, ...)
TFdict <- list(
  Gene1 = c("Gene10", "Gene20", "Gene30"),
  Gene2 = c("Gene10", "Gene15", "Gene25"),
  Gene3 = c("Gene20", "Gene30", "Gene40")
)

# For demo: create simple TF dictionary
TFdict <- lapply(paste0("Gene", 1:100), function(g) {
  sample(paste0("Gene", 101:200), size = sample(3:8, 1))
})
names(TFdict) <- paste0("Gene", 1:100)

Import to Oracle

oracle$import_TF_data(TFdict = TFdict)

Step 4: Preprocessing

# Use Seurat's PCA or compute new
oracle$perform_PCA(
  n_components = 50,
  use_seurat_pca = TRUE  # Use existing PCA from Seurat
)

# KNN imputation for noise reduction (optional but recommended)
oracle$knn_imputation(
  k = 30,           # Number of neighbors (NULL for auto)
  n_pca_dims = 50   # PCA dimensions for neighbor search
)

Step 5: Fit Cluster-Specific GRN

This is the core step - fitting regulatory coefficients using Ridge regression:

oracle$fit_grn_for_simulation(
  GRN_unit = "cluster",     # Fit separate GRN per cluster
  alpha = 10,               # Ridge regularization (1-100)
  bagging_number = 200,     # Bootstrap iterations
  verbose = TRUE
)

Key Parameters:

Parameter Description Recommended
GRN_unit “cluster” or “whole” “cluster”
alpha Regularization strength 10
bagging_number Bootstrap iterations 200

Step 6: Simulate Perturbation

Now we can simulate the effects of gene perturbations:

# Define perturbation: Gene10 knockout (expression = 0)
perturb_condition <- list(Gene10 = 0)

# Or use helper function
perturb_condition <- create_perturb_condition(
  genes = "Gene10",
  expression = 0
)

# Run simulation
oracle$simulate_shift(
  perturb_condition = perturb_condition,
  n_propagation = 3  # Signal propagation steps (1-5)
)

# View summary
simulation_summary(oracle)

Perturbation Types:

# Knockout (KO): Set expression to 0
perturb_ko <- list(Gene10 = 0)

# Overexpression (OE): Set to maximum observed value
max_expr <- max(oracle$adata$layers$imputed_count["Gene10", ])
perturb_oe <- list(Gene10 = max_expr)

# Multiple perturbations
perturb_multi <- list(
  Gene10 = 0,      # KO Gene10
  Gene20 = max_expr # OE Gene20
)

Step 7: Calculate Transition Probabilities

# Compute transition probabilities
oracle$estimate_transition_prob(
  k = 200,                   # KNN neighbors
  sampled_fraction = 1,      # Cell sampling fraction
  calculate_randomized = TRUE
)

# Calculate embedding shifts
oracle$calculate_embedding_shift()

# Calculate grid arrows for visualization
oracle$calculate_grid_arrows(
  n_grid = 40,
  n_neighbors = 100,
  smooth = 0.5
)

Step 8: Visualize Results

Cluster Overview

# Plot clusters
plot_cluster(oracle, title = "Cell Clusters")

Simulation Flow Field

The main visualization showing predicted cell state transitions:

# Main result visualization
plot_simulation_flow(
  oracle,
  scale = 30,
  min_mass = 0.01,
  arrow_color = "black",
  title = "Gene10 KO: Predicted Cell Transitions"
)

Gene Expression Changes

# Before perturbation
plot_gene_expression(oracle, "Gene10", title = "Gene10 Expression")

# Simulated change
plot_gene_expression(oracle, "Gene10", layer = "delta_X", 
                     title = "Gene10 Change (Delta)")

Step 9: Network Analysis

For detailed network characterization:

# Get Links object
links <- oracle$get_links(
  cluster_name_for_GRN_unit = "0",  # Cluster name
  alpha = 10,
  bagging_number = 200
)

# Filter by significance
links$filter(
  threshold_p = 0.001,
  threshold_coef = 0.1
)

# Calculate network metrics
links$get_network_score()

# View network summary
print(links)

Top Regulators

# Rank TFs by network metrics
plot_scores_as_rank(links, metric = "degree_out", top_n = 20)

Saving and Loading

# Save Oracle object (uses qs for fast serialization)
save_oracle(oracle, "my_analysis.qs")

# Load Oracle object
oracle <- load_oracle("my_analysis.qs")

Advanced: Comparing Multiple Perturbations

# List of genes to perturb
genes_to_test <- c("Gene10", "Gene20", "Gene30")

# Run systematic analysis
plots <- list()
for (gene in genes_to_test) {
  # Create copy to avoid overwriting
  oracle_test <- oracle$clone(deep = TRUE)
  
  # Simulate knockout
  oracle_test$simulate_shift(
    perturb_condition = list(gene = 0),
    n_propagation = 3
  )
  
  # Calculate transitions
  oracle_test$estimate_transition_prob()
  oracle_test$calculate_embedding_shift()
  oracle_test$calculate_grid_arrows()
  
  # Store plot
  plots[[gene]] <- plot_simulation_flow(oracle_test, title = paste(gene, "KO"))
}

# Combine plots
library(patchwork)
wrap_plots(plots, ncol = 2)

Summary

You’ve learned how to:

  1. ✅ Create an Oracle object from Seurat data
  2. ✅ Import regulatory network priors
  3. ✅ Fit cluster-specific GRNs
  4. ✅ Simulate gene perturbations
  5. ✅ Calculate transition probabilities
  6. ✅ Visualize predicted cell fate changes
  7. ✅ Perform network analysis

Next Steps

Session Info

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] patchwork_1.3.2 Matrix_1.7-5    ggplot2_4.0.3   rmarkdown_2.31 
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6       jsonlite_2.0.0     dplyr_1.2.1        compiler_4.6.0    
#>  [5] tidyselect_1.2.1   jquerylib_0.1.4    splines_4.6.0      scales_1.4.0      
#>  [9] yaml_2.3.12        fastmap_1.2.0      lattice_0.22-9     R6_2.6.1          
#> [13] labeling_0.4.3     generics_0.1.4     knitr_1.51         tibble_3.3.1      
#> [17] maketools_1.3.2    bslib_0.11.0       pillar_1.11.1      RColorBrewer_1.1-3
#> [21] rlang_1.2.0        cachem_1.1.0       xfun_0.57          sass_0.4.10       
#> [25] sys_3.4.3          S7_0.2.2           otel_0.2.0         viridisLite_0.4.3 
#> [29] cli_3.6.6          mgcv_1.9-4         withr_3.0.2        magrittr_2.0.5    
#> [33] digest_0.6.39      grid_4.6.0         nlme_3.1-169       lifecycle_1.0.5   
#> [37] vctrs_0.7.3        evaluate_1.0.5     glue_1.8.1         farver_2.1.2      
#> [41] buildtools_1.0.0   tools_4.6.0        pkgconfig_2.0.3    htmltools_0.5.9