CellOracleR enables in silico gene perturbation analysis of single-cell RNA-seq data. By combining gene regulatory network (GRN) inference with expression data, CellOracleR predicts cell state transitions in response to transcription factor perturbations.
# Install from R-universe (recommended)
install.packages("CellOracleR", repos = "https://zaoqu-liu.r-universe.dev")
# Or install from GitHub
devtools::install_github("Zaoqu-Liu/CellOracleR")
# For motif analysis, install Bioconductor dependencies
BiocManager::install(c(
"TFBSTools", "motifmatchr",
"BSgenome.Hsapiens.UCSC.hg38", "JASPAR2020"
))CellOracleR works with standard Seurat objects. The data should be normalized (log-transformed) but NOT scaled/centered.
For this tutorial, we’ll create a mock dataset:
set.seed(42)
n_cells <- 1000
n_genes <- 500
# Simulate expression with cluster structure
cluster_means <- list(
c1 = c(rep(3, 50), rep(1, 450)), # Cluster 1: high expression in first 50 genes
c2 = c(rep(1, 50), rep(3, 100), rep(1, 350)), # Cluster 2
c3 = c(rep(1, 150), rep(3, 100), rep(1, 250)) # Cluster 3
)
# Create expression matrix
cells_per_cluster <- 333
expr_list <- list()
for (i in 1:3) {
expr_list[[i]] <- matrix(
rpois(cells_per_cluster * n_genes, lambda = cluster_means[[i]]),
nrow = n_genes
)
}
counts <- do.call(cbind, expr_list)
rownames(counts) <- paste0("Gene", 1:n_genes)
colnames(counts) <- paste0("Cell", 1:ncol(counts))
# Create Seurat object
seurat <- CreateSeuratObject(counts = counts)
seurat <- NormalizeData(seurat)
seurat <- FindVariableFeatures(seurat)
seurat <- ScaleData(seurat)
seurat <- RunPCA(seurat)
seurat <- FindNeighbors(seurat, dims = 1:30)
seurat <- FindClusters(seurat, resolution = 0.5)
seurat <- RunUMAP(seurat, dims = 1:30)# Create Oracle from Seurat
oracle <- create_oracle(
seurat = seurat,
cluster_column = "seurat_clusters",
embedding_name = "umap",
verbose = TRUE
)
# Print summary
oracleOutput:
CellOracleR Oracle Object
Cells: 1000
Genes: 500
Clusters: 3 (0, 1, 2)
Embedding: UMAP (2D)
GRN fitted: FALSE
Simulation done: FALSE
The base GRN defines which TFs can potentially regulate which genes. Options:
# Define regulatory relationships manually
# List format: target_gene -> c(regulator1, regulator2, ...)
TFdict <- list(
Gene1 = c("Gene10", "Gene20", "Gene30"),
Gene2 = c("Gene10", "Gene15", "Gene25"),
Gene3 = c("Gene20", "Gene30", "Gene40")
)
# For demo: create simple TF dictionary
TFdict <- lapply(paste0("Gene", 1:100), function(g) {
sample(paste0("Gene", 101:200), size = sample(3:8, 1))
})
names(TFdict) <- paste0("Gene", 1:100)# Use Seurat's PCA or compute new
oracle$perform_PCA(
n_components = 50,
use_seurat_pca = TRUE # Use existing PCA from Seurat
)
# KNN imputation for noise reduction (optional but recommended)
oracle$knn_imputation(
k = 30, # Number of neighbors (NULL for auto)
n_pca_dims = 50 # PCA dimensions for neighbor search
)This is the core step - fitting regulatory coefficients using Ridge regression:
oracle$fit_grn_for_simulation(
GRN_unit = "cluster", # Fit separate GRN per cluster
alpha = 10, # Ridge regularization (1-100)
bagging_number = 200, # Bootstrap iterations
verbose = TRUE
)Key Parameters:
| Parameter | Description | Recommended |
|---|---|---|
GRN_unit |
“cluster” or “whole” | “cluster” |
alpha |
Regularization strength | 10 |
bagging_number |
Bootstrap iterations | 200 |
Now we can simulate the effects of gene perturbations:
# Define perturbation: Gene10 knockout (expression = 0)
perturb_condition <- list(Gene10 = 0)
# Or use helper function
perturb_condition <- create_perturb_condition(
genes = "Gene10",
expression = 0
)
# Run simulation
oracle$simulate_shift(
perturb_condition = perturb_condition,
n_propagation = 3 # Signal propagation steps (1-5)
)
# View summary
simulation_summary(oracle)Perturbation Types:
# Knockout (KO): Set expression to 0
perturb_ko <- list(Gene10 = 0)
# Overexpression (OE): Set to maximum observed value
max_expr <- max(oracle$adata$layers$imputed_count["Gene10", ])
perturb_oe <- list(Gene10 = max_expr)
# Multiple perturbations
perturb_multi <- list(
Gene10 = 0, # KO Gene10
Gene20 = max_expr # OE Gene20
)# Compute transition probabilities
oracle$estimate_transition_prob(
k = 200, # KNN neighbors
sampled_fraction = 1, # Cell sampling fraction
calculate_randomized = TRUE
)
# Calculate embedding shifts
oracle$calculate_embedding_shift()
# Calculate grid arrows for visualization
oracle$calculate_grid_arrows(
n_grid = 40,
n_neighbors = 100,
smooth = 0.5
)The main visualization showing predicted cell state transitions:
For detailed network characterization:
# Get Links object
links <- oracle$get_links(
cluster_name_for_GRN_unit = "0", # Cluster name
alpha = 10,
bagging_number = 200
)
# Filter by significance
links$filter(
threshold_p = 0.001,
threshold_coef = 0.1
)
# Calculate network metrics
links$get_network_score()
# View network summary
print(links)# List of genes to perturb
genes_to_test <- c("Gene10", "Gene20", "Gene30")
# Run systematic analysis
plots <- list()
for (gene in genes_to_test) {
# Create copy to avoid overwriting
oracle_test <- oracle$clone(deep = TRUE)
# Simulate knockout
oracle_test$simulate_shift(
perturb_condition = list(gene = 0),
n_propagation = 3
)
# Calculate transitions
oracle_test$estimate_transition_prob()
oracle_test$calculate_embedding_shift()
oracle_test$calculate_grid_arrows()
# Store plot
plots[[gene]] <- plot_simulation_flow(oracle_test, title = paste(gene, "KO"))
}
# Combine plots
library(patchwork)
wrap_plots(plots, ncol = 2)You’ve learned how to:
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] patchwork_1.3.2 Matrix_1.7-5 ggplot2_4.0.3 rmarkdown_2.31
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.6 jsonlite_2.0.0 dplyr_1.2.1 compiler_4.6.0
#> [5] tidyselect_1.2.1 jquerylib_0.1.4 splines_4.6.0 scales_1.4.0
#> [9] yaml_2.3.12 fastmap_1.2.0 lattice_0.22-9 R6_2.6.1
#> [13] labeling_0.4.3 generics_0.1.4 knitr_1.51 tibble_3.3.1
#> [17] maketools_1.3.2 bslib_0.11.0 pillar_1.11.1 RColorBrewer_1.1-3
#> [21] rlang_1.2.0 cachem_1.1.0 xfun_0.57 sass_0.4.10
#> [25] sys_3.4.3 S7_0.2.2 otel_0.2.0 viridisLite_0.4.3
#> [29] cli_3.6.6 mgcv_1.9-4 withr_3.0.2 magrittr_2.0.5
#> [33] digest_0.6.39 grid_4.6.0 nlme_3.1-169 lifecycle_1.0.5
#> [37] vctrs_0.7.3 evaluate_1.0.5 glue_1.8.1 farver_2.1.2
#> [41] buildtools_1.0.0 tools_4.6.0 pkgconfig_2.0.3 htmltools_0.5.9