--- title: "Building Custom References" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > %\VignetteIndexEntry{Building Custom References} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, fig.align = "center" ) ``` ## Introduction While CellProgramMapper provides pre-built references for common cell types, you may want to create custom references for: - Novel cell types or tissues - Specific biological contexts - Combining multiple datasets - Custom gene expression programs This vignette covers the process of building and using custom references. ## Reference Format A reference consists of a **spectra matrix** $\mathbf{H}$ where: - Rows: Gene expression programs (GEPs) - Columns: Genes - Values: Non-negative weights representing gene contributions ### File Format CellProgramMapper accepts references in tab-separated format (TSV): ``` Gene GEP1 GEP2 GEP3 CD3D 0.023 0.001 0.012 CD8A 0.045 0.002 0.008 ... ``` Or as an R matrix/data.frame object. ## Method 1: From cNMF Results ### Running cNMF First, run cNMF (consensus Non-negative Matrix Factorization) on your reference data using Python: ```python # In Python from cnmf import cNMF import scanpy as sc # Load data adata = sc.read_h5ad("reference_data.h5ad") # Initialize cNMF cnmf_obj = cNMF(output_dir='./cnmf_output', name='my_reference') # Prepare data cnmf_obj.prepare(counts_fn=adata, components=np.arange(5, 25), n_iter=200, seed=42) # Run factorization cnmf_obj.factorize(worker_i=0, total_workers=1) # Compute consensus cnmf_obj.consensus(k=15, density_threshold=0.1) ``` ### Loading cNMF Results ```{r load-cnmf, eval=FALSE} library(CellProgramMapper) # Load from cNMF output directory spectra <- load_cnmf_spectra( cnmf_dir = "./cnmf_output", cnmf_name = "my_reference", k = 15 # Number of programs ) # Use as reference result <- CellProgramMapper( query = seurat_obj, reference = spectra ) ``` ## Method 2: Building Consensus Reference When you have cNMF results from multiple datasets, use `BuildConsensusReference` to create a unified reference: ```{r consensus, eval=FALSE} # Initialize builder builder <- BuildConsensusReference( output_dir = "./consensus_output", name = "my_consensus_reference" ) # Add cNMF results from multiple datasets builder$add_cnmf_result( cnmf_dir = "./dataset1/cnmf_output", cnmf_name = "dataset1", k = 15 ) builder$add_cnmf_result( cnmf_dir = "./dataset2/cnmf_output", cnmf_name = "dataset2", k = 12 ) # Compute correlations between all GEPs builder$compute_gep_correlations() # Cluster GEPs into consensus programs builder$cluster_geps( correlation_threshold = 0.5, min_cluster_size = 2 ) # Get final consensus spectra consensus_spectra <- builder$get_consensus_spectra() ``` ## Method 3: Manual Construction ### From Known Gene Signatures ```{r manual-sig, eval=TRUE} library(CellProgramMapper) # Define gene signatures signatures <- list( Exhaustion = c("PDCD1", "LAG3", "HAVCR2", "TIGIT", "CTLA4"), Cytotoxicity = c("GZMA", "GZMB", "PRF1", "GNLY", "NKG7"), Proliferation = c("MKI67", "TOP2A", "PCNA", "CDK1", "CCNB1"), Memory = c("IL7R", "TCF7", "LEF1", "CCR7", "SELL") ) # Create binary spectra all_genes <- unique(unlist(signatures)) spectra <- matrix(0, nrow = length(signatures), ncol = length(all_genes)) rownames(spectra) <- names(signatures) colnames(spectra) <- all_genes for (i in seq_along(signatures)) { spectra[i, signatures[[i]]] <- 1 } # View spectra print(spectra) ``` ### From Expression Data ```{r manual-expr, eval=TRUE} # Simulate expression data for demonstration set.seed(42) n_cells <- 200 n_genes <- 50 # Create expression matrix expression <- matrix(rpois(n_cells * n_genes, lambda = 5), nrow = n_cells, ncol = n_genes) colnames(expression) <- paste0("Gene", 1:n_genes) rownames(expression) <- paste0("Cell", 1:n_cells) # Define cell type labels cell_types <- rep(c("TypeA", "TypeB", "TypeC", "TypeD"), each = 50) # Compute mean expression per cell type spectra <- do.call(rbind, lapply(unique(cell_types), function(ct) { cells <- which(cell_types == ct) colMeans(expression[cells, ]) })) rownames(spectra) <- unique(cell_types) # Ensure non-negativity spectra[spectra < 0] <- 0 print(dim(spectra)) ``` ## Validating Custom References ### Quality Checks ```{r validate, eval=TRUE} validate_reference <- function(spectra) { checks <- list() # Check 1: Non-negativity checks$non_negative <- all(spectra >= 0) # Check 2: No all-zero programs checks$no_zero_programs <- all(rowSums(spectra) > 0) # Check 3: No all-zero genes checks$no_zero_genes <- all(colSums(spectra) > 0) # Check 4: Reasonable number of programs checks$reasonable_k <- nrow(spectra) >= 2 && nrow(spectra) <= 100 # Check 5: Sufficient genes checks$sufficient_genes <- ncol(spectra) >= 100 # Report cat("Reference validation:\n") for (check_name in names(checks)) { status <- if (checks[[check_name]]) "PASS" else "FAIL" cat(sprintf(" %s: %s\n", check_name, status)) } return(all(unlist(checks))) } # Note: Our demo spectra has fewer genes than recommended # validate_reference(spectra) ``` ### Visualize Reference ```{r viz-ref, eval=TRUE, fig.cap="Visualization of reference spectra"} # Heatmap of spectra if (!requireNamespace("pheatmap", quietly = TRUE)) { install.packages("pheatmap") } # For demonstration, use the signature-based spectra library(pheatmap) pheatmap( spectra, cluster_rows = TRUE, cluster_cols = TRUE, show_colnames = TRUE, main = "Custom Reference Spectra", color = colorRampPalette(c("white", "#08306b"))(100) ) ``` ## Saving References ### As TSV File ```r # Save spectra matrix write.table(spectra, file = "my_reference.tsv", sep = "\t", quote = FALSE, row.names = TRUE) ``` ### With Metadata ```r # Save with additional information reference_data <- list( spectra = spectra, metadata = list( name = "My Custom Reference", version = "1.0", description = "Reference for XYZ cell types", species = "Homo sapiens", source_datasets = c("Dataset1", "Dataset2"), date_created = Sys.Date() ) ) saveRDS(reference_data, "my_reference.rds") ``` ## Using Custom References ```{r use-custom, eval=FALSE} # Method 1: From file path result <- CellProgramMapper( query = seurat_obj, reference = "path/to/my_reference.tsv" ) # Method 2: From matrix object result <- CellProgramMapper( query = seurat_obj, reference = spectra # Your spectra matrix ) # Method 3: From RDS file ref_data <- readRDS("my_reference.rds") result <- CellProgramMapper( query = seurat_obj, reference = ref_data$spectra ) ``` ## Best Practices ### Reference Construction 1. **Use diverse data**: Include cells from multiple donors/conditions 2. **Quality control**: Remove low-quality cells before NMF 3. **Appropriate k**: Choose number of programs based on biological complexity 4. **Gene selection**: Use highly variable genes (1000-5000) ### Gene Overlap ```{r gene-overlap, eval=TRUE, fig.cap="Checking gene overlap between query and reference"} # Simulate query genes query_genes <- c(paste0("Gene", 1:40), paste0("OtherGene", 1:20)) ref_genes <- colnames(spectra) # Compute overlap overlap <- intersect(query_genes, ref_genes) query_only <- setdiff(query_genes, ref_genes) ref_only <- setdiff(ref_genes, query_genes) cat(sprintf("Query genes: %d\n", length(query_genes))) cat(sprintf("Reference genes: %d\n", length(ref_genes))) cat(sprintf("Overlap: %d (%.1f%%)\n", length(overlap), 100 * length(overlap) / length(ref_genes))) # Visualize par(mar = c(2, 2, 2, 2)) venn_counts <- c( "Query only" = length(query_only), "Reference only" = length(ref_only), "Overlap" = length(overlap) ) barplot(venn_counts, col = c("#e41a1c", "#377eb8", "#4daf4a"), main = "Gene Overlap", ylab = "Number of genes") ``` ### Recommendations | Aspect | Recommendation | |--------|----------------| | Gene overlap | >80% of reference genes | | Number of programs | 5-30 for most applications | | Reference size | >1000 cells recommended | | Data normalization | Use same method for ref and query | ## Troubleshooting ### Low Gene Overlap ```r # Check overlap ref_genes <- colnames(spectra) query_genes <- rownames(GetAssayData(seurat_obj)) overlap <- intersect(query_genes, ref_genes) if (length(overlap) / length(ref_genes) < 0.5) { warning("Low gene overlap - check gene naming conventions") # Try converting gene symbols # e.g., from ENSEMBL to symbol, or uppercase/lowercase } ``` ### All-Zero Usage If all usage values are zero: 1. Check gene overlap (see above) 2. Verify data is non-negative 3. Check for extreme sparsity ### High Reconstruction Error ```r # Compute reconstruction error reconstruct <- usage %*% spectra[, overlap] original <- query_data[, overlap] error <- mean((original - reconstruct)^2) if (error > threshold) { # Consider: # 1. Increasing number of programs # 2. Using tissue-specific reference # 3. Checking data quality } ``` ## Session Info ```{r session} sessionInfo() ```