--- title: "Advanced Usage" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 fig_caption: true vignette: > %\VignetteIndexEntry{Advanced Usage} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE ) ``` ## Introduction This vignette covers advanced features of darwin for power users, including custom objective functions, fixed gene count mode, parallel computing, and integration with other tools. ```{r load} library(darwin) library(ggplot2) set.seed(42) ``` ## Prepare Data ```{r data} # Create reference matrix n_ct <- 5 n_genes <- 400 reference <- matrix(abs(rnorm(n_ct * n_genes, 2, 1)), nrow = n_ct, ncol = n_genes) rownames(reference) <- paste0("CellType", 1:n_ct) colnames(reference) <- paste0("Gene", 1:n_genes) # Add markers for (i in 1:n_ct) { reference[i, ((i-1)*20+1):(i*20)] <- reference[i, ((i-1)*20+1):(i*20)] + 4 } ``` ## Custom Objective Functions darwin allows you to define custom objective functions for specialized applications. ### Requirements A valid objective function must: 1. Accept a single argument: data matrix (cell types × genes) 2. Return a single numeric value 3. Be consistent (same input → same output) ### Example: Variance Objective ```{r custom-variance} # Custom objective: maximize total variance variance_objective <- function(data) { sum(apply(data, 2, var)) } # Test the function test_data <- reference[, 1:50] cat("Variance objective value:", variance_objective(test_data), "\n") ``` ### Example: Marker Score Objective ```{r custom-marker} # Custom objective: maximize marker specificity # Higher when genes are specific to individual cell types marker_score <- function(data) { # Max expression / mean expression ratio col_max <- apply(data, 2, max) col_mean <- colMeans(data) col_mean[col_mean == 0] <- 1e-10 sum(col_max / col_mean) } cat("Marker score value:", marker_score(test_data), "\n") ``` ### Using Custom Objectives ```{r use-custom, fig.cap="Pareto front with custom objectives: correlation vs variance."} dw_custom <- darwin(reference) dw_custom$optimize( ngen = 60, objectives = c("correlation", variance_objective), weights = c(-1, 1), # Minimize correlation, maximize variance verbose = FALSE, parallel = FALSE ) # The second objective column will show variance values fitness <- dw_custom$get_fitness() colnames(fitness) <- c("Correlation", "Variance") ggplot(as.data.frame(fitness), aes(x = Correlation, y = Variance)) + geom_point(color = "#3498db", size = 3, alpha = 0.7) + geom_line(color = "gray60", alpha = 0.5) + labs( title = "Custom Objectives: Correlation vs Variance", subtitle = "Minimize correlation, maximize variance", x = "Correlation (lower is better)", y = "Variance (higher is better)" ) + theme_minimal(base_size = 12) ``` ### Three Objectives ```{r three-obj} dw_three <- darwin(reference) dw_three$optimize( ngen = 60, objectives = c("correlation", "distance", "condition"), weights = c(-1, 1, -1), # Minimize corr/cond, maximize distance verbose = FALSE, parallel = FALSE ) fitness3 <- dw_three$get_fitness() cat("Three-objective optimization:\n") cat(" Solutions:", nrow(fitness3), "\n") cat(" Correlation range:", round(range(fitness3[,1]), 2), "\n") cat(" Distance range:", round(range(fitness3[,2]), 2), "\n") cat(" Condition range:", round(range(fitness3[,3]), 2), "\n") ``` ## Fixed Gene Count Mode For applications requiring a specific number of marker genes, use fixed mode: ```{r fixed-mode, fig.cap="Fixed mode ensures all solutions have exactly the specified number of genes."} dw_fixed <- darwin(reference) dw_fixed$optimize( ngen = 60, mode = "fixed", n_features = 50, # Select exactly 50 genes objectives = c("correlation", "distance"), weights = c(-1, 1), verbose = FALSE, parallel = FALSE ) # Verify all solutions have 50 genes pareto <- dw_fixed$get_pareto() gene_counts <- sapply(pareto, sum) cat("Gene counts in fixed mode:", unique(gene_counts), "\n") # Compare gene count distributions df_compare <- rbind( data.frame( mode = "Standard", n_genes = sapply(dw_custom$get_pareto(), sum) ), data.frame( mode = "Fixed (50)", n_genes = gene_counts ) ) ggplot(df_compare, aes(x = n_genes, fill = mode)) + geom_histogram(bins = 20, alpha = 0.7, position = "identity") + scale_fill_manual(values = c("#3498db", "#e74c3c")) + labs( title = "Gene Count Distribution: Standard vs Fixed Mode", x = "Number of Selected Genes", y = "Frequency" ) + theme_minimal(base_size = 12) ``` ## Selection Strategies darwin provides flexible selection from the Pareto front: ### 1. Weighted Selection ```{r select-weighted} dw <- darwin(reference) dw$optimize(ngen = 50, verbose = FALSE, parallel = FALSE) # Emphasize minimizing correlation dw$select(weights = c(-2, 1)) cat("Emphasize correlation - genes:", sum(dw$get_selection()), "\n") # Emphasize maximizing distance dw$select(weights = c(-1, 2)) cat("Emphasize distance - genes:", sum(dw$get_selection()), "\n") ``` ### 2. Index-Based Selection ```{r select-index} # Select by direct index dw$select(index = 1) cat("Solution 1 - genes:", sum(dw$get_selection()), "\n") # Select by objective rank # (objective_index, rank) - rank 1 = best for that objective dw$select(index = c(1, 1)) # Best correlation cat("Best correlation - genes:", sum(dw$get_selection()), "\n") dw$select(index = c(2, -1)) # Best distance (last rank) cat("Best distance - genes:", sum(dw$get_selection()), "\n") ``` ### 3. Target Value Selection ```{r select-target} # Select solution closest to target correlation value fitness <- dw$get_fitness() target_corr <- median(fitness$correlation) dw$select(close_to = c(1, target_corr)) cat("Target correlation", round(target_corr, 2), "- genes:", sum(dw$get_selection()), "\n") ``` ## Parallel Computing darwin supports parallel computation for faster optimization: ```{r parallel, eval=FALSE} # Enable parallel computing options(darwin.parallel = TRUE) # Or specify in optimize() dw$optimize( ngen = 100, parallel = TRUE, # Uses all available cores - 1 verbose = TRUE ) # Disable parallel computing options(darwin.parallel = FALSE) ``` ### Performance Comparison ```{r benchmark, eval=FALSE} # Benchmark (example, not run) library(microbenchmark) # Large dataset large_ref <- matrix(rnorm(50 * 2000), nrow = 50) microbenchmark( serial = { dw <- darwin(large_ref) dw$optimize(ngen = 10, parallel = FALSE, verbose = FALSE) }, parallel = { dw <- darwin(large_ref) dw$optimize(ngen = 10, parallel = TRUE, verbose = FALSE) }, times = 3 ) ``` ## Parameter Tuning ### Population Size Larger populations provide more diversity but slower convergence: ```{r pop-size, fig.cap="Effect of population size on Pareto front diversity."} results <- list() for (pop in c(30, 60, 100)) { dw_temp <- darwin(reference) dw_temp$optimize( ngen = 40, pop_size = pop, verbose = FALSE, parallel = FALSE ) results[[as.character(pop)]] <- data.frame( dw_temp$get_fitness(), pop_size = factor(pop) ) } df_pop <- do.call(rbind, results) ggplot(df_pop, aes(x = correlation, y = distance, color = pop_size)) + geom_point(alpha = 0.6) + scale_color_brewer(palette = "Set1") + labs( title = "Effect of Population Size", x = "Correlation", y = "Distance", color = "Population\nSize" ) + theme_minimal(base_size = 12) ``` ### Mutation Probability Higher mutation enables more exploration: ```{r mutation, fig.cap="Effect of mutation probability on solution diversity."} results_mut <- list() for (mut in c(0.001, 0.01, 0.05)) { dw_temp <- darwin(reference) dw_temp$optimize( ngen = 40, mutation_prob = mut, verbose = FALSE, parallel = FALSE ) pareto <- dw_temp$get_pareto() gene_counts <- sapply(pareto, sum) results_mut[[as.character(mut)]] <- data.frame( n_genes = gene_counts, mutation = factor(mut) ) } df_mut <- do.call(rbind, results_mut) ggplot(df_mut, aes(x = n_genes, fill = mutation)) + geom_density(alpha = 0.5) + scale_fill_brewer(palette = "Set1") + labs( title = "Effect of Mutation Probability on Gene Count", x = "Number of Selected Genes", y = "Density", fill = "Mutation\nProbability" ) + theme_minimal(base_size = 12) ``` ## Save and Load darwin objects can be saved and loaded for reproducibility: ```{r save-load} # Save temp_file <- tempfile(fileext = ".rds") dw$save(temp_file) # Load dw_loaded <- readRDS(temp_file) # Verify cat("Original Pareto size:", length(dw$get_pareto()), "\n") cat("Loaded Pareto size:", length(dw_loaded$get_pareto()), "\n") # Clean up unlink(temp_file) ``` ## Integration Examples ### With Seurat ```{r seurat-integration, eval=FALSE} library(Seurat) # From Seurat object with annotations dw <- darwin( seurat_obj, celltype_key = "cell_type", assay = "RNA", layer = "data", use_highly_variable = TRUE ) # Optimize and select dw$optimize(ngen = 100) dw$select() # Get marker genes markers <- dw$get_genes() ``` ### Export to Other Deconvolution Tools ```{r export, eval=FALSE} # Get selected genes for use with other tools genes <- dw$get_genes() selection <- dw$get_selection() # Create signature matrix for CIBERSORT signature_matrix <- reference[, selection] write.csv(signature_matrix, "signature_matrix.csv") # Or as gene list for MuSiC gene_list <- genes ``` ## Troubleshooting ### Common Issues 1. **Slow optimization**: Reduce `ngen`, `pop_size`, or enable `parallel = TRUE` 2. **Poor convergence**: Increase `ngen` or `pop_size` 3. **All solutions identical**: Increase `mutation_prob` 4. **Memory issues**: Use gene pre-selection with `use_highly_variable = TRUE` ### Diagnostic Checks ```{r diagnostics} # Check optimization quality fitness <- dw$get_fitness() cat("Diagnostic Summary:\n") cat(" Pareto front size:", nrow(fitness), "\n") cat(" Correlation range:", round(diff(range(fitness[,1])), 3), "\n") cat(" Distance range:", round(diff(range(fitness[,2])), 3), "\n") cat(" Gene count range:", range(sapply(dw$get_pareto(), sum)), "\n") ``` ## Session Info ```{r session} sessionInfo() ```