This vignette covers advanced features of darwin for power users, including custom objective functions, fixed gene count mode, parallel computing, and integration with other tools.
# Build the reference matrix: one row per cell type, one column per gene,
# filled with absolute values of normal draws (mean 2, sd 1).
n_ct <- 5
n_genes <- 400
reference <- matrix(
  abs(rnorm(n_ct * n_genes, 2, 1)),
  nrow = n_ct,
  ncol = n_genes,
  dimnames = list(
    paste0("CellType", seq_len(n_ct)),
    paste0("Gene", seq_len(n_genes))
  )
)
# Add markers
for (i in 1:n_ct) {
reference[i, ((i-1)*20+1):(i*20)] <- reference[i, ((i-1)*20+1):(i*20)] + 4
}

darwin allows you to define custom objective functions for specialized applications.
A valid objective function must take the data matrix as its argument and return a single numeric value, as in this example:
# Example custom objective: reward marker specificity.
# For each column (a gene, in the reference layout used here) take the ratio
# of its largest value to its mean, then add the ratios up. Columns dominated
# by one large value contribute the most.
marker_score <- function(data) {
  peak <- apply(data, 2, max)
  avg <- colMeans(data)
  # Swap exact-zero means for a tiny constant so the division stays finite
  avg <- replace(avg, avg == 0, 1e-10)
  sum(peak / avg)
}
cat("Marker score value:", marker_score(test_data), "\n")
#> Marker score value: 107.8375

dw_custom <- darwin(reference)
# Run a 60-generation optimization with two objectives: the built-in name
# "correlation" and the object `variance_objective`, one weight per objective.
# NOTE(review): `variance_objective` is not defined in the visible part of this
# document — confirm it is created in an earlier chunk.
dw_custom$optimize(
ngen = 60,
objectives = c("correlation", variance_objective),
weights = c(-1, 1), # Minimize correlation, maximize variance
verbose = FALSE,
parallel = FALSE
)
# The second objective column will show variance values
# Rename the fitness columns so the plot below can refer to them by name.
fitness <- dw_custom$get_fitness()
colnames(fitness) <- c("Correlation", "Variance")
ggplot(as.data.frame(fitness), aes(x = Correlation, y = Variance)) +
geom_point(color = "#3498db", size = 3, alpha = 0.7) +
geom_line(color = "gray60", alpha = 0.5) +
labs(
title = "Custom Objectives: Correlation vs Variance",
subtitle = "Minimize correlation, maximize variance",
x = "Correlation (lower is better)",
y = "Variance (higher is better)"
) +
theme_minimal(base_size = 12)

Pareto front with custom objectives: correlation vs variance.
# Three-objective run on a fresh darwin object: correlation, distance and
# condition, with one weight per objective.
dw_three <- darwin(reference)
dw_three$optimize(
ngen = 60,
objectives = c("correlation", "distance", "condition"),
weights = c(-1, 1, -1), # Minimize corr/cond, maximize distance
verbose = FALSE,
parallel = FALSE
)
# Summarise the result: number of solutions, then the range of each column.
# NOTE(review): columns 1-3 are assumed to follow the order of `objectives`
# above — confirm against get_fitness().
fitness3 <- dw_three$get_fitness()
cat("Three-objective optimization:\n")
#> Three-objective optimization:
cat(" Solutions:", nrow(fitness3), "\n")
#> Solutions: 314
cat(" Correlation range:", round(range(fitness3[,1]), 2), "\n")
#> Correlation range: 0.21 0.79
cat(" Distance range:", round(range(fitness3[,2]), 2), "\n")
#> Distance range: 143.43 363.86
cat(" Condition range:", round(range(fitness3[,3]), 2), "\n")
#> Condition range: 3.93 4.27

For applications requiring a specific number of marker genes, use fixed mode:
# Fixed mode: request solutions with exactly `n_features` selected genes.
dw_fixed <- darwin(reference)
dw_fixed$optimize(
  ngen = 60,
  mode = "fixed",
  n_features = 50, # Select exactly 50 genes
  objectives = c("correlation", "distance"),
  weights = c(-1, 1),
  verbose = FALSE,
  parallel = FALSE
)

# Verify all solutions have 50 genes.
# vapply() pins the result to one number per solution; sapply() would
# silently change its return type on unexpected input.
pareto <- dw_fixed$get_pareto()
gene_counts <- vapply(pareto, sum, numeric(1))
cat("Gene counts in fixed mode:", unique(gene_counts), "\n")
#> Gene counts in fixed mode: 50

# Compare gene count distributions: the earlier unconstrained run
# (`dw_custom`) against the fixed-mode run, stacked into one long data frame.
df_compare <- rbind(
  data.frame(
    mode = "Standard",
    n_genes = vapply(dw_custom$get_pareto(), sum, numeric(1))
  ),
  data.frame(
    mode = "Fixed (50)",
    n_genes = gene_counts
  )
)
ggplot(df_compare, aes(x = n_genes, fill = mode)) +
geom_histogram(bins = 20, alpha = 0.7, position = "identity") +
scale_fill_manual(values = c("#3498db", "#e74c3c")) +
labs(
title = "Gene Count Distribution: Standard vs Fixed Mode",
x = "Number of Selected Genes",
y = "Frequency"
) +
theme_minimal(base_size = 12)

Fixed mode ensures all solutions have exactly the specified number of genes.
darwin provides flexible selection from the Pareto front:
# Optimize once, then pick different solutions from that single result by
# changing only the selection weights.
dw <- darwin(reference)
dw$optimize(ngen = 50, verbose = FALSE, parallel = FALSE)
# Emphasize minimizing correlation
dw$select(weights = c(-2, 1))
cat("Emphasize correlation - genes:", sum(dw$get_selection()), "\n")
#> Emphasize correlation - genes: 388
# Emphasize maximizing distance
dw$select(weights = c(-1, 2))
cat("Emphasize distance - genes:", sum(dw$get_selection()), "\n")
#> Emphasize distance - genes: 388

# Select by direct index
# A single index picks one solution directly.
dw$select(index = 1)
cat("Solution 1 - genes:", sum(dw$get_selection()), "\n")
#> Solution 1 - genes: 400
# Select by objective rank
# (objective_index, rank) - rank 1 = best for that objective
# NOTE(review): the second call below uses rank -1 and labels it "best
# distance", which does not match "rank 1 = best" above — confirm how ranks
# are ordered for a maximized objective.
dw$select(index = c(1, 1)) # Best correlation
cat("Best correlation - genes:", sum(dw$get_selection()), "\n")
#> Best correlation - genes: 326
dw$select(index = c(2, -1)) # Best distance (last rank)
cat("Best distance - genes:", sum(dw$get_selection()), "\n")
#> Best distance - genes: 400

darwin supports parallel computation for faster optimization:
# Enable parallel computing
# options() changes the setting for the rest of the R session until it is
# set again.
options(darwin.parallel = TRUE)
# Or specify in optimize()
# Here the choice is passed as an argument to this one call.
dw$optimize(
ngen = 100,
parallel = TRUE, # Uses all available cores - 1
verbose = TRUE
)
# Disable parallel computing
options(darwin.parallel = FALSE)

# Benchmark (example, not run)
library(microbenchmark)
# Large dataset
large_ref <- matrix(rnorm(50 * 2000), nrow = 50)
microbenchmark(
serial = {
dw <- darwin(large_ref)
dw$optimize(ngen = 10, parallel = FALSE, verbose = FALSE)
},
parallel = {
dw <- darwin(large_ref)
dw$optimize(ngen = 10, parallel = TRUE, verbose = FALSE)
},
times = 3
)

Larger populations provide more diversity but converge more slowly:
# Run the optimizer once per population size and keep each run's fitness
# table, tagged with the population size that produced it.
pop_sizes <- c(30, 60, 100)
results <- lapply(pop_sizes, function(pop) {
  dw_temp <- darwin(reference)
  dw_temp$optimize(
    ngen = 40,
    pop_size = pop,
    verbose = FALSE,
    parallel = FALSE
  )
  data.frame(dw_temp$get_fitness(), pop_size = factor(pop))
})
names(results) <- as.character(pop_sizes)

# Stack the per-run tables into one data frame for plotting.
df_pop <- do.call(rbind, results)
ggplot(df_pop, aes(x = correlation, y = distance, color = pop_size)) +
geom_point(alpha = 0.6) +
scale_color_brewer(palette = "Set1") +
labs(
title = "Effect of Population Size",
x = "Correlation",
y = "Distance",
color = "Population\nSize"
) +
theme_minimal(base_size = 12)

Effect of population size on Pareto front diversity.
Higher mutation enables more exploration:
# Run the optimizer once per mutation probability and record how many genes
# each Pareto solution selects.
results_mut <- list()
for (mut in c(0.001, 0.01, 0.05)) {
  dw_temp <- darwin(reference)
  dw_temp$optimize(
    ngen = 40,
    mutation_prob = mut,
    verbose = FALSE,
    parallel = FALSE
  )
  # Note: `pareto` and `gene_counts` are top-level variables, so each pass
  # overwrites any earlier values stored under these names.
  pareto <- dw_temp$get_pareto()
  # vapply() guarantees one number per solution; sapply() would silently
  # change its return type on unexpected input.
  gene_counts <- vapply(pareto, sum, numeric(1))
  results_mut[[as.character(mut)]] <- data.frame(
    n_genes = gene_counts,
    mutation = factor(mut)
  )
}

# Stack the per-run tables into one data frame for plotting.
df_mut <- do.call(rbind, results_mut)
ggplot(df_mut, aes(x = n_genes, fill = mutation)) +
geom_density(alpha = 0.5) +
scale_fill_brewer(palette = "Set1") +
labs(
title = "Effect of Mutation Probability on Gene Count",
x = "Number of Selected Genes",
y = "Density",
fill = "Mutation\nProbability"
) +
theme_minimal(base_size = 12)

Effect of mutation probability on solution diversity.
darwin objects can be saved and loaded for reproducibility:
# Save
# tempfile() only returns a path; the save call below is what is expected to
# create the file.
temp_file <- tempfile(fileext = ".rds")
dw$save(temp_file)
# Load
# NOTE(review): reading back with readRDS() assumes dw$save() writes a plain
# RDS file — confirm against the save() implementation.
dw_loaded <- readRDS(temp_file)
# Verify
# The original and the reloaded object should report the same Pareto size.
cat("Original Pareto size:", length(dw$get_pareto()), "\n")
#> Original Pareto size: 99
cat("Loaded Pareto size:", length(dw_loaded$get_pareto()), "\n")
#> Loaded Pareto size: 99
# Clean up
unlink(temp_file)

- ngen, pop_size, or enable parallel = TRUE
- ngen or pop_size
- mutation_prob
- use_highly_variable = TRUE

# Check optimization quality
# Summarise the current result of `dw`: how many Pareto solutions there are,
# how wide the first two fitness columns spread, and the smallest and largest
# number of selected genes.
fitness <- dw$get_fitness()
cat("Diagnostic Summary:\n")
#> Diagnostic Summary:
cat(" Pareto front size:", nrow(fitness), "\n")
#> Pareto front size: 99
cat(" Correlation range:", round(diff(range(fitness[, 1])), 3), "\n")
#> Correlation range: 0.149
cat(" Distance range:", round(diff(range(fitness[, 2])), 3), "\n")
#> Distance range: 47.774
# vapply() guarantees one number per solution; sapply() would silently
# change its return type on unexpected input.
cat(" Gene count range:", range(vapply(dw$get_pareto(), sum, numeric(1))), "\n")
#> Gene count range: 326 400

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ggplot2_4.0.3 darwin_1.0.0 rmarkdown_2.31
#>
#> loaded via a namespace (and not attached):
#> [1] Matrix_1.7-5 future.apply_1.20.2 gtable_0.3.6
#> [4] jsonlite_2.0.0 dplyr_1.2.1 compiler_4.6.0
#> [7] Rcpp_1.1.1-1.1 tidyselect_1.2.1 parallel_4.6.0
#> [10] jquerylib_0.1.4 globals_0.19.1 scales_1.4.0
#> [13] yaml_2.3.12 fastmap_1.2.0 lattice_0.22-9
#> [16] R6_2.6.1 labeling_0.4.3 generics_0.1.4
#> [19] knitr_1.51 future_1.70.0 tibble_3.3.1
#> [22] maketools_1.3.2 bslib_0.11.0 pillar_1.11.1
#> [25] RColorBrewer_1.1-3 rlang_1.2.0 cachem_1.1.0
#> [28] xfun_0.57 sass_0.4.10 sys_3.4.3
#> [31] S7_0.2.2 otel_0.2.0 cli_3.6.6
#> [34] withr_3.0.2 magrittr_2.0.5 digest_0.6.39
#> [37] grid_4.6.0 lifecycle_1.0.5 vctrs_0.7.3
#> [40] evaluate_1.0.5 glue_1.8.1 listenv_0.10.1
#> [43] farver_2.1.2 codetools_0.2-20 buildtools_1.0.0
#> [46] parallelly_1.47.0 tools_4.6.0 pkgconfig_2.0.3
#> [49] htmltools_0.5.9