---
title: "Visualization Guide"
author: "Zaoqu Liu"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 3
    fig_caption: true
vignette: >
  %\VignetteIndexEntry{Visualization Guide}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Global chunk defaults: collapse source and output, silence warnings/messages.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE
)
```

## Introduction

This vignette demonstrates various visualization techniques for analyzing darwin optimization results. Effective visualization is crucial for understanding the trade-offs in multi-objective optimization and selecting appropriate solutions.

## Setup

```{r load-packages}
library(darwin)
library(ggplot2)

# Set seed for reproducibility
set.seed(42)
```

## Prepare Example Data

```{r create-data}
# Create reference expression matrix (rows = cell types, columns = genes)
n_celltypes <- 6
n_genes <- 300

reference <- matrix(
  abs(rnorm(n_celltypes * n_genes, mean = 2)),
  nrow = n_celltypes,
  ncol = n_genes
)
rownames(reference) <- c(
  "B_cells", "T_cells", "NK_cells",
  "Monocytes", "Dendritic", "Neutrophils"
)
colnames(reference) <- paste0("Gene", seq_len(n_genes))

# Add cell-type specific markers: cell type i gets a boost on its own
# block of 15 consecutive genes.
for (i in seq_len(n_celltypes)) {
  markers <- ((i - 1) * 15 + 1):(i * 15)
  reference[i, markers] <- reference[i, markers] + runif(15, 3, 6)
}

# Run optimization
# NOTE(review): weights = c(-1, 1) presumably means "minimise correlation,
# maximise distance", consistent with the axis labels used further down;
# confirm against the darwin documentation.
dw <- darwin(reference)
dw$optimize(
  ngen = 80,
  pop_size = 80,
  objectives = c("correlation", "distance"),
  weights = c(-1, 1),
  verbose = FALSE,
  parallel = FALSE
)
```

## Pareto Front Visualization

### Basic Pareto Plot

```{r pareto-basic, fig.cap="Basic Pareto front visualization showing the trade-off between objectives."}
dw$plot()
```

### Customized Pareto Plot

```{r pareto-custom, fig.cap="Customized Pareto front with different highlighted solution."}
# Highlight solution with best distance
dw$plot(
  index = c(2, -1), # Objective 2, last rank (highest distance)
  point_size = 4,
  highlight_size = 7
)
```

### Manual Pareto Plot with ggplot2

```{r pareto-ggplot, fig.cap="Fully customized Pareto front visualization."}
# Get fitness data
fitness <- dw$get_fitness()
pareto <- dw$get_pareto()

# Number of selected genes per solution.
# NOTE(review): assumes each element of `pareto` is a logical/0-1 gene mask,
# so that sum() counts the selected genes; confirm against get_pareto().
# vapply() keeps the result a numeric vector even for an empty Pareto set.
n_genes_per_solution <- vapply(pareto, sum, numeric(1))

# Create data frame (also reused by the comparison plot later on)
df <- data.frame(
  correlation = fitness$correlation,
  distance = fitness$distance,
  n_genes = n_genes_per_solution,
  solution_id = seq_len(nrow(fitness))
)

# Custom plot
ggplot(df, aes(x = correlation, y = distance)) +
  geom_point(aes(size = n_genes, color = n_genes), alpha = 0.7) +
  geom_line(color = "gray50", alpha = 0.5) +
  scale_color_viridis_c(option = "plasma") +
  scale_size_continuous(range = c(2, 8)) +
  labs(
    title = "Pareto Front Analysis",
    subtitle = paste(nrow(df), "Pareto-optimal solutions"),
    x = "Correlation (lower is better)",
    y = "Distance (higher is better)",
    color = "Number of\nGenes",
    size = "Number of\nGenes"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "right"
  )
```

## Gene Selection Analysis

### Gene Count Distribution

```{r gene-count, fig.cap="Distribution of selected gene counts across Pareto-optimal solutions."}
df_genes <- data.frame(n_genes = n_genes_per_solution)

# Computed once and reused for the reference line and its label.
median_genes <- median(n_genes_per_solution)

ggplot(df_genes, aes(x = n_genes)) +
  geom_histogram(bins = 20, fill = "#3498db", color = "white", alpha = 0.8) +
  geom_vline(
    xintercept = median_genes,
    color = "#e74c3c",
    linetype = "dashed",
    linewidth = 1
  ) +
  # y = Inf pins the label to the top of the panel; vjust pulls it inside.
  annotate(
    "text",
    x = median_genes + 5,
    y = Inf,
    label = paste("Median:", median_genes),
    vjust = 2,
    color = "#e74c3c"
  ) +
  labs(
    title = "Distribution of Selected Gene Counts",
    subtitle = "Across all Pareto-optimal solutions",
    x = "Number of Selected Genes",
    y = "Frequency"
  ) +
  theme_minimal(base_size = 12)
```

### Fitness vs Gene Count

```{r fitness-vs-genes, fig.cap="Relationship between number of genes and objective values."}
# Long format for faceting (one row per solution and objective)
df_long <- rbind(
  data.frame(
    n_genes = n_genes_per_solution,
    value = fitness$correlation,
    objective = "Correlation"
  ),
  data.frame(
    n_genes = n_genes_per_solution,
    value = fitness$distance,
    objective = "Distance"
  )
)

ggplot(df_long, aes(x = n_genes, y = value)) +
  geom_point(alpha = 0.6, color = "#3498db") +
  geom_smooth(method = "loess", se = TRUE, color = "#e74c3c") +
  facet_wrap(~objective, scales = "free_y") +
  labs(
    title = "Objectives vs Number of Selected Genes",
    x = "Number of Selected Genes",
    y = "Objective Value"
  ) +
  theme_minimal(base_size = 12)
```

## Expression Profile Visualization

### Heatmap of Selected Genes

```{r heatmap, fig.cap="Expression heatmap of selected marker genes."}
# Select a solution
dw$select(weights = c(-1, 1))
selection <- dw$get_selection()

# drop = FALSE keeps a matrix even if only a single gene is selected.
selected_data <- reference[, selection, drop = FALSE]

# For visualization, show top 50 most variable genes
gene_vars <- apply(selected_data, 2, var)
top_genes <- head(names(sort(gene_vars, decreasing = TRUE)), 50)
plot_data <- selected_data[, top_genes, drop = FALSE]

# Scale for visualization
# NOTE(review): scale() standardises columns, so t(scale(t(x))) z-scores each
# row, i.e. each cell type across genes. If per-gene scaling across cell types
# is what is wanted, this should be scale(plot_data) instead; confirm intent.
plot_data_scaled <- t(scale(t(plot_data)))

# Convert to long format. expand.grid() varies its first factor fastest,
# which matches the column-major order of as.vector() on the matrix.
df_heat <- expand.grid(
  CellType = rownames(plot_data_scaled),
  Gene = colnames(plot_data_scaled)
)
df_heat$Expression <- as.vector(plot_data_scaled)

ggplot(df_heat, aes(x = Gene, y = CellType, fill = Expression)) +
  geom_tile() +
  # Values outside [-3, 3] are squished to the limits instead of dropped.
  scale_fill_gradient2(
    low = "#3498db",
    mid = "white",
    high = "#e74c3c",
    midpoint = 0,
    limits = c(-3, 3),
    oob = scales::squish
  ) +
  labs(
    title = "Expression Heatmap of Selected Genes",
    subtitle = paste("Top", length(top_genes), "most variable genes"),
    x = "Genes",
    y = "Cell Type",
    fill = "Scaled\nExpression"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
```

### Cell Type Similarity

```{r similarity, fig.cap="Cell type similarity based on selected genes."}
# Compute correlation matrix (cor() works on columns, hence the transpose
# to correlate cell types with each other)
corr_selected <- cor(t(selected_data))

df_corr <- expand.grid(
  CT1 = rownames(corr_selected),
  CT2 = colnames(corr_selected)
)
df_corr$Correlation <- as.vector(corr_selected)

ggplot(df_corr, aes(x = CT1, y = CT2, fill = Correlation)) +
  geom_tile() +
  geom_text(aes(label = round(Correlation, 2)), size = 3) +
  scale_fill_gradient2(
    low = "#3498db",
    mid = "white",
    high = "#e74c3c",
    midpoint = 0,
    limits = c(-1, 1)
  ) +
  labs(
    title = "Cell Type Correlation Matrix",
    subtitle = "Based on selected marker genes",
    x = "",
    y = ""
  ) +
  theme_minimal(base_size = 12) +
  coord_fixed() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

## Solution Comparison

### Compare Different Selection Methods

```{r compare-methods, fig.cap="Comparing solutions from different selection methods."}
# Get multiple solutions. Each dw$select() call replaces the current
# selection, so it is read back with get_selection() before the next call.
dw$select(weights = c(-1, 1))
sol_weighted <- dw$get_selection()

dw$select(index = 1)
sol_first <- dw$get_selection()

dw$select(index = c(1, 1)) # Best correlation
sol_best_corr <- dw$get_selection()

dw$select(index = c(2, -1)) # Best distance
sol_best_dist <- dw$get_selection()

# Compare
comparison <- data.frame(
  Method = c("Weighted", "First", "Best Correlation", "Best Distance"),
  N_Genes = c(
    sum(sol_weighted),
    sum(sol_first),
    sum(sol_best_corr),
    sum(sol_best_dist)
  ),
  Correlation = c(
    compute_correlation(reference[, sol_weighted]),
    compute_correlation(reference[, sol_first]),
    compute_correlation(reference[, sol_best_corr]),
    compute_correlation(reference[, sol_best_dist])
  ),
  Distance = c(
    compute_distance(reference[, sol_weighted]),
    compute_distance(reference[, sol_first]),
    compute_distance(reference[, sol_best_corr]),
    compute_distance(reference[, sol_best_dist])
  )
)

knitr::kable(
  comparison,
  digits = 2,
  caption = "Comparison of different selection methods"
)
```

```{r compare-plot, fig.cap="Visual comparison of selection methods on the Pareto front."}
# Plot comparison
df_comp <- data.frame(
  Correlation = comparison$Correlation,
  Distance = comparison$Distance,
  Method = comparison$Method
)

# `df` is the Pareto-front data frame built in the pareto-ggplot chunk; it
# forms the gray background layer, with the compared solutions on top.
ggplot(df, aes(x = correlation, y = distance)) +
  geom_point(color = "gray70", alpha = 0.5, size = 2) +
  geom_line(color = "gray70", alpha = 0.5) +
  geom_point(
    data = df_comp,
    aes(x = Correlation, y = Distance, color = Method),
    size = 5
  ) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Solution Comparison on Pareto Front",
    x = "Correlation",
    y = "Distance"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")
```

## Advanced: 3D Pareto Front

For three objectives, we can visualize in 3D:

```{r 3d-example, eval=FALSE}
# Example with 3 objectives (not run)
dw3 <- darwin(reference)
dw3$optimize(
  ngen = 50,
  objectives = c("correlation", "distance", "condition"),
  weights = c(-1, 1, -1),
  verbose = FALSE
)

# Would use plotly for 3D visualization
# library(plotly)
# fitness3 <- dw3$get_fitness()
# plot_ly(fitness3, x = ~correlation, y = ~distance, z = ~condition,
#         type = "scatter3d", mode = "markers")
```

## Summary

Effective visualization of darwin results helps in:

1. **Understanding trade-offs** between objectives
2. **Comparing** different selection strategies
3. **Validating** that selected genes provide good cell type separation
4. **Communicating** results to collaborators

## Session Info

```{r session}
sessionInfo()
```