--- title: "SpaGER: Visualization and Analysis" author: "Zaoqu Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > %\VignetteIndexEntry{SpaGER: Visualization and Analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, message = FALSE, warning = FALSE ) ``` ## Introduction This vignette demonstrates visualization techniques for SpaGER predictions and analyses, helping you interpret and present your results effectively. ```{r load} library(SpaGER) set.seed(42) ``` ## Simulated Dataset We create a more realistic simulated dataset with biological structure: ```{r simulate} # Parameters n_rna <- 500 n_spatial <- 200 n_shared <- 80 n_predict <- 40 # Create cell type structure (3 cell types) cell_types <- rep(1:3, length.out = n_rna) type_means <- c(3, 5, 7) # scRNA-seq with cell type-specific expression rna_base <- matrix(0, nrow = n_rna, ncol = n_shared + n_predict) for (i in 1:n_rna) { rna_base[i, ] <- rnorm(n_shared + n_predict, mean = type_means[cell_types[i]], sd = 1) } rna_base <- abs(rna_base) colnames(rna_base) <- c(paste0("Shared", 1:n_shared), paste0("Novel", 1:n_predict)) # Spatial data - mixture of cell types spatial_types <- sample(1:3, n_spatial, replace = TRUE, prob = c(0.5, 0.3, 0.2)) spatial_base <- matrix(0, nrow = n_spatial, ncol = n_shared) for (i in 1:n_spatial) { spatial_base[i, ] <- rnorm(n_shared, mean = type_means[spatial_types[i]], sd = 1.2) } spatial_base <- abs(spatial_base) colnames(spatial_base) <- paste0("Shared", 1:n_shared) # Add spatial coordinates coords <- data.frame( x = runif(n_spatial, 0, 100), y = runif(n_spatial, 0, 100) ) rownames(coords) <- paste0("Spot", 1:n_spatial) rownames(spatial_base) <- rownames(coords) cat("Data created:\n") cat(" scRNA-seq:", n_rna, "cells,", ncol(rna_base), "genes\n") cat(" Spatial:", n_spatial, "spots,", ncol(spatial_base), "genes\n") ``` ## Run SpaGE Prediction ```{r predict} predicted <- SpaGE( spatial_data = as.data.frame(spatial_base), rna_data = as.data.frame(rna_base), n_pv = 30, n_neighbors = 50, verbose = FALSE ) cat("Predicted", ncol(predicted), "novel genes\n") ``` ## Visualizing Principal Vector Selection ```{r pv_vis, fig.width=8, fig.height=4} # Get PV information similarities <- attr(predicted, "similarities") n_pv_used <- attr(predicted, "n_pv_used") par(mfrow = c(1, 2)) # Bar plot of similarities cols <- ifelse(seq_along(similarities) <= n_pv_used, "#3498db", "#bdc3c7") barplot(similarities, names.arg = seq_along(similarities), col = cols, border = "white", main = "Principal Vector Selection", xlab = "PV Index", ylab = "Cosine Similarity", ylim = c(0, 1)) abline(h = 0.3, col = "#e74c3c", lty = 2, lwd = 2) legend("topright", legend = c("Selected", "Rejected", "Threshold (0.3)"), fill = c("#3498db", "#bdc3c7", NA), border = c("white", "white", NA), lty = c(NA, NA, 2), col = c(NA, NA, "#e74c3c")) # Cumulative variance cum_var <- cumsum(similarities^2) / sum(similarities^2) plot(cum_var, type = "b", pch = 19, col = "#2ecc71", main = "Cumulative Variance Captured", xlab = "Number of PVs", ylab = "Cumulative Proportion", xlim = c(1, length(similarities))) abline(v = n_pv_used, col = "#e74c3c", lty = 2) abline(h = cum_var[n_pv_used], col = "#e74c3c", lty = 2) text(n_pv_used + 1, cum_var[n_pv_used], sprintf("%.1f%%", cum_var[n_pv_used] * 100), pos = 4) par(mfrow = c(1, 1)) ``` ## Spatial Expression Patterns ```{r spatial_vis, fig.width=8, fig.height=6} # Function to plot spatial expression plot_spatial <- function(coords, values, title, col_palette = NULL) { if (is.null(col_palette)) { col_palette <- colorRampPalette(c("#313695", "#4575b4", "#74add1", "#abd9e9", "#ffffbf", "#fee090", "#fdae61", "#f46d43", "#d73027"))(100) } # Normalize values to [0, 1] val_norm <- (values - min(values)) / (max(values) - min(values) + 1e-10) cols <- col_palette[ceiling(val_norm * 99) + 1] plot(coords$x, coords$y, pch = 19, cex = 1.5, col = cols, xlab = "X", ylab = "Y", main = title, asp = 1) } # Plot predicted genes par(mfrow = c(2, 2), mar = c(4, 4, 3, 1)) for (i in 1:4) { gene <- colnames(predicted)[i] plot_spatial(coords, predicted[, gene], paste("Predicted:", gene)) } par(mfrow = c(1, 1)) ``` ## Cross-Validation Results ```{r cv_analysis, fig.width=8, fig.height=5} # Run CV cv_results <- SpaGE_cv( spatial_data = as.data.frame(spatial_base), rna_data = as.data.frame(rna_base[, paste0("Shared", 1:n_shared)]), n_pv = 20, genes = paste0("Shared", 1:20), verbose = FALSE ) par(mfrow = c(1, 2)) # Histogram hist(cv_results$correlation, breaks = 15, col = "#3498db", border = "white", main = "Cross-Validation Performance", xlab = "Spearman Correlation", xlim = c(-0.2, 1)) abline(v = median(cv_results$correlation), col = "#e74c3c", lwd = 2, lty = 2) abline(v = mean(cv_results$correlation), col = "#2ecc71", lwd = 2, lty = 2) legend("topleft", legend = c(paste("Median:", round(median(cv_results$correlation), 3)), paste("Mean:", round(mean(cv_results$correlation), 3))), col = c("#e74c3c", "#2ecc71"), lty = 2, lwd = 2) # Per-gene plot cv_ordered <- cv_results[order(cv_results$correlation, decreasing = TRUE), ] barplot(cv_ordered$correlation, names.arg = substr(cv_ordered$gene, 7, 10), las = 2, col = ifelse(cv_ordered$correlation > 0.3, "#2ecc71", "#e74c3c"), border = "white", main = "Per-Gene Correlation", ylab = "Spearman Correlation") abline(h = 0.3, lty = 2) par(mfrow = c(1, 1)) ``` ## Measured vs Predicted Scatter Plots ```{r scatter, fig.width=8, fig.height=4} # For CV genes, compare measured vs predicted par(mfrow = c(1, 3)) for (i in 1:3) { gene <- cv_results$gene[i] # Get measured values measured <- spatial_base[, gene] # Run prediction for this gene spatial_minus <- spatial_base[, setdiff(colnames(spatial_base), gene)] pred_single <- SpaGE( spatial_data = as.data.frame(spatial_minus), rna_data = as.data.frame(rna_base[, paste0("Shared", 1:n_shared)]), n_pv = 20, genes_to_predict = gene, verbose = FALSE ) predicted_vals <- pred_single[, gene] cor_val <- cor(measured, predicted_vals, method = "spearman") plot(measured, predicted_vals, pch = 19, col = adjustcolor("#3498db", 0.5), xlab = "Measured", ylab = "Predicted", main = sprintf("%s (r = %.3f)", gene, cor_val)) abline(lm(predicted_vals ~ measured), col = "#e74c3c", lwd = 2) # Add identity line abline(0, 1, lty = 2, col = "gray50") } par(mfrow = c(1, 1)) ``` ## Expression Distribution Comparison ```{r distribution, fig.width=8, fig.height=4} par(mfrow = c(1, 2)) # Density plot for predicted genes gene1 <- colnames(predicted)[1] gene2 <- colnames(predicted)[2] # Use density estimation d1 <- density(predicted[, gene1]) d2 <- density(predicted[, gene2]) plot(d1, col = "#3498db", lwd = 2, main = "Predicted Expression Distribution", xlab = "Expression Level", xlim = range(c(d1$x, d2$x)), ylim = range(c(d1$y, d2$y))) lines(d2, col = "#e74c3c", lwd = 2) legend("topright", legend = c(gene1, gene2), col = c("#3498db", "#e74c3c"), lwd = 2) # Box plot boxplot(predicted[, 1:10], col = "#3498db", border = "#2c3e50", main = "Predicted Expression (First 10 Genes)", xlab = "Gene", ylab = "Expression", las = 2, names = substr(colnames(predicted)[1:10], 1, 8)) par(mfrow = c(1, 1)) ``` ## Correlation Heatmap ```{r heatmap, fig.width=7, fig.height=6} # Compute correlation matrix for predicted genes pred_subset <- predicted[, 1:15] cor_mat <- cor(pred_subset, method = "spearman") # Simple heatmap heatmap(cor_mat, col = colorRampPalette(c("#3498db", "white", "#e74c3c"))(50), scale = "none", main = "Gene-Gene Correlation (Predicted)", margins = c(8, 8)) ``` ## Summary Statistics ```{r summary} cat("=== SpaGE Prediction Summary ===\n\n") cat("Input Data:\n") cat(sprintf(" Spatial spots: %d\n", nrow(spatial_base))) cat(sprintf(" scRNA-seq cells: %d\n", nrow(rna_base))) cat(sprintf(" Shared genes: %d\n", n_shared)) cat(sprintf(" Novel genes predicted: %d\n", ncol(predicted))) cat("\nPrincipal Vectors:\n") cat(sprintf(" Requested: %d\n", attr(predicted, "n_pv"))) cat(sprintf(" Used (sim > 0.3): %d\n", attr(predicted, "n_pv_used"))) cat(sprintf(" Mean similarity: %.3f\n", mean(attr(predicted, "similarities")))) cat("\nCross-Validation:\n") cat(sprintf(" Genes tested: %d\n", nrow(cv_results))) cat(sprintf(" Mean correlation: %.3f\n", mean(cv_results$correlation))) cat(sprintf(" Median correlation: %.3f\n", median(cv_results$correlation))) cat(sprintf(" Genes with r > 0.3: %d (%.1f%%)\n", sum(cv_results$correlation > 0.3), 100 * mean(cv_results$correlation > 0.3))) cat("\nPredicted Expression:\n") cat(sprintf(" Mean: %.3f\n", mean(as.matrix(predicted)))) cat(sprintf(" SD: %.3f\n", sd(as.matrix(predicted)))) cat(sprintf(" Range: [%.3f, %.3f]\n", min(predicted), max(predicted))) ``` ## Exporting Results ```{r export, eval=FALSE} # Export predicted expression write.csv(predicted, "spage_predictions.csv") # Export CV results write.csv(cv_results, "spage_cv_results.csv") # Export for visualization in other tools saveRDS(list( predictions = predicted, coordinates = coords, cv_results = cv_results, metadata = list( n_pv = attr(predicted, "n_pv"), n_pv_used = attr(predicted, "n_pv_used"), similarities = attr(predicted, "similarities") ) ), "spage_results.rds") ``` ## Session Information ```{r session} sessionInfo() ```