15. Marker Gene Method Comparison (Supplemental Figure S39) • recallreproducibility

suppressPackageStartupMessages({
library(ggplot2)
library(ggVennDiagram)

library(Seurat)})
library(countsplit)
library(ClusterDE)

We begin by defining several functions. First, the clusterDE workflow.

clusterDE_workflow <- function(seurat_obj, cluster1, cluster2, cores) {
  set.seed(1)
  
  original_markers <- FindMarkers(seurat_obj, 
                                  ident.1 = cluster1, 
                                  ident.2 = cluster2, 
                                  min.pct = 0, 
                                  logfc.threshold = 0)
  
  seurat_obj_sub <- subset(x = seurat_obj, idents = c(cluster1, cluster2))
  count_mat <- GetAssayData(object = seurat_obj_sub, slot = "counts")
  synthetic_null <- ClusterDE::constructNull(count_mat, nCores = cores, fastVersion = TRUE)
  
  seurat_obj_null <- CreateSeuratObject(counts = synthetic_null)
  seurat_obj_null <- NormalizeData(seurat_obj_null)
  seurat_obj_null <- FindVariableFeatures(object = seurat_obj_null)
  seurat_obj_null <- ScaleData(object = seurat_obj_null)
  seurat_obj_null <- RunPCA(object = seurat_obj_null)
  seurat_obj_null <- FindNeighbors(object = seurat_obj_null)
  
  resolution <- 0.1
  while (length(levels(Idents(seurat_obj_null))) == 1) {
    resolution <- resolution + 0.1
    seurat_obj_null <- FindClusters(object = seurat_obj_null, resolution = resolution)
    print("Num clusters")
    print(length(levels(Idents(seurat_obj_null))))
  }
  
  null_markers <- FindMarkers(seurat_obj_null, 
                              ident.1 = 0, 
                              ident.2 = 1, 
                              min.pct = 0, 
                              logfc.threshold = 0)
  
  original_pval <- original_markers$p_val
  names(original_pval) <- rownames(original_markers)
  
  null_pval <- null_markers$p_val
  names(null_pval) <- rownames(null_markers)
  
  res <- ClusterDE::callDE(original_pval, null_pval, nlogTrans = TRUE, FDR = 0.05)
  
  return(res)
}

Then we write a function for the countsplit workflow.

countsplit_workflow <- function(seurat_obj) {
  set.seed(1)
  
  counts_mat <- GetAssayData(seurat_obj, "RNA")
  
  split <- countsplit(counts_mat)
  Xtrain <- split[[1]]
  Xtest <- split[[2]]
  
  seurat_obj_train <- Seurat::CreateSeuratObject(counts = Xtrain)
  seurat_obj_test <- Seurat::CreateSeuratObject(counts = Xtest)
  
  
  # process training data
  seurat_obj_train <- Seurat::NormalizeData(seurat_obj_train,
                                            verbose = FALSE)
  
  seurat_obj_train <- Seurat::FindVariableFeatures(seurat_obj_train,
                                                   selection.method = "vst",
                                                   verbose = FALSE)
  
  seurat_obj_train <- Seurat::ScaleData(seurat_obj_train, verbose = FALSE)
  seurat_obj_train <- Seurat::RunPCA(seurat_obj_train,
                                     features = Seurat::VariableFeatures(object = seurat_obj_train),
                                     verbose = FALSE)
  
  seurat_obj_train <- Seurat::FindNeighbors(seurat_obj_train,
                                            verbose = FALSE)
  
  
  seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
                                            resolution = 0.8,
                                            verbose = FALSE)
  
  seurat_obj_train <- Seurat::RunUMAP(seurat_obj_train, dims=1:10)
  

  seurat_obj_test <- Seurat::NormalizeData(seurat_obj_test,
                                           verbose = FALSE)
  
  seurat_obj_test <- Seurat::FindVariableFeatures(seurat_obj_test,
                                                  selection.method = "vst",
                                                  verbose = FALSE)
  
  seurat_obj_test <- Seurat::ScaleData(seurat_obj_test, verbose = FALSE)
  
  seurat_obj_test <- Seurat::RunPCA(seurat_obj_test,
                                     features = Seurat::VariableFeatures(object = seurat_obj_test),
                                     verbose = FALSE)
  
  seurat_obj_test <- Seurat::FindNeighbors(seurat_obj_test,
                                            verbose = FALSE)
  
  seurat_obj_test <- Seurat::RunUMAP(seurat_obj_test, dims=1:10)
  
  Idents(seurat_obj_test) <- Idents(seurat_obj_train)
  seurat_obj_test@meta.data$cell_ontology_class <- seurat_obj@meta.data$cell_ontology_class
  
  seurat_obj_train@meta.data$cell_ontology_class <- seurat_obj@meta.data$cell_ontology_class
  
  
  return(list("seurat_obj_train"=seurat_obj_train, "seurat_obj_test"=seurat_obj_test))
}

We write function for plotting the UMAPs.

DE_comparisons_scatter_plots <- function(tissue, tissue_name, legend_pos=c(0.2, 0.2)) {  
  louvain_default <- custom_scatter(tissue, "umap", group_by = "seurat_clusters", x_title = "UMAP 1", y_title = "UMAP 2", pt.size = 2, label=FALSE) #+ Seurat::NoLegend()
  louvain_recall <- custom_scatter(tissue, "umap", group_by = "recall_idents", x_title = "UMAP 1", y_title = "UMAP 2", pt.size = 2) #+ Seurat::NoLegend() 
  louvain_countsplit <- custom_scatter(tissue, "umap", group_by = "countsplit_idents", x_title = "UMAP 1", y_title = "UMAP 2", pt.size = 2) #+ Seurat::NoLegend() 
  cell_ontology <- custom_scatter(tissue, "umap", group_by = "cell_ontology_class", x_title = "UMAP 1", y_title = "UMAP 2", pt.size = 2) + 
    ggplot2::theme(legend.position = legend_pos,
                   legend.text = ggplot2::element_text(size=20)) + 
    ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size=6), ncol = 1)) + 
    ggplot2::scale_colour_discrete(na.translate = F)
  
  column_label_1 <- patchwork::wrap_elements(panel = grid::textGrob('Cell Ontology', gp = grid::gpar(fontsize = 64)))
  column_label_2 <- patchwork::wrap_elements(panel = grid::textGrob('Seurat Default', gp = grid::gpar(fontsize = 64)))
  column_label_3 <- patchwork::wrap_elements(panel = grid::textGrob('recall+ZIP', gp = grid::gpar(fontsize = 64, fontfamily = "Courier")))
  column_label_4 <- patchwork::wrap_elements(panel = grid::textGrob('countsplit', gp = grid::gpar(fontsize = 64, fontfamily = "Courier")))
  
  
  umap_grid <- column_label_1 + column_label_2 + column_label_3 + column_label_4 +
    cell_ontology + louvain_default + louvain_recall + louvain_countsplit + 
    patchwork::plot_layout(widths = c(5, 5, 5, 5),
                           heights = c(1,3))
  
  return(umap_grid)
}

We write function for plotting the P-values.

compare_markers_p_vals <- function(default_markers_a, default_markers_b, xlabel, ylabel, title, xmax, ymax) {
  # sort genes alphabetically
  default_markers_a <- default_markers_a[ order(row.names(default_markers_a)), ]
  default_markers_b <- default_markers_b[ order(row.names(default_markers_b)), ]
  
  log_p_val_a <- -log(default_markers_a$p_val)
  log_p_val_b <- -log(default_markers_b$p_val)
  
  df <- data.frame(log_p_val_a, log_p_val_b)
  
  r_p_val <- cor(log_p_val_a, log_p_val_b)
  
  large_text_size <- 32
  small_text_size <- 24
  
  correlation_text_size <- 16
  
  p_value_scatterplot <- ggplot2::ggplot(df, ggplot2::aes(x = log_p_val_a, y = log_p_val_b)) + 
    ggplot2::geom_point() + 
    ggplot2::theme_bw() +
    xlim(0, xmax) +
    ylim(0, ymax) +
    ggplot2::xlab(xlabel) +
    ggplot2::ylab(ylabel) +
    ggplot2::annotate("text", x = 25, y = ymax * 0.82, label = paste0("r = ", round(r_p_val, 3)), size = correlation_text_size) + 
    ggplot2::theme(axis.text = ggplot2::element_text(size = small_text_size),
                   axis.title = ggplot2::element_text(size = large_text_size),
                   plot.title = ggplot2::element_text(size = large_text_size, hjust = 0.5)) +
    ggtitle(title)
  
  return(p_value_scatterplot)  
}

We write function for plotting the q-values.

compare_markers_q_vals <- function(default_markers_a, default_markers_b, xlabel, ylabel, title, xmax, ymax) {
  # convert clusterDE results tibble to dataframe
  default_markers_b <-  data.frame(default_markers_b)
  # make rownames the Genes
  rownames(default_markers_b) <- default_markers_b$Gene
  
  # sort genes alphabetically
  default_markers_a <- default_markers_a[ order(row.names(default_markers_a)), ]
  default_markers_b <- default_markers_b[ order(row.names(default_markers_b)), ]
  
  log_p_val_a <- -log(default_markers_a$q)
  log_p_val_b <- -log(default_markers_b$p_val)
  
  df <- data.frame(log_p_val_a, log_p_val_b)
  
  r_p_val <- cor(log_p_val_a, log_p_val_b)
  
  large_text_size <- 32
  small_text_size <- 24
  
  correlation_text_size <- 16
  
  p_value_scatterplot <- ggplot2::ggplot(df, ggplot2::aes(x = log_p_val_a, y = log_p_val_b)) + 
    ggplot2::geom_point() + 
    ggplot2::theme_bw() +
    xlim(0, xmax) +
    ylim(0, ymax) +
    ggplot2::xlab(xlabel) +
    ggplot2::ylab(ylabel) +
    ggplot2::annotate("text", x = 2, y = 340, label = paste0("r = ", round(r_p_val, 3)), size = correlation_text_size) + 
    ggplot2::theme(axis.text = ggplot2::element_text(size = small_text_size),
                   axis.title = ggplot2::element_text(size = large_text_size),
                   plot.title = ggplot2::element_text(size = large_text_size, hjust = 0.5)) +
    ggtitle(title)
  
  return(p_value_scatterplot)  
}

We write function for plotting the marker gene Venn diagram.

marker_gene_venn_diagram <- function(recall_marker_genes, countsplit_marker_genes, clusterDE_marker_genes) {
  p <- ggVennDiagram(list("recall+ZIP"=recall_marker_genes,
                          "countsplit"=countsplit_marker_genes,
                          "clusterDE"=clusterDE_marker_genes),
                     label = "count") +
    scale_fill_gradient(low = "white", high = "grey") + 
    guides(fill="none") + 
    scale_x_continuous(expand = expansion(mult = .2))
  
  return(p)
}

Now, we load the limb muscle tissue.

limb_muscle <- readRDS("Limb_Musclecluster_results_seurat.rds")
limb_muscle <- Seurat::ScaleData(limb_muscle, features = rownames(limb_muscle))

# make cell type labels a factor
limb_muscle@meta.data$cell_ontology_class <- as.factor(limb_muscle@meta.data$cell_ontology_class)
#sort cluster labels by cluster size
sorted_limb_muscle_clusters <- names(sort(summary(as.factor(na.omit(limb_muscle@meta.data$cell_ontology_class))), decreasing = TRUE))
limb_muscle@meta.data$cell_ontology_class <- factor(limb_muscle@meta.data$cell_ontology_class, levels = sorted_limb_muscle_clusters)

Now, we run the countsplit workflow.

countsplit_res <- countsplit_workflow(limb_muscle)


limb_muscle_test <- countsplit_res$seurat_obj_test

limb_muscle@meta.data$countsplit_idents <- Idents(limb_muscle_test)

Plot the UMAP of different clusterings used in this figure.

umap_grid <- DE_comparisons_scatter_plots(limb_muscle, "Limb Muscle")

ggsave("DE_marker_comparison_umap_grid.png", umap_grid, width = 36, height = 10, units = "in")

Get marker genes for recall and countsplit

# for countsplit cluster 0 is "skeletal muscle satellite cell" and cluster 1 is "mesenchymal stem cell"
countsplit_markers_0vs1 <- FindMarkers(limb_muscle_test, ident.1 = 0, ident.2 = 1, min.pct = 0, logfc.threshold = 0)

# for recall cluster 0 is "skeletal muscle satellite cell" and cluster 1 is "mesenchymal stem cell"
recall_markers_0vs1 <- FindMarkers(limb_muscle, ident.1 = 0, ident.2 = 1, group.by = "recall_idents", min.pct = 0, logfc.threshold = 0)

Run the ClusterDE workflow

# for seurat default cluster 1 is "skeletal muscle satellite cell" and cluster 2 is "mesenchymal stem cell"
clusterDE_res <- clusterDE_workflow(limb_muscle, 1, 2, cores=1)

Plot the p-value and q-value scatterplots.

countsplit_vs_recall_markers_plot <- compare_markers_p_vals(countsplit_markers_0vs1,
                                                              recall_markers_0vs1,
                                                              xlabel = "countsplit -log P-values",
                                                              ylabel = "recall+ZIP -log P-values",
                                                              title = "Skeletal muscle satellite cell cluster vs\nMesenchymal stem cell cluster",
                                                              xmax = 200,
                                                              ymax = 400)



clusterDE_vs_recall_markers_plot <- compare_markers_q_vals(clusterDE_res$summaryTable, recall_markers_0vs1,
                                                             "clusterDE -log q-values",
                                                             "recall+ZIP -log P-values",
                                                             "Skeletal muscle satellite cell cluster vs\nMesenchymal stem cell cluster",
                                                             xmax = 10,
                                                             ymax = 400)


ggsave("countsplit_vs_recall_markers_plot.png", countsplit_vs_recall_markers_plot, width = 12, height = 12, units = "in")
ggsave("clusterDE_vs_recall_markers_plot.png", clusterDE_vs_recall_markers_plot, width = 12, height = 12, units = "in")

Plot the Venn diagram

countsplit_signficant <- countsplit_markers_0vs1[countsplit_markers_0vs1$p_val_adj < 0.05, ]
recall_signficant <- recall_markers_0vs1[recall_markers_0vs1$p_val_adj < 0.05, ]

marker_venn_diagram <- marker_gene_venn_diagram(rownames(recall_signficant), rownames(countsplit_signficant), clusterDE_res$DEgenes)

ggsave("marker_venn_diagram.png", marker_venn_diagram)

Finally, we plot the P-value comparison for recall and countsplit with recall clusters downsampled.

Idents(limb_muscle) <- limb_muscle@meta.data$recall_idents
cluster_0_downsampled_indices <- WhichCells(limb_muscle, idents = c("0"), downsample = 271)
cluster_1_downsampled_indices <- WhichCells(limb_muscle, idents = c("1"), downsample = 192)

recall_downsampled_indices <- c(cluster_0_downsampled_indices, cluster_1_downsampled_indices)

limb_muscle_downsampled <- limb_muscle[, recall_downsampled_indices]

recall_downsampled_markers <- FindMarkers(limb_muscle_downsampled,ident.1 = 0, ident.2 = 1, min.pct = 0, logfc.threshold = 0)

recall_downsampled_vs_counsplit_markers_plot <- compare_markers_p_vals(countsplit_markers_0vs1,
                                                                         recall_downsampled_markers,
                                                                         xlabel = "countsplit",
                                                                         ylabel = "recall+ZIP (downsampled cluster sizes)",
                                                                         title = "Skeletal muscle satellite cell cluster vs\nMesenchymal stem cell cluster",
                                                                         xmax = 200,
                                                                         ymax = 200)

ggsave("recall_downsampled_vs_counsplit_markers_plot.png", recall_downsampled_vs_counsplit_markers_plot, width = 12, height = 12, units = "in")