12. Avoiding Over-clustering Leads to Improved Hypothesis Generation for Downstream Analyses (Figure 3)
12_figure3.Rmd
suppressPackageStartupMessages({
library(recallreproducibility)
library(ggplot2)
library(patchwork)
library(grid)
})
mesenchymal_stem_cells_marker_genes <- c(
"Col6a3",
"Col1a1",
"Igfbp6",
"Pdgfra",
"C1s",
"Mfap5",
"Ecm1",
"Dcn",
"Dpep1"
)
skeletal_muscle_satellite_cell_marker_genes <- c(
"Des",
"Chodl",
"Myl12a",
"Asb5",
"Sdc4",
"Apoe",
"Musk",
"Myf5",
"Chrdl2",
"Notch3"
)
First, we load the limb muscle seurat object and scale it for plotting the heatmap.
limb_muscle <- readRDS("Limb_Musclecluster_results_seurat.rds")
limb_muscle <- Seurat::ScaleData(limb_muscle, features = rownames(limb_muscle))
Next, we sort the cell ontology classes factors so that the cell type with the largest number of cells is first and the cell type with the fewest smallest is last.
# sort levels by size of group (limb_muscle)
limb_muscle@meta.data$cell_ontology_class <- as.factor(limb_muscle@meta.data$cell_ontology_class)
sorted_limb_muscle_clusters <- names(sort(summary(as.factor(na.omit(limb_muscle@meta.data$cell_ontology_class))), decreasing = TRUE))
limb_muscle@meta.data$cell_ontology_class <- factor(limb_muscle@meta.data$cell_ontology_class, levels = sorted_limb_muscle_clusters)
We define a function for plotting the grid of UMAPs.
revision_fig3_scatter_plots <- function(tissue, tissue_name, legend_pos=c(0.6, 0.2)) {
louvain_default <- custom_scatter(tissue, "umap", group_by = "seurat_clusters", x_title = "UMAP 1", y_title = "UMAP 2", pt.size = 2, label=FALSE) + Seurat::NoLegend()
louvain_recall <- custom_scatter(tissue, "umap", group_by = "recall_idents", x_title = "UMAP 1", y_title = "UMAP 2", pt.size = 2) + Seurat::NoLegend()
cell_ontology <- custom_scatter(tissue, "umap", group_by = "cell_ontology_class", x_title = "UMAP 1", y_title = "UMAP 2", pt.size = 2) +
ggplot2::theme(legend.position = legend_pos,
legend.text = ggplot2::element_text(size=20)) +
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size=6), ncol = 1)) +
ggplot2::scale_colour_discrete(na.translate = F)
column_label_1 <- patchwork::wrap_elements(panel = grid::textGrob('Cell Ontology', gp = grid::gpar(fontsize = 64)))
column_label_2 <- patchwork::wrap_elements(panel = grid::textGrob('Seurat Default', gp = grid::gpar(fontsize = 64)))
column_label_3 <- patchwork::wrap_elements(panel = grid::textGrob('recall+ZIP', gp = grid::gpar(fontsize = 64, fontfamily = "Courier")))
umap_grid <- column_label_1 + column_label_2 + column_label_3 +
cell_ontology + louvain_default + louvain_recall +
patchwork::plot_layout(widths = c(5, 5, 5),
heights = c(1,3))
return(umap_grid)
}
We plot and save UMAP scatterplots of the Cell Ontology Class, the Seurat default clusters, and the recall clusters.
fig_3_umap_grid <- revision_fig3_scatter_plots(limb_muscle, "limb_muscle", legend_pos = c(0.1, 0.8))
ggplot2::ggsave("fig3_revision__umap_grid.png", fig_3_umap_grid, width = 4.4 * 2^11, height = 1 * 2^11, units = "px")
We define a function for plotting the volcano plots.
# new volcano plots
revision_fig3_volcano_plots <- function(tissue, tissue_name,
recall_cluster1, recall_cluster2,
default_cluster1, default_cluster2,
ymax=150,
y_increment = 10,
genes_to_label_left=c(),
genes_to_label_right=c()) {
volcano_plot <- function(seurat_obj, markers, cluster1, cluster2, title, logfc_thresh=1.0,
genes_to_label_left=c(),
genes_to_label_right=c()) {
markers$log10pval <- -log10(markers$p_val)
markers$Name <- rownames(markers)
p_val_thresh <- -log10(0.05 / dim(seurat_obj)[1])
markers$log10pval[markers$log10pval > ymax] <- ymax
markers$color <- "grey"
markers[(markers$avg_log2FC > 1) & (markers$log10pval > p_val_thresh),]$color <- "red"
markers[(markers$avg_log2FC < -1) & (markers$log10pval > p_val_thresh),]$color <- "blue"
p <- ggplot2::ggplot(markers, ggplot2::aes(x=avg_log2FC, y=log10pval, color = color)) +
ggplot2::geom_point() +
ggplot2::scale_colour_identity() +
ggrepel::geom_label_repel(data = markers %>% dplyr::filter(Name %in% genes_to_label_right), ggplot2::aes(label = Name),
min.segment.length = 0,
box.padding = 1.5,
point.size = 2,
size = 10,
force = 12,
xlim = c(14,19),
ylim = c(50,160),
hjust=0,
direction = "y",
max.overlaps = Inf) + # right side isn't showing half of the labels
ggrepel::geom_label_repel(data = markers %>% dplyr::filter(Name %in% genes_to_label_left), ggplot2::aes(label = Name),
min.segment.length = 0,
box.padding = 1.5,
point.size = 2,
size = 10,
force = 12,
seed = 123,
xlim = c(-14,-19),
ylim = c(50,160),
hjust=1,
direction = "y",
max.overlaps = Inf) + # left side isn't showing half of the labels
#gghighlight::gghighlight(log10pval > p_val_thresh) +
#gghighlight::gghighlight(abs(avg_log2FC) > logfc_thresh) +
ggplot2::xlim(-20,20) +
ggplot2::xlab("Average Log2-Fold Change") +
ggplot2::ggtitle(title) +
ggplot2::theme_bw() +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5),
axis.title = ggplot2::element_text(size=40),
axis.text = ggplot2::element_text(size=32),
title = ggplot2::element_text(size=44),
legend.title = ggplot2::element_blank(),
legend.background = ggplot2::element_blank(),
legend.box.background = ggplot2::element_rect(colour = "black"),
legend.text = ggplot2::element_text(size=32),
legend.position = c(0.8, 0.6),
legend.key.size = ggplot2::unit(3, "cm")) +
ggplot2::geom_hline(ggplot2::aes(yintercept=p_val_thresh), size = 1, linetype = 'dashed', show.legend = TRUE) +
ggplot2::geom_vline(ggplot2::aes(xintercept=logfc_thresh), linetype = 'dashed', size = 1) +
ggplot2::geom_vline(ggplot2::aes(xintercept=-logfc_thresh), linetype = 'dashed', size = 1) +
ggplot2::scale_linetype_manual(values=c("dashed")) +
#scale_color_manual(values=c("blue", "red")) +
# annotate p-value threshold
ggplot2::annotate("segment", x = 13, xend = 15, y = 18, yend = p_val_thresh, colour = "black", linetype = "dashed", size = 2) +
ggplot2::annotate("label", x = 13, y = 25, label = "Adj. P-value = 0.05", size = 12) +
# annotate logFC thresholds
ggplot2::annotate("segment", x = 11.6, xend = 1.0, y = 40, yend = 60, colour = "black", linetype = "dashed", size = 2) +
ggplot2::annotate("label", x = 13, y = 40, label = "Avg. Log2-FC = 1.0", size = 12) +
ggplot2::annotate("segment", x = -11.6, xend = -1.0, y = 40, yend = 60, colour = "black", linetype = "dashed", size = 2) +
ggplot2::annotate("label", x = -13, y = 40, label = "Avg. Log2-FC = -1.0", size = 12) +
#geom_text_repel(data=subset(markers, abs(avg_log2FC) > logfc_thresh & p_val_adj < 0.05), aes(label = Name), size = 8)
ggplot2::scale_y_continuous(name = "-log10 P-value", limits = c(0, ymax))#,
#breaks = seq(from = 0, to = ymax, by = y_increment)
#labels = c(seq(from = 0, to = ymax - 100, by = y_increment), paste0('\u2265', ymax)))
return(p)
}
# re-index clusters to be one-based
tissue@meta.data$recall_idents <- as.factor(as.numeric(tissue@meta.data$recall_idents))
Seurat::Idents(tissue) <- tissue@meta.data$recall_idents
#DimPlot(tissue)
recall_markers <- Seurat::FindMarkers(tissue,
ident.1 = recall_cluster1,
ident.2 = recall_cluster2,
logfc.threshold = 0.0,
)
# highlighted markers
subset(recall_markers, abs(avg_log2FC) > 1.0 & p_val_adj < 0.05)
recall_title <- paste0("recall+ZIP: Cluster ", recall_cluster1, " vs Cluster ", recall_cluster2)
volcano_recall <- volcano_plot(tissue, recall_markers, recall_cluster1, recall_cluster2, recall_title,
genes_to_label_left = genes_to_label_left, genes_to_label_right = genes_to_label_right)
# re-index clusters to be one-based
tissue@meta.data$seurat_clusters <- as.factor(as.numeric(tissue@meta.data$seurat_clusters))
Seurat::Idents(tissue) <- tissue@meta.data$seurat_clusters
default_markers <- Seurat::FindMarkers(tissue,
ident.1 = default_cluster1,
ident.2 = default_cluster2,
logfc.threshold = 0.0,
)
# highlighted markers
subset(default_markers, abs(avg_log2FC) > 1.0 & p_val_adj < 0.05)
default_title <- paste0("Default: Cluster ", default_cluster1, " vs Cluster ", default_cluster2)
volcano_default <- volcano_plot(tissue, default_markers, default_cluster1, default_cluster2, default_title)
return(list("volcano_recall" = volcano_recall, "volcano_default" = volcano_default))
}
Finally, we do differential expression testing between the callback cluster corresponding to skeletal mucle satellite cells and the callback cluster corresponding to mesenchymal stem cell cluster as well as two of the Seurat clusters that correspond to the same cell types. Notice that the P-values produced after clustering with recall are more significant due to the increased sample size from correctly clustering the two cell types.
volcanos = revision_fig3_volcano_plots(limb_muscle, "limb_muscle", recall_cluster1 = 1,
recall_cluster2 = 2,
default_cluster1 = 1,
default_cluster2 = 3,
ymax=150,
y_increment = 100,
genes_to_label_left = mesenchymal_stem_cells_marker_genes,
genes_to_label_right = skeletal_muscle_satellite_cell_marker_genes)
ggplot2::ggsave("fig3_revision_limb_muscle_volcano_plot_default.png", volcanos$volcano_default, width = 2^12, height = 2^12, units = "px")
ggplot2::ggsave("fig3_revision_limb_muscle_volcano_plot_recall.png", volcanos$volcano_recall, width = 2^12, height = 2^12, units = "px")