2. Clustering Tabula Muris Tissues
2_cluster-tabula-muris.Rmd
suppressPackageStartupMessages({
library(callbackreproducibility)
library(dplyr)
library(Seurat)
library(ggplot2)
library(callback)
library(scSHC)
library(CHOIR)
})
set.seed(123456)
The dataset used here for benchmarking can be found on Figshare.
The key files are annotations_facs.csv
which has the Cell
Ontology Class labels that we are using as a gold standard and
FACS.zip
which contains CSV files containing the counts
matrix for each tissue.
First, we set up a function for loading the Tabula Muris tissue data. Note that the data folders will need to point to where the data are located after downloading it from Figshare.
data_folder <- "tabula_muris/data/5829687/" # the directory containing annotations_facs.csv
tissue_folder <- "tabula_muris/data/5829687/FACS/" # the directory containing the files in FACS.zip
read_tabula_muris_data <- function(file) {
data <- read.csv(file, row.names = 1)
seurat_obj <- CreateSeuratObject(counts = data, min.features = 500)
seurat_obj <- subset(seurat_obj, subset = nCount_RNA > 50000)
annotations <- read.csv(paste0(data_folder, 'annotations_facs.csv'))
rownames(annotations) <- annotations$cell
seurat_obj[["cell_ontology_class"]] <- annotations[Cells(seurat_obj),]$cell_ontology_class
return(seurat_obj)
}
Then, we loop over each tissue and cluster using
callback
, sc-SHC
, and CHOIR
and
save the resulting Seurat
objects for each tissue
c(ontaining the cluster labels for each method) to RDS files.
tissue_files <- list.files(tissue_folder, pattern = "csv")
cores <- 6
num_PCs <- 10
timing_df <- data.frame()
for (tissue_csv in tissue_files) {
tissue_name <- unlist(strsplit(tissue_csv, "-counts.csv"))[1]
print("Processing:")
print(tissue_name)
print("Loading tissue")
tissue <- read_tabula_muris_data(paste0(tissue_folder, tissue_csv))
print("Running initial seurat workflow")
p <- 1000
tissue <- seurat_workflow(tissue, num_variable_features = p, algorithm="louvain", resolution_param = 0.8)
print("Running callback")
callback_start_time <- Sys.time()
callback_results_obj <- FindClustersCallback(tissue, cores=cores, dims = 1:num_PCs)
callback_end_time <- Sys.time()
print("Running scSHC")
scSHC_start_time <- Sys.time()
scSHC_clusters <- scSHC(GetAssayData(tissue,
assay = "RNA", layer = "counts")[Seurat::VariableFeatures(tissue),],
num_features = 1000,
num_PCs = num_PCs,
cores = cores)
scSHC_end_time <- Sys.time()
print("Running CHOIR")
CHOIR_start_time <- Sys.time()
tissue <- CHOIR(tissue,
n_cores = cores,
reduction = tissue@reductions$pca@cell.embeddings[, 1:num_PCs],
var_features = Seurat::VariableFeatures(tissue))
CHOIR_end_time <- Sys.time()
tissue[["CHOIR_clusters"]] <- tissue@meta.data$CHOIR_clusters_0.05
tissue[["callback_idents"]] <- Idents(callback_results_obj)
tissue[['scSHC_clusters']] <- scSHC_clusters[[1]]
print("Saving result to rds file")
saveRDS(tissue, file = paste0(tissue_name, "cluster_results_seurat.rds"))
callback_time_taken <- difftime(callback_end_time, callback_start_time, units="mins")
scSHC_time_taken <- difftime(scSHC_end_time, scSHC_start_time, units="mins")
CHOIR_time_taken <- difftime(CHOIR_end_time, CHOIR_start_time, units="mins")
tissue <- c(tissue_name, tissue_name, tissue_name)
method <- c("callback", "sc-SHC", "CHOIR")
time <- c(callback_time_taken, scSHC_time_taken, CHOIR_time_taken)
timing_df_new_row <- data.frame(tissue, method, time)
print(timing_df_new_row)
timing_df <- rbind(timing_df, timing_df_new_row)
}
Finally, we save the runtime data.