Skip to contents

The dataset used here for benchmarking can be found on Figshare. The key files are annotations_facs.csv which has the Cell Ontology Class labels that we are using as a gold standard and FACS.zip which contains CSV files containing the counts matrix for each tissue.

First, we set up a function for loading the Tabula Muris tissue data. Note that the data folders will need to point to where the data are located after downloading it from Figshare.

data_folder <- "tabula_muris/data/5829687/"        # the directory containing annotations_facs.csv
tissue_folder <- "tabula_muris/data/5829687/FACS/" # the directory containing the files in FACS.zip

read_tabula_muris_data <- function(file) {
  data <- read.csv(file, row.names = 1)
  seurat_obj <- CreateSeuratObject(counts = data, min.features = 500)
  seurat_obj <- subset(seurat_obj, subset = nCount_RNA > 50000)
  
  
  annotations <- read.csv(paste0(data_folder, 'annotations_facs.csv'))
  rownames(annotations) <- annotations$cell
  
  seurat_obj[["cell_ontology_class"]] <- annotations[Cells(seurat_obj),]$cell_ontology_class
  
  return(seurat_obj)
}

Then, we loop over each tissue and cluster using callback, sc-SHC, and CHOIR and save the resulting Seurat objects for each tissue c(ontaining the cluster labels for each method) to RDS files.

tissue_files <- list.files(tissue_folder, pattern = "csv")

cores <- 6
num_PCs <- 10

timing_df <- data.frame()

for (tissue_csv in tissue_files) {
  tissue_name <- unlist(strsplit(tissue_csv, "-counts.csv"))[1]
  
  print("Processing:")
  print(tissue_name)
  
  print("Loading tissue")
  tissue <- read_tabula_muris_data(paste0(tissue_folder, tissue_csv))

  print("Running initial seurat workflow")
  p <- 1000
  tissue <- seurat_workflow(tissue, num_variable_features = p, algorithm="louvain", resolution_param = 0.8)

  print("Running callback")
  callback_start_time <- Sys.time()
  callback_results_obj <- FindClustersCallback(tissue, cores=cores, dims = 1:num_PCs)
  callback_end_time <- Sys.time()

  print("Running scSHC")
  scSHC_start_time <- Sys.time()
  scSHC_clusters <- scSHC(GetAssayData(tissue,
                                       assay = "RNA", layer = "counts")[Seurat::VariableFeatures(tissue),],
                                       num_features = 1000,
                                       num_PCs = num_PCs,
                                       cores = cores)
  scSHC_end_time <- Sys.time()

  print("Running CHOIR")
  CHOIR_start_time <- Sys.time()
  tissue <- CHOIR(tissue, 
                  n_cores = cores,
                  reduction = tissue@reductions$pca@cell.embeddings[, 1:num_PCs],
                  var_features = Seurat::VariableFeatures(tissue))
  CHOIR_end_time <- Sys.time()

  tissue[["CHOIR_clusters"]] <- tissue@meta.data$CHOIR_clusters_0.05
  tissue[["callback_idents"]] <- Idents(callback_results_obj)
  tissue[['scSHC_clusters']] <- scSHC_clusters[[1]]

  print("Saving result to rds file")
  saveRDS(tissue, file =  paste0(tissue_name, "cluster_results_seurat.rds"))

  callback_time_taken <- difftime(callback_end_time, callback_start_time, units="mins")
  scSHC_time_taken <- difftime(scSHC_end_time, scSHC_start_time, units="mins")
  CHOIR_time_taken <- difftime(CHOIR_end_time, CHOIR_start_time, units="mins")

  tissue <- c(tissue_name, tissue_name, tissue_name)
  method <- c("callback", "sc-SHC", "CHOIR")
  time <- c(callback_time_taken, scSHC_time_taken, CHOIR_time_taken)
  
  timing_df_new_row <- data.frame(tissue, method, time)
  
  print(timing_df_new_row)
  
  timing_df <- rbind(timing_df, timing_df_new_row)
}

Finally, we save the runtime data.

print(timing_df)
write.csv(timing_df, "tissue_timing_df.csv")