Skip to contents

The dataset used here for benchmarking can be found on this GitHub page. It can also be directly downloaded here.

First, we define a function for loading the PBMC 68k dataset.

load_pbmc_68k_dataset <- function() {
  pbmc_68k_obj <- readRDS("pbmc68k_data.rds")
  
  mat <- pbmc_68k_obj$all_data$`17820`$hg19$mat
  genes <- pbmc_68k_obj$all_data$`17820`$hg19$gene_symbols
  barcodes <- pbmc_68k_obj$all_data$`17820`$hg19$barcodes
  
  rownames(mat) <- barcodes
  #colnames(mat) <- genes #make.names(rownames(genes, unique = TRUE)
  colnames(mat) <- make.names(rownames(genes), unique = TRUE)
  
  pbmc <- CreateSeuratObject(counts = t(mat))
  
  # clean up large objects that we don't need anymore
  rm(pbmc_68k_obj)
  rm(mat)
  rm(genes)
  rm(barcodes)
  gc()
  
  return(pbmc)
}

We set the sizes of subsets to analyze and load the full PBMC 68k dataset.

cell_sizes <- c(1000, 2000, 5000, seq(10^4, 6*10^4, 10^4), 68579)
times <- c()

cores <- 16

# Load the PBMC dataset
big_dataset <- load_pbmc_68k_dataset()

timing_df <- data.frame()

Next, we loop over each subset size and cluster the data using callback, sc-SHC, and CHOIR.

i <- 0
for (num_cells in cell_sizes) {
  i <- i + 1
  
  print("Processing num cells:")
  print(num_cells)
  
  print("Loading subset")
  seurat_obj <- subset(big_dataset, cells = sample(Cells(big_dataset), num_cells))

  p <- 1000
  seurat_obj <- seurat_workflow(seurat_obj, num_variable_features = p, algorithm="louvain", resolution_param = 0.8)

  print("Running callback")
  callback_start_time <- Sys.time()
  
  callback_memory <- peakRAM::peakRAM({
  FindClustersCallback(seurat_obj, cores=cores, dims = 1:num_PCs)
  callback_end_time <- Sys.time()
  })$Peak_RAM_Used_MiB


  print("Running scSHC")

  scSHC_start_time <- Sys.time()
  
  scSHC_memory <- peakRAM::peakRAM({
  scSHC(GetAssayData(seurat_obj, assay = "RNA", layer = "counts")[Seurat::VariableFeatures(seurat_obj),],
                          num_features = 1000,
                          num_PCs = 10,
                          cores = cores)
  })$Peak_RAM_Used_MiB

  scSHC_end_time <- Sys.time()


  print("Running CHOIR")

  CHOIR_start_time <- Sys.time()
  
  CHOIR_memory <- peakRAM::peakRAM({
  CHOIR(seurat_obj, 
                  n_cores = cores,
                  reduction = seurat_obj@reductions$pca@cell.embeddings[, 1:10],
                  var_features = VariableFeatures(seurat_obj),
                  use_assay = "RNA")
  })$Peak_RAM_Used_MiB

  
  CHOIR_end_time <- Sys.time()


  callback_time_taken <- difftime(callback_end_time, callback_start_time, units="mins")
  scSHC_time_taken <- difftime(scSHC_end_time, scSHC_start_time, units="mins")
  CHOIR_time_taken <- difftime(CHOIR_end_time, CHOIR_start_time, units="mins")
  

  num_cells <- c(num_cells, num_cells, num_cells)
  method <- c("callback", "sc-SHC", "CHOIR")
  time <- c(callback_time_taken, scSHC_time_taken, CHOIR_time_taken)
  memory <- c(callback_memory, scSHC_memory, CHOIR_memory)

  timing_df_new_row <- data.frame(num_cells, method, time, memory)
  
  print(timing_df_new_row)

  timing_df <- rbind(timing_df, timing_df_new_row)
}

Finally, we save the timing and peak memory use results to a csv file.

write.csv(timing_df, "pbmc_timing_df1.csv")