Skip to content


Merge pull request #82 from bcbio/dev_ruitong
Browse files Browse the repository at this point in the history
  • Loading branch information
lpantano authored Jan 31, 2025
2 parents 43ed340 + cb3681b commit e532e26
Show file tree
Hide file tree
Showing 3 changed files with 559 additions and 0 deletions.
31 changes: 31 additions & 0 deletions inst/templates/singlecell/
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,34 @@ Currently we are working on deploying a shiny app to inspect the single cell obj

`Integration/norm_integration.rmd` is a template with guidelines on how to work with multiple samples. It compares log2norm vs SCT, works with SCT by sample to better remove batch biases, and provides options for integration with either CCA or Harmony. As a last step, it contains cell type clustering and visualization to help decide the best parameters.

# Differential expression

`differential_expression/scRNA_MAST.Rmd` is a template to visualize differentially expressed genes (DEG) results generated from MAST analysis. Main visualizations include:

- Group-level mean expression shown in heatmap;
- Volcano plots highlighting top DEGs;
- An adapted `seurat` Dotplot to show % of cells expressing top DEGs;
- An integrated Violin-Box-Scatter (VBS) plot displaying the normalized expression of top DEGs per single cell across contrast groups.

We provide a separate Rscript, `differential_expression/MAST_analysis.R`, to pre-compute the DEG results because the MAST computation is relatively time-consuming. For our demo dataset (~1000 cells, two-group comparison), it took around 10-15 minutes.

To run this Rscript, provide the following parameters:

- `--seurat_obj`: the seurat object containing your scRNA-seq data; raw counts stored in the layer `counts` of assay `RNA` are required
- `--resolution_column`: the name of the column holding the clustering (method + resolution) you chose, for example `pca_res0.4`; this is only required if you want to subset your data
- `--cluster_name`: the cluster you want to subset for MAST analysis
- `--contrast`: the column name in your `seurat` cell metadata holding the groups you want to contrast
- `--outputDir`: the output directory where you will find the intermediate results needed for running `scRNA_MAST.Rmd`. Default: `out`.

If none of the parameters are supplied, running `Rscript differential_expression/MAST_analysis.R` will run the analysis on our demo dataset.

To obtain more informative console logs from MAST analysis, please consider running:

`Rscript --no-save --no-restore --verbose differential_expression/MAST_analysis.R > MAST.Rout 2>&1`

You can expect four main outputs in your specified output folder:

- `processed_seurat.rds`: the processed seurat object containing log-normalized data
- `MAST_RESULTS*`: the MAST modeling object to allow for further follow-up analysis of your own choice
- `FULL_MAST_RESULTS_*`: full MAST differential expression analysis results in csv format
- `SIG_MAST_RESULTS_padj<0.05*`: significant DEGs using 0.05 as the threshold for FDR
137 changes: 137 additions & 0 deletions inst/templates/singlecell/differential_expression/MAST_analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
##### library loading #####
# NOTE(review): the packages below are inferred from the functions this script
# calls; confirm the list (especially R.utils for isUrl()) against the
# project's declared dependencies.
library(optparse)              # make_option(), OptionParser(), parse_args()
library(R.utils)               # isUrl() -- presumably R.utils::isUrl; verify
library(Seurat)                # DefaultAssay(), Idents(), Layers(), NormalizeData()
library(SingleCellExperiment)  # SingleCellExperiment(), counts(), assay(), colData()
library(MAST)                  # SceToSingleCellAssay(), freq(), zlm()
library(data.table)            # setorder() and `.()` column selection

##### parameter parse #####
options(stringsAsFactors = FALSE)

# Command-line interface. Every option is optional and falls back to the
# default given here.
# NOTE(review): the README states that running without arguments uses a demo
# dataset, but the default for --seurat_obj is an empty string -- confirm the
# intended demo path/URL.
option_list <- list(
  make_option("--seurat_obj", default = ""),
  make_option("--resolution_column", default = "integrated_snn_res.0.4"),
  make_option("--cluster_name", default = "2"),
  make_option("--contrast", default = "age"),
  make_option("--outputDir", default = "out")
)
args <- parse_args(OptionParser(option_list = option_list))

# Expose each parsed option under the variable name the rest of the script
# uses. `[[` is used instead of `$` to avoid partial name matching.
seurat_obj <- args[["seurat_obj"]]
resolution_column <- args[["resolution_column"]]
cluster_name <- args[["cluster_name"]]
contrast <- args[["contrast"]]
outputDir <- args[["outputDir"]]

# `column` is the metadata column used as the contrast throughout the script.
column <- contrast

# Create the output directory (and any missing parents) without going through
# a shell, so paths with spaces or shell metacharacters are handled safely.
dir.create(outputDir, recursive = TRUE, showWarnings = FALSE)

message("[Preparing inputs for MAST modeling]")
##### Read in provided seurat #####
# Fail early with a clear message instead of a cryptic readRDS() error.
if (!nzchar(seurat_obj)) {
  stop("--seurat_obj is empty: supply a path or URL to a Seurat .rds file",
       call. = FALSE)
}

# Read the object from a remote URL or from a local file, never both.
if (isUrl(seurat_obj)) {
  seurat <- readRDS(url(seurat_obj))
} else {
  seurat <- readRDS(seurat_obj)
}

message("Input seurat object: ", seurat_obj)
DefaultAssay(seurat) <- "RNA"
message("RNA is set as the default assay")
message("Column name of clustering to use: ", resolution_column)

# Use the chosen clustering column as cell identities so that subset() can
# select cells by cluster name.
Idents(object = seurat) <- resolution_column
message("Subset original seurat to be only cluster ", cluster_name, " for faster computing!")
data_subset <- subset(x = seurat, idents = cluster_name)

# Warn only when the RNA assay holds something other than exactly one
# `counts` layer; downstream steps start from the raw counts.
existintLayers <- Layers(data_subset[["RNA"]])
if (!identical(existintLayers, "counts")) {
  print("Not only counts excisted as layers in this object")
  print("Make sure your default slot is counts and it is raw counts")
}
##### Start from raw count for MAST #####
message("Natural log of raw counts with pseudobulk 1 used for MAST modeling")
# Convert before normalising: the MAST input is derived from the counts of
# this SingleCellExperiment further below.
sce <- as.SingleCellExperiment(data_subset)

##### Log-Normalize Seurat for visualization later #####
message("Total counts normalization and log1p transformation done to raw counts")
message("New layer Data added to the seurat object for visualization later!")
# The object to normalise must be passed explicitly as the first argument.
data_subset <- NormalizeData(
  data_subset,
  assay = "RNA",
  normalization.method = "LogNormalize",
  scale.factor = 10000,
  margin = 1
)

# Persist the log-normalised object; the README lists `processed_seurat.rds`
# as one of the outputs consumed by the visualisation template.
saveRDS(data_subset, file = file.path(outputDir, "processed_seurat.rds"))
##### Continue MAST input prep #####
# Store log(counts + 1) as an additional assay named "log".
assay(sce, "log") <- log(counts(sce) + 1)

# Per-cell number of detected genes, centred and scaled.
n_detected_all <- colSums(assay(sce, "log") > 0)
colData(sce)$cngeneson <- scale(n_detected_all)

# Build a fresh object that carries only the "log" assay, then copy the cell
# metadata across.
sce_log_only <- SingleCellExperiment(assays = list(log = assay(sce, "log")))
colData(sce_log_only) <- colData(sce)

# Convert to MAST's SingleCellAssay.
sca <- SceToSingleCellAssay(sce_log_only)

message("Subset genes observed in at least 10% of cells")

# Keep genes whose detection frequency across cells exceeds 0.1.
keep_gene <- freq(sca) > 0.1
sca_filtered <- sca[keep_gene, ]

# Recompute the detected-gene count on the filtered gene set and store its
# scaled version as the `ngeneson` covariate.
n_detected_kept <- colSums(SummarizedExperiment::assay(sca_filtered) > 0)
SummarizedExperiment::colData(sca_filtered)$ngeneson <- scale(n_detected_kept)
# Both design columns are converted to factors: the contrast column's levels
# are read below, and re-creating the factors drops levels that no longer
# occur after subsetting to one cluster.
SummarizedExperiment::colData(sca_filtered)$orig.ident <-
  factor(SummarizedExperiment::colData(sca_filtered)$orig.ident)
SummarizedExperiment::colData(sca_filtered)[[column]] <-
  factor(SummarizedExperiment::colData(sca_filtered)[[column]])

##### MAST modeling #####
message("[MAST modeling with supplied contrasts]")

message("Note: this step is time-consuming!")

# The likelihood-ratio test targets the second level of the contrast column
# (the first level acts as the reference), so at least two levels are needed.
contrast_levels <- levels(SummarizedExperiment::colData(sca_filtered)[[column]])
if (length(contrast_levels) < 2) {
  stop("Contrast column '", column, "' needs at least two groups in the ",
       "selected cells, found ", length(contrast_levels), call. = FALSE)
}
comp_name <- contrast_levels[2]

# Name of the term tested by the LRT: column name followed by the level name.
lrt_name <- paste0(column, comp_name)

# Model: scaled detected-gene count + random intercept per orig.ident + the
# contrast column.
formula_touse <- as.formula(paste0("~ ngeneson + (1 | orig.ident) + ", column))

zlmCond <- suppressMessages(
  MAST::zlm(formula_touse, sca_filtered,
            method = "glmer",
            ebayes = FALSE,
            strictConvergence = FALSE)
)
summaryCond_column <- suppressMessages(MAST::summary(zlmCond, doLRT = lrt_name))

##### MAST outputs #####
message("[Main MAST computation done, result outputs]")

# Assemble the output path for the summary object from its pieces:
# <outputDir>/MAST_RESULTS_<cluster>_<contrast column>.rds
summary_path_parts <- c(outputDir, "/MAST_RESULTS_", cluster_name, "_", column, ".rds")
summary_cond_file <- paste(summary_path_parts, collapse = "")

# Persist the zlm summary so follow-up analyses do not have to refit the model.
saveRDS(summaryCond_column, file = summary_cond_file)

message("Full MAST object saved to file ", summary_cond_file)

summaryDt_column <- summaryCond_column$datatable

# Inside the data.table brackets `contrast` and `component` refer to columns
# of the summary table, while `lrt_name` is the script-level variable holding
# the tested term.
fcHurdle_column <- merge(
  # Hurdle p-values
  summaryDt_column[contrast == lrt_name & component == "H",
                   .(primerid, `Pr(>Chisq)`)],
  # Log fold-change estimates with their confidence bounds
  summaryDt_column[contrast == lrt_name & component == "logFC",
                   .(primerid, coef, ci.hi, ci.lo)],
  by = "primerid"
)

# Drop genes with any missing value before adjusting the p-values, then add
# the FDR-adjusted p-value as its own column.
fcHurdle_column <- stats::na.omit(fcHurdle_column)
fcHurdle_column$fdr <- p.adjust(fcHurdle_column$`Pr(>Chisq)`, "fdr")
to_save_column <- fcHurdle_column

full_res_file <- paste0(outputDir, "/FULL_MAST_RESULTS_", cluster_name, "_", column, ".csv")
write.table(to_save_column, file = full_res_file, row.names = FALSE, sep = ",")

message("MAST summary results output to csv files")

# Keep genes passing the 5% FDR threshold.
# NOTE(review): the original statement was a merge() whose second table is
# missing; if per-gene annotations were meant to be joined in by `primerid`,
# restore that table here -- confirm against the project history.
fcHurdleSig_column <- fcHurdle_column[fcHurdle_column$fdr < .05, ]

# Most significant genes first.
setorder(fcHurdleSig_column, fdr)

# The file name (including the literal "<") matches the name documented in the
# README for this output.
sig_res_file <- paste0(outputDir, "/SIG_MAST_RESULTS_padj<0.05_", cluster_name, "_", column, ".csv")
write.table(fcHurdleSig_column, file = sig_res_file, row.names = FALSE, sep = ",")

message("Significant MAST summary results output to csv files")

0 comments on commit e532e26

Please sign in to comment.