Skip to content

Commit ba21eb3

Browse files
committed
upgrade MAST templates
1 parent 0e43aeb commit ba21eb3

File tree

4 files changed

+129
-81
lines changed

4 files changed

+129
-81
lines changed

03_differential_expression/MAST_analysis.R

Lines changed: 65 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@ library(lme4)
1010
library(R.utils)
1111
library(glue)
1212
library(optparse)
13-
##### parameter parse #####
13+
library(qs)
14+
15+
################################################################################
16+
# parse parameters
17+
################################################################################
18+
1419
options(stringsAsFactors = F)
1520
option_list <- list(
1621
make_option("--seurat_obj", default = "https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds"),
@@ -27,11 +32,15 @@ system(glue("mkdir -p {outputDir}"))
2732

2833

2934
message("[Preparing inputs for MAST modeling]")
30-
##### Read in provided seurat #####
35+
36+
################################################################################
37+
# Read in provided seurat object
38+
################################################################################
39+
3140
if (isUrl(seurat_obj)) {
3241
seurat <- readRDS(url(seurat_obj))
3342
} else {
34-
seurat <- readRDS(seurat_obj)
43+
seurat <- qread(seurat_obj)
3544
}
3645

3746
message("Input seurat object: ", seurat_obj)
@@ -40,16 +49,22 @@ message("RNA is set as the default assay")
4049
message("Column name of clustering to use: ", resolution_column)
4150
Idents(object = seurat) <- resolution_column
4251
message("Subset original seurat to be only cluster ", cluster_name, " for faster computing!")
52+
53+
################################################################################
54+
# Prepare MAST inputs
55+
################################################################################
56+
4357
data_subset <- subset(x = seurat, idents = cluster_name)
44-
existintLayers <- Layers(data_subset[["RNA"]])
45-
if (existintLayers != "counts") {
46-
print("Not only counts excisted as layers in this object")
47-
print("Make sure your default slot is counts and it is raw counts")
48-
}
49-
##### Start from raw count for MAST #####
58+
59+
data_subset <- CreateSeuratObject(counts = data_subset[["RNA"]]$counts,
60+
project = 'subset',
61+
meta.data = data_subset@meta.data)
62+
63+
5064
message("Natural log of raw counts with pseudobulk 1 used for MAST modeling")
5165
sce <- as.SingleCellExperiment(data_subset)
52-
##### Log-Normalize Seurat for visualization later #####
66+
67+
# ##### Log-Normalize Seurat for visualization later #####
5368
message("Total counts normalization and log1p transformation done to raw counts")
5469
message("New layer Data added to the seurat object for visualization later!")
5570
data_subset <- NormalizeData(
@@ -59,10 +74,12 @@ data_subset <- NormalizeData(
5974
scale.factor = 10000,
6075
margin = 1
6176
)
62-
saveRDS(data_subset, glue("{outputDir}/processed_seurat.rds"))
63-
##### Continue MAST input prep #####
77+
qsave(data_subset, glue("{outputDir}/processed_seurat.qs"))
78+
79+
# log-normalize SCE
6480
assay(sce, "log") <- log(counts(sce) + 1)
65-
# Scaling ngenes
81+
82+
# scaling ngenes
6683
cdr <- colSums(assay(sce, "log") > 0)
6784
colData(sce)$cngeneson <- scale(cdr)
6885

@@ -86,63 +103,68 @@ SummarizedExperiment::colData(sca_filtered)$orig.ident <-
86103
SummarizedExperiment::colData(sca_filtered)[[column]] <-
87104
factor(SummarizedExperiment::colData(sca_filtered)[[column]])
88105

89-
##### MAST modeling #####
106+
################################################################################
107+
# MAST modeling
108+
################################################################################
109+
90110
message("[MAST modeling with supplied contrasts]")
91111

92112
message("Note: this step is time-consuming!")
93113

94-
comp_name <- levels(SummarizedExperiment::colData(sca_filtered)[[column]])[2]
95-
lrt_name <- paste0(column, comp_name)
96-
formula_touse <- as.formula(paste0("~ ngeneson + (1 | orig.ident) + ", column))
114+
# define model coefficients to examine
115+
SummarizedExperiment::colData(sca_filtered)[[contrast]] <- factor(SummarizedExperiment::colData(sca_filtered)[[contrast]])
116+
comp_name <- levels(SummarizedExperiment::colData(sca_filtered)[[contrast]])[2]
117+
lrt_names <- paste0(contrast, comp_name)
97118

119+
# fit model
120+
formula_touse <- as.formula(paste0("~ ngeneson + (1 | orig.ident) + ", contrast))
98121
zlmCond <- suppressMessages(MAST::zlm(formula_touse, sca_filtered,
99122
method = "glmer",
100123
ebayes = F, strictConvergence = FALSE
101124
))
125+
126+
# examine coefficients
102127
summaryCond_column <- suppressMessages(MAST::summary(zlmCond, doLRT = lrt_name))
103128

104-
##### MAST outputs #####
105129
message("[Main MAST computation done, result outputs]")
106130

107-
summary_cond_file <- paste0(outputDir, "/MAST_RESULTS_", cluster_name, "_", column, ".rds")
131+
summary_cond_file <- paste0(outputDir, "/MAST_RESULTS_", cluster_name, "_", contrast, ".rds")
108132
saveRDS(summaryCond_column, file = summary_cond_file)
109133

110134
message("Full MAST object saved to file ", summary_cond_file)
111135

136+
################################################################################
137+
# collect MAST outputs
138+
################################################################################
112139

113140
summaryDt_column <- summaryCond_column$datatable
114-
fcHurdle_column <- merge(
115-
summaryDt_column[
116-
contrast == lrt_name & component == "H",
117-
.(primerid, `Pr(>Chisq)`)
118-
],
119-
# This extracts hurdle p-values
120-
summaryDt_column[
121-
contrast == lrt_name & component == "logFC",
122-
.(primerid, coef, ci.hi, ci.lo)
123-
],
124-
# This extract LogFC data
125-
by = "primerid"
126-
)
141+
142+
fcHurdle_column <- summaryDt_column %>%
143+
filter(component %in% c('H', 'logFC'), contrast %in% lrt_names) %>%
144+
dplyr::select(-component) %>%
145+
pivot_longer(!primerid & !contrast, names_to = 'metric', values_to = 'value') %>%
146+
filter(!is.na(value), metric != 'z') %>%
147+
pivot_wider(names_from = 'metric', values_from = 'value') %>%
148+
mutate(cluster_name = cluster_name) %>%
149+
relocate(cluster_name)
150+
127151
fcHurdle_column <- stats::na.omit(as.data.frame(fcHurdle_column))
128-
fcHurdle_column$fdr <- p.adjust(fcHurdle_column$`Pr(>Chisq)`, "fdr")
129-
to_save_column <- fcHurdle_column
130152

131-
full_res_file <- paste0(outputDir, "/FULL_MAST_RESULTS_", cluster_name, "_", column, ".csv")
132-
write.table(to_save_column, file = full_res_file, row.names = FALSE, sep = ",")
153+
fcHurdle_column <- lapply(lrt_names, function(curr_contrast){
154+
fcHurdle_column %>% filter(contrast == curr_contrast) %>%
155+
mutate(fdr = p.adjust(`Pr(>Chisq)`, "fdr"))
156+
}) %>% bind_rows()
133157

134158

135-
message("MAST summary results output to csv files")
159+
full_res_file <- paste0(outputDir, "/FULL_MAST_RESULTS_", cluster_name, "_", contrast, ".csv")
160+
write.table(fcHurdle_column, file = full_res_file, row.names = FALSE, sep = ",")
136161

162+
message("MAST summary results output to csv files")
137163

138-
fcHurdleSig_column <- merge(fcHurdle_column[fcHurdle_column$fdr < .05, ],
139-
as.data.table(mcols(sca_filtered)),
140-
by = "primerid"
141-
)
142164

143-
setorder(fcHurdleSig_column, fdr)
165+
fcHurdleSig_column <- fcHurdle_column %>% filter(fdr < 0.05) %>% arrange(fdr)
144166

145-
sig_res_file <- paste0(outputDir, "/SIG_MAST_RESULTS_padj<0.05_", cluster_name, "_", column, ".csv")
167+
sig_res_file <- paste0(outputDir, "/SIG_MAST_RESULTS_padj_less_0.05_", cluster_name, "_", contrast, ".csv")
146168
write.table(fcHurdleSig_column, file = sig_res_file, row.names = FALSE, sep = ",")
147169

148170
message("Significant MAST summary results output to csv files")

03_differential_expression/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ The template aggregates single cell counts to the sample x cluster x factor_of_i
2121
- An adapted `seurat` Dotplot to show % of cells expressing top DEGs;
2222
- An integrated Violin-Box-Scatter (VBS) plot displaying the normalized expression of top DEGs per single cell across contrast groups.
2323

24-
We separately prepare the Rscript `02_differential_expression/MAST_analysis.R` to pre-compute the DEG results since MAST computation is relatively time-consuming. For our demo dataset, \~1000 cells and two group comparison, it took around 10-15 minutes.
24+
We separately prepare the Rscript `02_differential_expression/MAST_analysis.R` to pre-compute the DEG results since MAST computation is relatively time-consuming. For our demo dataset, \~1000 cells and two group comparison, it took around 10-15 minutes. If working on an HPC, you may be interested in running this as a batch job. A sample sbatch script is provided in `run_MAST_analysis.sh`.
2525

2626
To run this Rscript, you should input below parameters:
2727

@@ -39,7 +39,7 @@ To obtain more informative console logs from MAST analysis, please consider runn
3939

4040
You will expect to have three main outputs in your specified output folder:
4141

42-
- `processed_seurat.rds`: the processed seurat object containing log-normalized data
42+
- `processed_seurat.qs`: the processed seurat object containing log-normalized data
4343
- `MAST_RESULTS*`: the MAST modeling object to allow for further follow-up analysis of your own choice
4444
- `FULL_MAST_RESULTS_*`: full MAST differential expression analysis results in csv format
4545
- `SIG_MAST_RESULTS_padj<0.05*`: significant DEGs using 0.05 as the threshold for FDR
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
3+
#SBATCH --job-name=NK_MAST # Job name
4+
#SBATCH --partition=short # Partition name
5+
#SBATCH --time=0-04:00 # Runtime in D-HH:MM format
6+
#SBATCH --nodes=1 # Number of nodes (keep at 1)
7+
#SBATCH --ntasks=1 # Number of tasks per node (keep at 1)
8+
#SBATCH --mem=16G # Memory needed per node (total)
9+
#SBATCH --error=jobid_%j.err # File to which STDERR will be written, including job ID
10+
#SBATCH --output=jobid_%j.out # File to which STDOUT will be written, including job ID
11+
#SBATCH --mail-type=ALL # Type of email notification (BEGIN, END, FAIL, ALL)
12+
#SBATCH --array=1-2%2
13+
14+
module load gcc/9.2.0 imageMagick/7.1.0 geos/3.10.2 cmake/3.22.2 R/4.3.1 fftw/3.3.10 gdal/3.1.4 udunits/2.2.28 boost/1.75.0 python/3.9.14 hdf5/1.14.0
15+
16+
clusters=(2 3)
17+
cluster=${clusters[$SLURM_ARRAY_TASK_ID-1]}
18+
19+
Rscript MAST_analysis.R \
20+
--seurat_obj=https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds \
21+
--resolution_column=integrated_snn_res.0.4 \
22+
--cluster_name="${cluster}" \
23+
--contrast=age \
24+
--outputDir=out

0 commit comments

Comments
 (0)