Skip to content

Commit 58ff966

Browse files
authored
Merge pull request #15 from bcbio/feature_enhance_MAST_ALB
upgrade MAST templates
2 parents 0e43aeb + b8b24f6 commit 58ff966

File tree

4 files changed

+134
-81
lines changed

4 files changed

+134
-81
lines changed

03_differential_expression/MAST_analysis.R

Lines changed: 70 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@ library(lme4)
1010
library(R.utils)
1111
library(glue)
1212
library(optparse)
13-
##### parameter parse #####
13+
library(qs)
14+
15+
################################################################################
16+
# parse parameters
17+
################################################################################
18+
1419
options(stringsAsFactors = F)
1520
option_list <- list(
1621
make_option("--seurat_obj", default = "https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds"),
@@ -27,11 +32,15 @@ system(glue("mkdir -p {outputDir}"))
2732

2833

2934
message("[Preparing inputs for MAST modeling]")
30-
##### Read in provided seurat #####
35+
36+
################################################################################
37+
# Read in provided seurat object
38+
################################################################################
39+
3140
if (isUrl(seurat_obj)) {
3241
seurat <- readRDS(url(seurat_obj))
3342
} else {
34-
seurat <- readRDS(seurat_obj)
43+
seurat <- qread(seurat_obj)
3544
}
3645

3746
message("Input seurat object: ", seurat_obj)
@@ -40,16 +49,24 @@ message("RNA is set as the default assay")
4049
message("Column name of clustering to use: ", resolution_column)
4150
Idents(object = seurat) <- resolution_column
4251
message("Subset original seurat to be only cluster ", cluster_name, " for faster computing!")
52+
53+
################################################################################
54+
# Prepare MAST inputs
55+
################################################################################
56+
4357
data_subset <- subset(x = seurat, idents = cluster_name)
44-
existintLayers <- Layers(data_subset[["RNA"]])
45-
if (existintLayers != "counts") {
46-
print("Not only counts excisted as layers in this object")
47-
print("Make sure your default slot is counts and it is raw counts")
48-
}
49-
##### Start from raw count for MAST #####
58+
59+
data_subset <- CreateSeuratObject(
60+
counts = data_subset[["RNA"]]$counts,
61+
project = "subset",
62+
meta.data = data_subset@meta.data
63+
)
64+
65+
5066
message("Natural log of raw counts with pseudobulk 1 used for MAST modeling")
5167
sce <- as.SingleCellExperiment(data_subset)
52-
##### Log-Normalize Seurat for visualization later #####
68+
69+
# ##### Log-Normalize Seurat for visualization later #####
5370
message("Total counts normalization and log1p transformation done to raw counts")
5471
message("New layer Data added to the seurat object for visualization later!")
5572
data_subset <- NormalizeData(
@@ -59,10 +76,12 @@ data_subset <- NormalizeData(
5976
scale.factor = 10000,
6077
margin = 1
6178
)
62-
saveRDS(data_subset, glue("{outputDir}/processed_seurat.rds"))
63-
##### Continue MAST input prep #####
79+
qsave(data_subset, glue("{outputDir}/processed_seurat.qs"))
80+
81+
# log-normalize SCE
6482
assay(sce, "log") <- log(counts(sce) + 1)
65-
# Scaling ngenes
83+
84+
# scaling ngenes
6685
cdr <- colSums(assay(sce, "log") > 0)
6786
colData(sce)$cngeneson <- scale(cdr)
6887

@@ -86,63 +105,71 @@ SummarizedExperiment::colData(sca_filtered)$orig.ident <-
86105
SummarizedExperiment::colData(sca_filtered)[[column]] <-
87106
factor(SummarizedExperiment::colData(sca_filtered)[[column]])
88107

89-
##### MAST modeling #####
108+
################################################################################
109+
# MAST modeling
110+
################################################################################
111+
90112
message("[MAST modeling with supplied contrasts]")
91113

92114
message("Note: this step is time-consuming!")
93115

94-
comp_name <- levels(SummarizedExperiment::colData(sca_filtered)[[column]])[2]
95-
lrt_name <- paste0(column, comp_name)
96-
formula_touse <- as.formula(paste0("~ ngeneson + (1 | orig.ident) + ", column))
116+
# define model coefficients to examine
117+
SummarizedExperiment::colData(sca_filtered)[[contrast]] <- factor(SummarizedExperiment::colData(sca_filtered)[[contrast]])
118+
comp_name <- levels(SummarizedExperiment::colData(sca_filtered)[[contrast]])[2]
119+
lrt_names <- paste0(contrast, comp_name)
97120

121+
# fit model
122+
formula_touse <- as.formula(paste0("~ ngeneson + (1 | orig.ident) + ", contrast))
98123
zlmCond <- suppressMessages(MAST::zlm(formula_touse, sca_filtered,
99124
method = "glmer",
100125
ebayes = F, strictConvergence = FALSE
101126
))
127+
128+
# examine coefficients
102129
summaryCond_column <- suppressMessages(MAST::summary(zlmCond, doLRT = lrt_name))
103130

104-
##### MAST outputs #####
105131
message("[Main MAST computation done, result outputs]")
106132

107-
summary_cond_file <- paste0(outputDir, "/MAST_RESULTS_", cluster_name, "_", column, ".rds")
133+
summary_cond_file <- paste0(outputDir, "/MAST_RESULTS_", cluster_name, "_", contrast, ".rds")
108134
saveRDS(summaryCond_column, file = summary_cond_file)
109135

110136
message("Full MAST object saved to file ", summary_cond_file)
111137

138+
################################################################################
139+
# collect MAST outputs
140+
################################################################################
112141

113142
summaryDt_column <- summaryCond_column$datatable
114-
fcHurdle_column <- merge(
115-
summaryDt_column[
116-
contrast == lrt_name & component == "H",
117-
.(primerid, `Pr(>Chisq)`)
118-
],
119-
# This extracts hurdle p-values
120-
summaryDt_column[
121-
contrast == lrt_name & component == "logFC",
122-
.(primerid, coef, ci.hi, ci.lo)
123-
],
124-
# This extract LogFC data
125-
by = "primerid"
126-
)
143+
144+
fcHurdle_column <- summaryDt_column %>%
145+
filter(component %in% c("H", "logFC"), contrast %in% lrt_names) %>%
146+
dplyr::select(-component) %>%
147+
pivot_longer(!primerid & !contrast, names_to = "metric", values_to = "value") %>%
148+
filter(!is.na(value), metric != "z") %>%
149+
pivot_wider(names_from = "metric", values_from = "value") %>%
150+
mutate(cluster_name = cluster_name) %>%
151+
relocate(cluster_name)
152+
127153
fcHurdle_column <- stats::na.omit(as.data.frame(fcHurdle_column))
128-
fcHurdle_column$fdr <- p.adjust(fcHurdle_column$`Pr(>Chisq)`, "fdr")
129-
to_save_column <- fcHurdle_column
130154

131-
full_res_file <- paste0(outputDir, "/FULL_MAST_RESULTS_", cluster_name, "_", column, ".csv")
132-
write.table(to_save_column, file = full_res_file, row.names = FALSE, sep = ",")
155+
fcHurdle_column <- lapply(lrt_names, function(curr_contrast) {
156+
fcHurdle_column %>%
157+
filter(contrast == curr_contrast) %>%
158+
mutate(fdr = p.adjust(`Pr(>Chisq)`, "fdr"))
159+
}) %>% bind_rows()
133160

134161

135-
message("MAST summary results output to csv files")
162+
full_res_file <- paste0(outputDir, "/FULL_MAST_RESULTS_", cluster_name, "_", contrast, ".csv")
163+
write.table(fcHurdle_column, file = full_res_file, row.names = FALSE, sep = ",")
136164

165+
message("MAST summary results output to csv files")
137166

138-
fcHurdleSig_column <- merge(fcHurdle_column[fcHurdle_column$fdr < .05, ],
139-
as.data.table(mcols(sca_filtered)),
140-
by = "primerid"
141-
)
142167

143-
setorder(fcHurdleSig_column, fdr)
168+
fcHurdleSig_column <- fcHurdle_column %>%
169+
filter(fdr < 0.05) %>%
170+
arrange(fdr)
144171

145-
sig_res_file <- paste0(outputDir, "/SIG_MAST_RESULTS_padj<0.05_", cluster_name, "_", column, ".csv")
172+
sig_res_file <- paste0(outputDir, "/SIG_MAST_RESULTS_padj_less_0.05_", cluster_name, "_", contrast, ".csv")
146173
write.table(fcHurdleSig_column, file = sig_res_file, row.names = FALSE, sep = ",")
147174

148175
message("Significant MAST summary results output to csv files")

03_differential_expression/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ The template aggregates single cell counts to the sample x cluster x factor_of_i
2121
- An adapted `seurat` Dotplot to show % of cells expressing top DEGs;
2222
- An integrated Violin-Box-Scatter (VBS) plot displaying the normalized expression of top DEGs per single cell across contrast groups.
2323

24-
We separately prepare the Rscript `02_differential_expression/MAST_analysis.R` to pre-compute the DEG results since MAST computation is relatively time-consuming. For our demo dataset, \~1000 cells and two group comparison, it took around 10-15 minutes.
24+
We separately prepare the Rscript `02_differential_expression/MAST_analysis.R` to pre-compute the DEG results since MAST computation is relatively time-consuming. For our demo dataset, \~1000 cells and two group comparison, it took around 10-15 minutes. If working on an HPC, you may be interested in running this as a batch job. A sample sbatch script is provided in `run_MAST_analysis.sh`.
2525

2626
To run this Rscript, you should input below parameters:
2727

@@ -39,7 +39,7 @@ To obtain more informative console logs from MAST analysis, please consider runn
3939

4040
You will expect to have three main outputs in your specified output folder:
4141

42-
- `processed_seurat.rds`: the processed seurat object containing log-normalized data
42+
- `processed_seurat.qs`: the processed seurat object containing log-normalized data
4343
- `MAST_RESULTS*`: the MAST modeling object to allow for further follow-up analysis of your own choice
4444
- `FULL_MAST_RESULTS_*`: full MAST differential expression analysis results in csv format
4545
- `SIG_MAST_RESULTS_padj<0.05*`: significant DEGs using 0.05 as the threshold for FDR
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
3+
#SBATCH --job-name=NK_MAST # Job name
4+
#SBATCH --partition=short # Partition name
5+
#SBATCH --time=0-04:00 # Runtime in D-HH:MM format
6+
#SBATCH --nodes=1 # Number of nodes (keep at 1)
7+
#SBATCH --ntasks=1 # Number of tasks per node (keep at 1)
8+
#SBATCH --mem=16G # Memory needed per node (total)
9+
#SBATCH --error=jobid_%j.err # File to which STDERR will be written, including job ID
10+
#SBATCH --output=jobid_%j.out # File to which STDOUT will be written, including job ID
11+
#SBATCH --mail-type=ALL # Type of email notification (BEGIN, END, FAIL, ALL)
12+
#SBATCH --array=1-2%2
13+
14+
module load gcc/9.2.0 imageMagick/7.1.0 geos/3.10.2 cmake/3.22.2 R/4.3.1 fftw/3.3.10 gdal/3.1.4 udunits/2.2.28 boost/1.75.0 python/3.9.14 hdf5/1.14.0
15+
16+
clusters=(2 3)
17+
cluster=${clusters[$SLURM_ARRAY_TASK_ID-1]}
18+
19+
Rscript MAST_analysis.R \
20+
--seurat_obj=https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds \
21+
--resolution_column=integrated_snn_res.0.4 \
22+
--cluster_name="${cluster}" \
23+
--contrast=age \
24+
--outputDir=out

0 commit comments

Comments
 (0)