Skip to content

Commit

Permalink
Merge pull request #230 from jr-leary7/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jr-leary7 authored Oct 2, 2024
2 parents ceb9d10 + d48cdea commit 79a5776
Show file tree
Hide file tree
Showing 13 changed files with 373 additions and 56 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: scLANE
Type: Package
Title: Model Gene Expression Dynamics with Spline-Based NB GLMs, GEEs, & GLMMs
Version: 0.8.2
Version: 0.8.3
Authors@R: c(person(given = "Jack", family = "Leary", email = "[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")),
person(given = "Rhonda", family = "Bacher", email = "[email protected]", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X")))
Description: Our scLANE model uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time.
Expand Down
7 changes: 6 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# Changes in version 0.8.3

+ Sped up GLMM mode.
+ Switched GEE mode to use model-based variance and made computation of sandwich variance-covariance matrix conditional to speed things up and reduce memory usage.

# Changes in version 0.8.2

+ Sped up the NB LASSSO implementation in `fitGLMM()`.
+ Fixed some errors related to intercept-only `marge` models.
+ Added DF and KC corrections to new function `biasCorrectGEE()`, usage of which is set to `FALSE` by default in `testDynamic()`.
+ Added support for `monocle3` objects of class `CellDataSet` throughout the package, notably in `testDynamic()`.
+ Added support for `monocle3` objects of class `cell_data_set` throughout the package, notably in `testDynamic()`.

# Changes in version 0.8.1

Expand Down
10 changes: 8 additions & 2 deletions R/getResultsDE.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ getResultsDE <- function(test.dyn.res = NULL,
if (is.null(test.dyn.res)) { stop("Please provide a result list.") }
if (!p.adj.method %in% stats::p.adjust.methods) { stop("Please choose a valid p-value adjustment method.") }
# set up parallel processing
future::plan(future::multisession, workers = n.cores)
if (n.cores > 1L) {
future::plan(future::multisession, workers = n.cores)
} else {
future::plan(future::sequential)
}
# iterates first over genes, then over lineages per-gene & coerces to final data.frame after unlisting everything
result_df <- furrr::future_map_dfr(test.dyn.res,
function(x) {
Expand All @@ -49,6 +53,8 @@ getResultsDE <- function(test.dyn.res = NULL,
Test_Stat,
P_Val)
# shut down parallel processing
future::plan(future::sequential)
if (n.cores > 1L) {
future::plan(future::sequential)
}
return(result_df)
}
85 changes: 49 additions & 36 deletions R/marge2.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#' @param Y.offset (Optional) An vector of per-cell size factors to be included in the final model fit as an offset. Defaults to NULL.
#' @param M A set threshold for the maximum number of basis functions to be chosen. Defaults to 5.
#' @param is.gee Should the \code{geeM} package be used to fit a negative binomial GEE? Defaults to FALSE.
#' @param is.glmm Is the overall model to be fit a GLMM? Defaults to FALSE.
#' @param id.vec If \code{is.gee = TRUE}, must be a vector of ID values for the observations. Data must be sorted such that the subjects are in order! Defaults to NULL.
#' @param cor.structure If \code{is.gee = TRUE}, a string specifying the desired correlation structure for the NB GEE. Defaults to "ar1".
#' @param sandwich.var (Optional) Should the sandwich variance estimator be used instead of the model-based estimator? Default to FALSE.
Expand Down Expand Up @@ -55,6 +56,7 @@ marge2 <- function(X_pred = NULL,
Y.offset = NULL,
M = 5,
is.gee = FALSE,
is.glmm = FALSE,
id.vec = NULL,
cor.structure = "ar1",
sandwich.var = FALSE,
Expand Down Expand Up @@ -817,47 +819,58 @@ marge2 <- function(X_pred = NULL,
model_formula <- paste(colnames(model_df), collapse = " + ")
model_formula <- paste0("Y ~ -1 + ", model_formula)
}
if (!is.null(Y.offset)) {
model_df <- dplyr::mutate(model_df,
cell_offset = Y.offset)
model_formula <- paste0(model_formula, " + ", "offset(log(1 / cell_offset))") # this is correct i promise
}
model_formula <- stats::as.formula(model_formula)
if (is.gee) {
final_mod <- geeM::geem(model_formula,
data = model_df,
id = id.vec,
family = MASS::negative.binomial(theta_hat, link = log),
corstr = cor.structure,
scale.fix = FALSE,
sandwich = sandwich.var)
} else {
final_mod <- MASS::glm.nb(model_formula,
if (!is.glmm) {
if (!is.null(Y.offset)) {
model_df <- dplyr::mutate(model_df,
cell_offset = Y.offset)
model_formula <- paste0(model_formula, " + ", "offset(log(1 / cell_offset))") # this is correct i promise
}
model_formula <- stats::as.formula(model_formula)
if (is.gee) {
final_mod <- geeM::geem(model_formula,
data = model_df,
method = "glm.fit2",
link = log,
init.theta = 1,
y = FALSE,
model = FALSE)
}
# format results
if (!is.gee) {
final_mod <- stripGLM(glm.obj = final_mod)
id = id.vec,
family = MASS::negative.binomial(theta_hat, link = log),
corstr = cor.structure,
scale.fix = FALSE,
sandwich = sandwich.var)
} else {
final_mod <- MASS::glm.nb(model_formula,
data = model_df,
method = "glm.fit2",
link = log,
init.theta = 1,
y = FALSE,
model = FALSE)
}
# format results
if (!is.gee) {
final_mod <- stripGLM(glm.obj = final_mod)
}
res <- list(final_mod = final_mod,
basis_mtx = NULL,
WIC_mtx = NULL,
GCV = NULL,
model_type = ifelse(is.gee, "GEE", "GLM"),
coef_names = names(stats::coef(final_mod)),
marge_coef_names = colnames(B_final))
if (return.GCV) {
df1a <- ncol(B_final) + pen * (ncol(B_final) - 1) / 2 # This matches the {earth} package, SAS and Friedman (1991) penalty
res$GCV <- sum((Y - stats::fitted(final_mod))^2) / (NN * (1 - df1a / NN)^2)
}
} else {
res <- list(final_mod = NULL,
basis_mtx = NULL,
WIC_mtx = NULL,
GCV = NULL,
model_type = ifelse(is.gee, "GEE", ifelse(is.glmm, "GLMM", "GLM")),
coef_names = NULL,
marge_coef_names = colnames(B_final))
}
res <- list(final_mod = final_mod,
basis_mtx = NULL,
WIC_mtx = NULL,
GCV = NULL,
model_type = ifelse(is.gee, "GEE", "GLM"),
coef_names = names(stats::coef(final_mod)),
marge_coef_names = colnames(B_final))

if (return.basis) {
res$basis_mtx <- model_df
}
if (return.GCV) {
df1a <- ncol(B_final) + pen * (ncol(B_final) - 1) / 2 # This matches the {earth} package, SAS and Friedman (1991) penalty
res$GCV <- sum((Y - stats::fitted(final_mod))^2) / (NN * (1 - df1a / NN)^2)
}
if (return.WIC) {
res$WIC_mtx <- wic_mat_2
}
Expand Down
102 changes: 102 additions & 0 deletions R/pullMARGESummary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#' Generate a summary of the MARGE model.
#'
#' @name pullMARGESummary
#' @author Jack R. Leary
#' @author Rhonda Bacher
#' @description This function takes in the \code{MARGE} model fitted during the running of \code{\link{marge2}} and summarizes it.
#' @import magrittr
#' @importFrom stats predict logLik deviance
#' @importFrom broom.mixed tidy
#' @importFrom dplyr rename
#' @param marge.model The \code{MARGE} model from \code{\link{marge2}}. Defaults to NULL.
#' @param is.gee Boolean specifying whether GEE mode was used in fitting the null model. Defaults to FALSE.
#' @param sandwich.var Boolean specifying whether the robust sandwich variance-covariance matrix should be used. Defaults to FALSE.
#' @param is.glmm Boolean specifying whether the GLMM mode was used in fitting the model. Defaults to FALSE.
#' @return A list containing a coefficient summary, fitted values and their standard errors, and the log-likelihood and deviance of the model.
#' @seealso \code{\link{marge2}}

pullMARGESummary <- function(marge.model = NULL,
is.gee = FALSE,
sandwich.var = FALSE,
is.glmm = FALSE) {
# check inputs
if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary") }
# handle the degenerate case
if (inherits(marge.model, "try-error")) {
res <- list(marge_pred_df = NULL,
marge_sumy_df = NULL,
ll_marge = NA_real_,
marge_fit_notes = marge.model[1])
} else {
# pull model summary
if (is.gee) {
if (sandwich.var) {
vcov_mat <- as.matrix(marge.model$final_mod$var)
} else {
vcov_mat <- as.matrix(marge.model$final_mod$naiv.var)
}
marge_pred_df <- try({
data.frame(marge_link_fit = predict(marge.model$final_mod),
marge_link_se = sqrt(apply(tcrossprod(marge.model$final_mod$X, vcov_mat) * marge.model$final_mod$X, 1, sum)))
}, silent = TRUE)
marge_sumy_df <- try({
marge_gee_summary <- summary(marge.model$final_mod)
data.frame(term = marge_gee_summary$coefnames,
estimate = unname(marge_gee_summary$beta),
`std.error` = ifelse(sandwich.var, unname(marge_gee_summary$se.robust), unname(marge_gee_summary$se.model)),
statistic = unname(marge_gee_summary$wald.test),
`p.value` = unname(marge_gee_summary$p))
}, silent = TRUE)

} else if (is.glmm) {
marge_pred_df <- try({
data.frame(predict(marge.model$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>%
dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit)
}, silent = TRUE)
marge_sumy_df <- try({
marge_glmm_summary <- broom.mixed::tidy(marge.model$final_mod, effects = "fixed")
data.frame(term = marge.model$marge_coef_names,
estimate = marge_glmm_summary$estimate,
`std.error` = marge_glmm_summary$std.error,
statistic = marge_glmm_summary$statistic,
`p.value` = marge_glmm_summary$p.value)
}, silent = TRUE)

} else {
marge_pred_df <- try({
data.frame(stats::predict(marge.model$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>%
dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit)
}, silent = TRUE)
marge_sumy_df <- try({
as.data.frame(broom.mixed::tidy(marge.model$final_mod)) %>%
lapply(unname) %>%
as.data.frame()
}, silent = TRUE)
}

# get log-likelihood for GLMM or GLM cases
if (is.glmm) {
ll_marge <- -marge.model$final_mod$fit$objective
} else if (is.gee) {
ll_marge <- NA_real_
} else {
ll_marge <- as.numeric(stats::logLik(marge.model$final_mod))
}
# check positive-definiteness of Hessian for GLMM -- might have an effect on LRT stat / accompanying p-value
if (is.glmm) {
if (!marge.model$final_mod$sdr$pdHess) {
marge_fit_notes <- "Non-positive definite Hessian in GLMM, probably due to shallow log-likelihood. Be careful!"
} else {
marge_fit_notes <- NA_character_
}
} else {
marge_fit_notes <- NA_character_
}
res <- list(marge_pred_df = marge_pred_df,
marge_sumy_df = marge_sumy_df,
ll_marge = ll_marge,
marge_dev = ifelse(is.gee, NA_real_, stats::deviance(marge.model$final_mod)),
marge_fit_notes = marge_fit_notes)
}
return(res)
}
79 changes: 79 additions & 0 deletions R/pullNullSummary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#' Generate a summary of the null model.
#'
#' @name pullNullSummary
#' @author Jack R. Leary
#' @author Rhonda Bacher
#' @description This function takes in the null model fitted during the running of \code{\link{marge2}} and summarizes it.
#' @import magrittr
#' @importFrom stats predict logLik deviance
#' @importFrom broom.mixed tidy
#' @importFrom dplyr rename
#' @param null.model The null model from \code{\link{marge2}}. Defaults to NULL.
#' @param is.gee Boolean specifying whether GEE mode was used in fitting the null model. Defaults to FALSE.
#' @param sandwich.var Boolean specifying whether the robust sandwich variance-covariance matrix should be used. Defaults to FALSE.
#' @param is.glmm Boolean specifying whether the GLMM mode was used in fitting the model. Defaults to FALSE.
#' @return A list containing a coefficient summary, fitted values and their standard errors, and the log-likelihood and deviance of the model.
#' @seealso \code{\link{marge2}}

pullNullSummary <- function(null.model = NULL,
is.gee = FALSE,
sandwich.var = FALSE,
is.glmm = FALSE) {
# check inputs
if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary") }
# handle the degenerate case
if (inherits(null.model, "try-error")) {
res <- list(null_sumy_df = NULL,
null_pred_df = NULL,
null_ll = NA_real_,
null_dev = NA_real_,
null_fit_notes = null.model[1])
} else {
# pull model summary
if (is.gee) {
null_sumy_df <- try({
null_gee_summary <- summary(null.model)
data.frame(term = null_gee_summary$coefnames,
estimate = unname(null_gee_summary$beta),
`std.error` = ifelse(sandwich.var, unname(null_gee_summary$se.robust), unname(null_gee_summary$se.model)),
statistic = unname(null_gee_summary$wald.test),
`p.value` = unname(null_gee_summary$p))
}, silent = TRUE)
null_pred_df <- try({
if (sandwich.var) {
vcov_mat <- as.matrix(null.model$var)
} else {
vcov_mat <- as.matrix(null.model$naiv.var)
}
data.frame(null_link_fit = stats::predict(null.model),
null_link_se = as.numeric(sqrt(tcrossprod(null.model$X, vcov_mat))))
}, silent = TRUE)
} else if (is.glmm) {
null_sumy_df <- try({
null_glmm_summary <- as.data.frame(broom.mixed::tidy(null.model, effects = "fixed"))
data.frame(term = null_glmm_summary$term,
estimate = null_glmm_summary$estimate[1],
`std.error` = null_glmm_summary$std.error[1],
statistic = null_glmm_summary$statistic[1],
`p.value` = null_glmm_summary$p.value[1])
}, silent = TRUE)
null_pred_df <- try({
data.frame(stats::predict(null.model, type = "link", se.fit = TRUE)[seq(2)]) %>%
dplyr::rename(null_link_fit = fit, null_link_se = se.fit)
}, silent = TRUE)
} else {
null_sumy_df <- try({
as.data.frame(broom.mixed::tidy(null.model)) # saves a few bytes by converting from tibble
}, silent = TRUE)
null_pred_df <- try({
data.frame(stats::predict(null.model, type = "link", se.fit = TRUE)[seq(2)]) %>%
dplyr::rename(null_link_fit = fit, null_link_se = se.fit)
}, silent = TRUE)
}
res <- list(null_sumy_df = null_sumy_df,
null_pred_df = null_pred_df,
null_ll = ifelse(is.gee, NA_real_, as.numeric(stats::logLik(null.model))),
null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model)))
}
return(res)
}
15 changes: 11 additions & 4 deletions R/testDynamic.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
#' \itemize{
#' \item If \code{expr.mat} is a \code{Seurat} object, counts will be extracted from the output of \code{\link[SeuratObject]{DefaultAssay}}. If using this functionality, check to ensure the specified assay is correct before running the function. If the input is a \code{SingleCellExperiment} or \code{CellDataSet} object, the raw counts will be extracted with \code{\link[BiocGenerics]{counts}}.
#' \item If using the GEE or GLMM model architectures, ensure that the observations are sorted by subject ID (this is assumed by the underlying fit implementations). If they are not, the models will error out.
#' \item If \code{gee.bias.correction.method} is set to "kc" or "df", a bias adjustment will be used to inflate the robust variance-covariance matrix prior to estimating the Wald test statistic. This is useful when the number of subjects is small and / or the number of per-subject observations is very large. Doing so will remove the bias in the sandwhich estimator in small-sample cases. Currently, we suggest keeping this NULL and using the model-based variance estimates and specifying the "ar1" correlation structure.
#' \item If \code{gee.bias.correction.method} is set to "kc" or "df", a bias adjustment will be used to inflate the robust variance-covariance matrix prior to estimating the Wald test statistic. This is useful when the number of subjects is small and / or the number of per-subject observations is very large. Doing so will remove the bias in the sandwich estimator in small-sample cases. Currently, we suggest keeping this NULL and using the model-based variance estimates and specifying the "ar1" correlation structure.
#' }
#' @return A list of lists, where each element is a gene and each gene contains sublists for each element. Each gene-lineage sublist contains a gene name, lineage number, default \code{marge} vs. null model test results, model statistics, and fitted values. Use \code{\link{getResultsDE}} to tidy the results.
#' @seealso \code{\link{getResultsDE}}
Expand Down Expand Up @@ -154,11 +154,13 @@ testDynamic <- function(expr.mat = NULL,
ls()[-which(ls() %in% necessary_vars)])
}
no_export <- unique(no_export)
package_list <- c("glm2", "scLANE", "MASS", "bigstatsr", "broom.mixed", "dplyr", "stats")
package_list <- c("scLANE", "MASS", "bigstatsr", "broom.mixed", "dplyr", "stats")
if (is.gee) {
package_list <- c(package_list, "geeM")
} else if (is.glmm) {
package_list <- c(package_list, "glmmTMB")
} else {
package_list <- c(package_list, "glm2")
}

# build models per-lineage per-gene, parallelize over genes
Expand Down Expand Up @@ -283,8 +285,13 @@ testDynamic <- function(expr.mat = NULL,
}

# summarize hinge function coefficients
null_sumy <- pull.null.sumy(null_mod, is.gee, is.glmm)
marge_sumy <- pull.marge.sumy(marge_mod, is.gee, is.glmm)
null_sumy <- pullNullSummary(null_mod,
is.gee = is.gee,
is.glmm = is.glmm)
marge_sumy <- pullMARGESummary(marge_mod,
is.gee = is.gee,
sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE),
is.glmm = is.glmm)

# perform slope test
marge_slope_df <- createSlopeTestData(marge_mod,
Expand Down
Loading

0 comments on commit 79a5776

Please sign in to comment.