Merge pull request #230 from jr-leary7/dev

Dev
jr-leary7 · Oct 2, 2024 · 79a5776 · 79a5776
2 parents ceb9d10 + d48cdea
commit 79a5776
Show file tree

Hide file tree

Showing 13 changed files with 373 additions and 56 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scLANE
 Type: Package
 Title: Model Gene Expression Dynamics with Spline-Based NB GLMs, GEEs, & GLMMs
-Version: 0.8.2
+Version: 0.8.3
 Authors@R: c(person(given = "Jack", family = "Leary", email = "[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")), 
              person(given = "Rhonda", family = "Bacher", email = "[email protected]", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X")))
 Description: Our scLANE model uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. 

diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,14 @@
+# Changes in version 0.8.3
+
++ Sped up GLMM mode. 
++ Switched GEE mode to use model-based variance and made computation of sandwich variance-covariance matrix conditional to speed things up and reduce memory usage. 
+
 # Changes in version 0.8.2
 
 + Sped up the NB LASSSO implementation in `fitGLMM()`. 
 + Fixed some errors related to intercept-only `marge` models. 
 + Added DF and KC corrections to new function `biasCorrectGEE()`, usage of which is set to `FALSE` by default in `testDynamic()`. 
-+ Added support for `monocle3` objects of class `CellDataSet` throughout the package, notably in `testDynamic()`. 
++ Added support for `monocle3` objects of class `cell_data_set` throughout the package, notably in `testDynamic()`. 
 
 # Changes in version 0.8.1
 

diff --git a/R/getResultsDE.R b/R/getResultsDE.R
@@ -28,7 +28,11 @@ getResultsDE <- function(test.dyn.res = NULL,
   if (is.null(test.dyn.res)) { stop("Please provide a result list.") }
   if (!p.adj.method %in% stats::p.adjust.methods) { stop("Please choose a valid p-value adjustment method.") }
   # set up parallel processing
-  future::plan(future::multisession, workers = n.cores)
+  if (n.cores > 1L) {
+    future::plan(future::multisession, workers = n.cores)
+  } else {
+    future::plan(future::sequential)
+  }
   # iterates first over genes, then over lineages per-gene & coerces to final data.frame after unlisting everything
   result_df <- furrr::future_map_dfr(test.dyn.res,
                                      function(x) {
@@ -49,6 +53,8 @@ getResultsDE <- function(test.dyn.res = NULL,
                                Test_Stat,
                                P_Val)
   # shut down parallel processing
-  future::plan(future::sequential)
+  if (n.cores > 1L) {
+    future::plan(future::sequential)
+  }
   return(result_df)
 }
diff --git a/R/marge2.R b/R/marge2.R
@@ -18,6 +18,7 @@
 #' @param Y.offset (Optional) An vector of per-cell size factors to be included in the final model fit as an offset. Defaults to NULL.
 #' @param M A set threshold for the maximum number of basis functions to be chosen. Defaults to 5.
 #' @param is.gee Should the \code{geeM} package be used to fit a negative binomial GEE? Defaults to FALSE.
+#' @param is.glmm Is the overall model to be fit a GLMM? Defaults to FALSE.
 #' @param id.vec If \code{is.gee = TRUE}, must be a vector of ID values for the observations. Data must be sorted such that the subjects are in order! Defaults to NULL.
 #' @param cor.structure If \code{is.gee = TRUE}, a string specifying the desired correlation structure for the NB GEE. Defaults to "ar1".
 #' @param sandwich.var (Optional) Should the sandwich variance estimator be used instead of the model-based estimator? Default to FALSE.
@@ -55,6 +56,7 @@ marge2 <- function(X_pred = NULL,
                    Y.offset = NULL,
                    M = 5,
                    is.gee = FALSE,
+                   is.glmm = FALSE,
                    id.vec = NULL,
                    cor.structure = "ar1",
                    sandwich.var = FALSE,
@@ -817,47 +819,58 @@ marge2 <- function(X_pred = NULL,
     model_formula <- paste(colnames(model_df), collapse = " + ")
     model_formula <- paste0("Y ~ -1 + ", model_formula)
   }
-  if (!is.null(Y.offset)) {
-    model_df <- dplyr::mutate(model_df,
-                              cell_offset = Y.offset)
-    model_formula <- paste0(model_formula, " + ", "offset(log(1 / cell_offset))")  # this is correct i promise
-  }
-  model_formula <- stats::as.formula(model_formula)
-  if (is.gee) {
-    final_mod <- geeM::geem(model_formula,
-                            data = model_df,
-                            id = id.vec,
-                            family = MASS::negative.binomial(theta_hat, link = log),
-                            corstr = cor.structure,
-                            scale.fix = FALSE,
-                            sandwich = sandwich.var)
-  } else {
-    final_mod <- MASS::glm.nb(model_formula,
+  if (!is.glmm) {
+    if (!is.null(Y.offset)) {
+      model_df <- dplyr::mutate(model_df,
+                                cell_offset = Y.offset)
+      model_formula <- paste0(model_formula, " + ", "offset(log(1 / cell_offset))")  # this is correct i promise
+    }
+    model_formula <- stats::as.formula(model_formula)
+    if (is.gee) {
+      final_mod <- geeM::geem(model_formula,
                               data = model_df,
-                              method = "glm.fit2",
-                              link = log,
-                              init.theta = 1,
-                              y = FALSE,
-                              model = FALSE)
-  }
-  # format results
-  if (!is.gee) {
-    final_mod <- stripGLM(glm.obj = final_mod)
+                              id = id.vec,
+                              family = MASS::negative.binomial(theta_hat, link = log),
+                              corstr = cor.structure,
+                              scale.fix = FALSE,
+                              sandwich = sandwich.var)
+    } else {
+      final_mod <- MASS::glm.nb(model_formula,
+                                data = model_df,
+                                method = "glm.fit2",
+                                link = log,
+                                init.theta = 1,
+                                y = FALSE,
+                                model = FALSE)
+    }
+    # format results
+    if (!is.gee) {
+      final_mod <- stripGLM(glm.obj = final_mod)
+    }
+    res <- list(final_mod = final_mod,
+                basis_mtx = NULL,
+                WIC_mtx = NULL,
+                GCV = NULL,
+                model_type = ifelse(is.gee, "GEE", "GLM"),
+                coef_names = names(stats::coef(final_mod)),
+                marge_coef_names = colnames(B_final))
+    if (return.GCV) {
+      df1a <- ncol(B_final) + pen * (ncol(B_final) - 1) / 2  # This matches the {earth} package, SAS and Friedman (1991) penalty
+      res$GCV <- sum((Y - stats::fitted(final_mod))^2) / (NN * (1 - df1a / NN)^2)
+    }
+  } else {
+    res <- list(final_mod = NULL,
+                basis_mtx = NULL,
+                WIC_mtx = NULL,
+                GCV = NULL,
+                model_type = ifelse(is.gee, "GEE", ifelse(is.glmm, "GLMM", "GLM")),
+                coef_names = NULL,
+                marge_coef_names = colnames(B_final))
   }
-  res <- list(final_mod = final_mod,
-              basis_mtx = NULL,
-              WIC_mtx = NULL,
-              GCV = NULL,
-              model_type = ifelse(is.gee, "GEE", "GLM"),
-              coef_names = names(stats::coef(final_mod)),
-              marge_coef_names = colnames(B_final))
+
   if (return.basis) {
     res$basis_mtx <- model_df
   }
-  if (return.GCV) {
-    df1a <- ncol(B_final) + pen * (ncol(B_final) - 1) / 2  # This matches the {earth} package, SAS and Friedman (1991) penalty
-    res$GCV <- sum((Y - stats::fitted(final_mod))^2) / (NN * (1 - df1a / NN)^2)
-  }
   if (return.WIC) {
     res$WIC_mtx <- wic_mat_2
   }

diff --git a/R/pullMARGESummary.R b/R/pullMARGESummary.R
@@ -0,0 +1,102 @@
+#' Generate a summary of the MARGE model.
+#'
+#' @name pullMARGESummary
+#' @author Jack R. Leary
+#' @author Rhonda Bacher
+#' @description This function takes in the \code{MARGE} model fitted during the running of \code{\link{marge2}} and summarizes it.
+#' @import magrittr
+#' @importFrom stats predict logLik deviance
+#' @importFrom broom.mixed tidy
+#' @importFrom dplyr rename
+#' @param marge.model The \code{MARGE} model from \code{\link{marge2}}. Defaults to NULL.
+#' @param is.gee Boolean specifying whether GEE mode was used in fitting the null model. Defaults to FALSE.
+#' @param sandwich.var Boolean specifying whether the robust sandwich variance-covariance matrix should be used. Defaults to FALSE.
+#' @param is.glmm Boolean specifying whether the GLMM mode was used in fitting the model. Defaults to FALSE.
+#' @return A list containing a coefficient summary, fitted values and their standard errors, and the log-likelihood and deviance of the model.
+#' @seealso \code{\link{marge2}}
+
+pullMARGESummary <- function(marge.model = NULL,
+                             is.gee = FALSE,
+                             sandwich.var = FALSE,
+                             is.glmm = FALSE) {
+  # check inputs
+  if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary") }
+  # handle the degenerate case
+  if (inherits(marge.model, "try-error")) {
+    res <- list(marge_pred_df = NULL,
+                marge_sumy_df = NULL,
+                ll_marge = NA_real_,
+                marge_fit_notes = marge.model[1])
+  } else {
+    # pull model summary
+    if (is.gee) {
+      if (sandwich.var) {
+        vcov_mat <- as.matrix(marge.model$final_mod$var)
+      } else {
+        vcov_mat <- as.matrix(marge.model$final_mod$naiv.var)
+      }
+      marge_pred_df <- try({
+        data.frame(marge_link_fit = predict(marge.model$final_mod),
+                   marge_link_se = sqrt(apply(tcrossprod(marge.model$final_mod$X, vcov_mat) * marge.model$final_mod$X, 1, sum)))
+      }, silent = TRUE)
+      marge_sumy_df <- try({
+        marge_gee_summary <- summary(marge.model$final_mod)
+        data.frame(term = marge_gee_summary$coefnames,
+                   estimate = unname(marge_gee_summary$beta),
+                   `std.error` = ifelse(sandwich.var, unname(marge_gee_summary$se.robust), unname(marge_gee_summary$se.model)),
+                   statistic = unname(marge_gee_summary$wald.test),
+                   `p.value` = unname(marge_gee_summary$p))
+      }, silent = TRUE)
+
+    } else if (is.glmm) {
+      marge_pred_df <- try({
+        data.frame(predict(marge.model$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>%
+          dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit)
+      }, silent = TRUE)
+      marge_sumy_df <- try({
+        marge_glmm_summary <- broom.mixed::tidy(marge.model$final_mod, effects = "fixed")
+        data.frame(term = marge.model$marge_coef_names,
+                   estimate = marge_glmm_summary$estimate,
+                   `std.error` = marge_glmm_summary$std.error,
+                   statistic = marge_glmm_summary$statistic,
+                   `p.value` = marge_glmm_summary$p.value)
+      }, silent = TRUE)
+
+    } else {
+      marge_pred_df <- try({
+        data.frame(stats::predict(marge.model$final_mod, type = "link", se.fit = TRUE)[seq(2)]) %>%
+          dplyr::rename(marge_link_fit = fit, marge_link_se = se.fit)
+      }, silent = TRUE)
+      marge_sumy_df <- try({
+        as.data.frame(broom.mixed::tidy(marge.model$final_mod)) %>%
+          lapply(unname) %>%
+          as.data.frame()
+      }, silent = TRUE)
+    }
+
+    # get log-likelihood for GLMM or GLM cases
+    if (is.glmm) {
+      ll_marge <- -marge.model$final_mod$fit$objective
+    } else if (is.gee) {
+      ll_marge <- NA_real_
+    } else {
+      ll_marge <- as.numeric(stats::logLik(marge.model$final_mod))
+    }
+    # check positive-definiteness of Hessian for GLMM -- might have an effect on LRT stat / accompanying p-value
+    if (is.glmm) {
+      if (!marge.model$final_mod$sdr$pdHess) {
+        marge_fit_notes <- "Non-positive definite Hessian in GLMM, probably due to shallow log-likelihood. Be careful!"
+      } else {
+        marge_fit_notes <- NA_character_
+      }
+    } else {
+      marge_fit_notes <- NA_character_
+    }
+    res <- list(marge_pred_df = marge_pred_df,
+                marge_sumy_df = marge_sumy_df,
+                ll_marge = ll_marge,
+                marge_dev = ifelse(is.gee, NA_real_, stats::deviance(marge.model$final_mod)),
+                marge_fit_notes = marge_fit_notes)
+  }
+  return(res)
+}
diff --git a/R/pullNullSummary.R b/R/pullNullSummary.R
@@ -0,0 +1,79 @@
+#' Generate a summary of the null model.
+#'
+#' @name pullNullSummary
+#' @author Jack R. Leary
+#' @author Rhonda Bacher
+#' @description This function takes in the null model fitted during the running of \code{\link{marge2}} and summarizes it.
+#' @import magrittr
+#' @importFrom stats predict logLik deviance
+#' @importFrom broom.mixed tidy
+#' @importFrom dplyr rename
+#' @param null.model The null model from \code{\link{marge2}}. Defaults to NULL.
+#' @param is.gee Boolean specifying whether GEE mode was used in fitting the null model. Defaults to FALSE.
+#' @param sandwich.var Boolean specifying whether the robust sandwich variance-covariance matrix should be used. Defaults to FALSE.
+#' @param is.glmm Boolean specifying whether the GLMM mode was used in fitting the model. Defaults to FALSE.
+#' @return A list containing a coefficient summary, fitted values and their standard errors, and the log-likelihood and deviance of the model.
+#' @seealso \code{\link{marge2}}
+
+pullNullSummary <- function(null.model = NULL,
+                            is.gee = FALSE,
+                            sandwich.var = FALSE,
+                            is.glmm = FALSE) {
+  # check inputs
+  if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary") }
+  # handle the degenerate case
+  if (inherits(null.model, "try-error")) {
+    res <- list(null_sumy_df = NULL,
+                null_pred_df = NULL,
+                null_ll = NA_real_,
+                null_dev = NA_real_,
+                null_fit_notes = null.model[1])
+  } else {
+    # pull model summary
+    if (is.gee) {
+      null_sumy_df <- try({
+        null_gee_summary <- summary(null.model)
+        data.frame(term = null_gee_summary$coefnames,
+                   estimate = unname(null_gee_summary$beta),
+                   `std.error` = ifelse(sandwich.var, unname(null_gee_summary$se.robust), unname(null_gee_summary$se.model)),
+                   statistic = unname(null_gee_summary$wald.test),
+                   `p.value` = unname(null_gee_summary$p))
+      }, silent = TRUE)
+      null_pred_df <- try({
+        if (sandwich.var) {
+          vcov_mat <- as.matrix(null.model$var)
+        } else {
+          vcov_mat <- as.matrix(null.model$naiv.var)
+        }
+        data.frame(null_link_fit = stats::predict(null.model),
+                   null_link_se = as.numeric(sqrt(tcrossprod(null.model$X, vcov_mat))))
+      }, silent = TRUE)
+    } else if (is.glmm) {
+      null_sumy_df <- try({
+        null_glmm_summary <- as.data.frame(broom.mixed::tidy(null.model, effects = "fixed"))
+        data.frame(term = null_glmm_summary$term,
+                   estimate = null_glmm_summary$estimate[1],
+                   `std.error` = null_glmm_summary$std.error[1],
+                   statistic = null_glmm_summary$statistic[1],
+                   `p.value` = null_glmm_summary$p.value[1])
+      }, silent = TRUE)
+      null_pred_df <- try({
+        data.frame(stats::predict(null.model, type = "link", se.fit = TRUE)[seq(2)]) %>%
+          dplyr::rename(null_link_fit = fit, null_link_se = se.fit)
+      }, silent = TRUE)
+    }  else {
+      null_sumy_df <- try({
+        as.data.frame(broom.mixed::tidy(null.model)) # saves a few bytes by converting from tibble
+      }, silent = TRUE)
+      null_pred_df <- try({
+        data.frame(stats::predict(null.model, type = "link", se.fit = TRUE)[seq(2)]) %>%
+          dplyr::rename(null_link_fit = fit, null_link_se = se.fit)
+      }, silent = TRUE)
+    }
+    res <- list(null_sumy_df = null_sumy_df,
+                null_pred_df = null_pred_df,
+                null_ll = ifelse(is.gee, NA_real_, as.numeric(stats::logLik(null.model))),
+                null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model)))
+  }
+  return(res)
+}
diff --git a/R/testDynamic.R b/R/testDynamic.R
@@ -38,7 +38,7 @@
 #' \itemize{
 #' \item If \code{expr.mat} is a \code{Seurat} object, counts will be extracted from the output of \code{\link[SeuratObject]{DefaultAssay}}. If using this functionality, check to ensure the specified assay is correct before running the function. If the input is a \code{SingleCellExperiment} or \code{CellDataSet} object, the raw counts will be extracted with \code{\link[BiocGenerics]{counts}}.
 #' \item If using the GEE or GLMM model architectures, ensure that the observations are sorted by subject ID (this is assumed by the underlying fit implementations). If they are not, the models will error out.
-#' \item If \code{gee.bias.correction.method} is set to "kc" or "df", a bias adjustment will be used to inflate the robust variance-covariance matrix prior to estimating the Wald test statistic. This is useful when the number of subjects is small and / or the number of per-subject observations is very large. Doing so will remove the bias in the sandwhich estimator in small-sample cases. Currently, we suggest keeping this NULL and using the model-based variance estimates and specifying the "ar1" correlation structure.
+#' \item If \code{gee.bias.correction.method} is set to "kc" or "df", a bias adjustment will be used to inflate the robust variance-covariance matrix prior to estimating the Wald test statistic. This is useful when the number of subjects is small and / or the number of per-subject observations is very large. Doing so will remove the bias in the sandwich estimator in small-sample cases. Currently, we suggest keeping this NULL and using the model-based variance estimates and specifying the "ar1" correlation structure.
 #' }
 #' @return A list of lists, where each element is a gene and each gene contains sublists for each element. Each gene-lineage sublist contains a gene name, lineage number, default \code{marge} vs. null model test results, model statistics, and fitted values. Use \code{\link{getResultsDE}} to tidy the results.
 #' @seealso \code{\link{getResultsDE}}
@@ -154,11 +154,13 @@ testDynamic <- function(expr.mat = NULL,
                    ls()[-which(ls() %in% necessary_vars)])
   }
   no_export <- unique(no_export)
-  package_list <- c("glm2", "scLANE", "MASS",  "bigstatsr", "broom.mixed", "dplyr", "stats")
+  package_list <- c("scLANE", "MASS",  "bigstatsr", "broom.mixed", "dplyr", "stats")
   if (is.gee) {
     package_list <- c(package_list, "geeM")
   } else if (is.glmm) {
     package_list <- c(package_list, "glmmTMB")
+  } else {
+    package_list <- c(package_list, "glm2")
   }
 
   # build models per-lineage per-gene, parallelize over genes
@@ -283,8 +285,13 @@ testDynamic <- function(expr.mat = NULL,
       }
 
      # summarize hinge function coefficients
-     null_sumy <- pull.null.sumy(null_mod, is.gee, is.glmm)
-     marge_sumy <- pull.marge.sumy(marge_mod, is.gee, is.glmm)
+     null_sumy <- pullNullSummary(null_mod,
+                                  is.gee = is.gee,
+                                  is.glmm = is.glmm)
+     marge_sumy <- pullMARGESummary(marge_mod,
+                                    is.gee = is.gee,
+                                    sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE),
+                                    is.glmm = is.glmm)
 
      # perform slope test
      marge_slope_df <- createSlopeTestData(marge_mod,