From e40862c1b5bff223a48460c5500aac212f71efde Mon Sep 17 00:00:00 2001
From: Jack Leary <jrleary@live.unc.edu>
Date: Mon, 16 Sep 2024 11:22:49 -0400
Subject: [PATCH 1/2] bumped version to 0.8.1

---
 DESCRIPTION              | 2 +-
 NEWS.md                  | 6 +++++-
 R/chooseCandidateGenes.R | 4 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index bf4f313..41061c8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scLANE
 Type: Package
 Title: Model gene expression dynamics with spline-based NB GLMs, GEEs, & GLMMs
-Version: 0.8.0
+Version: 0.8.1
 Authors@R: c(person(given = "Jack", family = "Leary", email = "j.leary@ufl.edu", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")), 
              person(given = "Rhonda", family = "Bacher", email = "rbacher@ufl.edu", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X")))
 Description: scLANE uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. 
diff --git a/NEWS.md b/NEWS.md
index 4eab793..1c44a9d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,11 @@
+# Changes in version 0.8.1
+
++ Added a function called `chooseCandidateGenes()` to identify good genes for trajectory DE testing based on mean / SD expression and sparsity. 
+
 # Changes in version 0.8.0
 
 + Added implicit regularization of selected basis functions to the GLMM mode using a NB LASSO. 
-+ Switched candidate knot subsampling to a uniform sequence of candidate knots across pseudotime's support. 
++ Switched candidate knot subsampling to a uniform sequence of candidate1 knots across pseudotime's support. 
 
 # Changes in version 0.7.9
 
diff --git a/R/chooseCandidateGenes.R b/R/chooseCandidateGenes.R
index 45c2d02..1de5ce3 100644
--- a/R/chooseCandidateGenes.R
+++ b/R/chooseCandidateGenes.R
@@ -44,7 +44,7 @@ chooseCandidateGenes <- function(obj = NULL,
     grouped_stats <- purrr::map(seq(unique(id.vec)), \(i) {
       sub_matrix <- counts_matrix[, which(id.vec == unique(id.vec)[i])]
       gene_means <- Matrix::rowMeans(sub_matrix)
-      gene_sds <- sqrt(Matrix::rowMeans((sub_matrix - Matrix::rowMeans(sub_matrix))^2))
+      gene_sds <- sqrt(Matrix::rowMeans((sub_matrix - gene_means)^2))
       gene_sparsity <- Matrix::rowMeans(sub_matrix == 0)
       res <- data.frame(subject = unique(id.vec)[i],
                         gene = rownames(sub_matrix),
@@ -62,7 +62,7 @@ chooseCandidateGenes <- function(obj = NULL,
 
   } else {
     gene_means <- Matrix::rowMeans(counts_matrix)
-    gene_sds <- sqrt(Matrix::rowMeans((counts_matrix - Matrix::rowMeans(counts_matrix))^2))
+    gene_sds <- sqrt(Matrix::rowMeans((counts_matrix - gene_means)^2))
     gene_sparsity <- Matrix::rowMeans(counts_matrix == 0)
     gene_df <- data.frame(gene = rownames(counts_matrix),
                           mu = unname(gene_means),

From 021ae1ebd96ed7a4be97d70b0905ddade67cbcaf Mon Sep 17 00:00:00 2001
From: Jack Leary <j.leary@ufl.edu>
Date: Thu, 19 Sep 2024 20:43:20 -0400
Subject: [PATCH 2/2] added small-sample bias correction method to GEE wald
 test -- related to #197

---
 DESCRIPTION        |  2 +-
 R/testDynamic.R    |  4 +++-
 R/waldTestGEE.R    | 18 +++++++++++++++---
 man/waldTestGEE.Rd |  9 ++++++++-
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 41061c8..9904ad5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: scLANE uses truncated power basis spline models to build flexible,
              Downstream analysis functionalities include model comparison, dynamic gene clustering, smoothed counts generation, gene set enrichment testing, & visualization. 
 License: MIT + file LICENSE
 Encoding: UTF-8
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Depends: 
   glm2, 
   magrittr, 
diff --git a/R/testDynamic.R b/R/testDynamic.R
index 71f7849..ea149d9 100644
--- a/R/testDynamic.R
+++ b/R/testDynamic.R
@@ -331,7 +331,9 @@ testDynamic <- function(expr.mat = NULL,
 
      # compute test stat using asymptotic Chi-squared approximation
      if (is.gee) {
-       test_res <- waldTestGEE(mod.1 = marge_mod, mod.0 = null_mod)
+       test_res <- waldTestGEE(mod.1 = marge_mod, 
+                               mod.0 = null_mod, 
+                               bias.correct = ifelse(length(unique(id.vec)) < 30, TRUE, FALSE))
      } else {
        test_res <- modelLRT(mod.1 = marge_mod,
                             mod.0 = null_mod,
diff --git a/R/waldTestGEE.R b/R/waldTestGEE.R
index 33ab6cc..48bd17f 100644
--- a/R/waldTestGEE.R
+++ b/R/waldTestGEE.R
@@ -7,6 +7,7 @@
 #' @importFrom stats pchisq
 #' @param mod.1 The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL.
 #' @param mod.0 The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL.
+#' @param bias.correct Boolean specifying whether a small-sample bias correction should be applied to the estimated sandwich variance-covariance matrix. Useful when the number of subjects is small. Defaults to FALSE.
 #' @return A list containing the Wald test statistic, a \emph{p}-value, and the degrees of freedom used in the test.
 #' @details
 #' \itemize{
@@ -18,7 +19,10 @@
 #' @seealso \code{\link[geeM]{geem}}
 #' @seealso \code{\link{modelLRT}}
 
-waldTestGEE <- function(mod.1 = NULL, mod.0 = NULL) {
+waldTestGEE <- function(mod.1 = NULL,
+                        mod.0 = NULL, 
+                        bias.correct = FALSE, 
+                        correction.method = "df") {
   # check inputs
   if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) {
     res <- list(Wald_Stat = NA,
@@ -27,9 +31,10 @@ waldTestGEE <- function(mod.1 = NULL, mod.0 = NULL) {
                 Notes = "No test performed due to model failure.")
     return(res)
   }
-
+  correction.method <- tolower(correction.method)
+  if (!correction.method %in% c("df")) { stop("Unsupported bias correction method in waldTestGEE().") }
   mod.1 <- mod.1$final_mod
-  if (is.null(mod.1) || is.null(mod.0) || !(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to wald_test_gee().") }
+  if (is.null(mod.1) || is.null(mod.0) || !(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to waldTestGee().") }
   if (length(coef(mod.1)) <= length(coef(mod.0))) {
     # can't calculate Wald statistic if both models are intercept-only
     res <- list(Wald_Stat = 0,
@@ -47,6 +52,13 @@ waldTestGEE <- function(mod.1 = NULL, mod.0 = NULL) {
     }
     coef_vals <- as.matrix(coef(mod.1)[coef_idx])
     vcov_mat <- as.matrix(mod.1$var)[coef_idx, coef_idx]
+    if (bias.correct) {
+      if (correction.method == "df") {
+        n_s <- length(mod.1$clusz)
+        p <- ncol(mod.1$X) - 1
+        vcov_mat <- (n_s / (n_s - p)) * vcov_mat
+      }
+    }
     wald_test_stat <- try({
       as.numeric(crossprod(coef_vals, MASS::ginv(vcov_mat)) %*% coef_vals)
     }, silent = TRUE)
diff --git a/man/waldTestGEE.Rd b/man/waldTestGEE.Rd
index fdc8570..cf8e03b 100644
--- a/man/waldTestGEE.Rd
+++ b/man/waldTestGEE.Rd
@@ -4,12 +4,19 @@
 \alias{waldTestGEE}
 \title{Use a Wald test to compare nested GEE models.}
 \usage{
-waldTestGEE(mod.1 = NULL, mod.0 = NULL)
+waldTestGEE(
+  mod.1 = NULL,
+  mod.0 = NULL,
+  bias.correct = FALSE,
+  correction.method = "df"
+)
 }
 \arguments{
 \item{mod.1}{The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL.}
 
 \item{mod.0}{The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL.}
+
+\item{bias.correct}{Boolean specifying whether a small-sample bias correction should be applied to the estimated sandwich variance-covariance matrix. Useful when the number of subjects is small. Defaults to FALSE.}
 }
 \value{
 A list containing the Wald test statistic, a \emph{p}-value, and the degrees of freedom used in the test.