Merge pull request #97 from jr-leary7/dev

Dev
jr-leary7 · Jun 19, 2023 · c73f4f6 · c73f4f6
2 parents c611b46 + f8e0893
commit c73f4f6
Show file tree

Hide file tree

Showing 40 changed files with 1,301 additions and 829 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,5 +2,6 @@
 .Rhistory
 .RData
 .Ruserdata
+.Rbuildignore
 inst/doc
 codecov.yml
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,62 +1,77 @@
 Package: scLANE
 Type: Package
-Title: Model gene expression over pseudotime with spline-based NB GLMs, GEEs, & GLMMs
-Version: 0.6.2
+Title: Model gene expression dynamics with spline-based NB GLMs, GEEs, & GLMMs
+Version: 0.7.0
 Authors@R: c(person(given = "Jack", family = "Leary", email = "[email protected]", role = c("aut", "cre")), 
              person(given = "Rhonda", family = "Bacher", email = "[email protected]", role = c("ctb", "fnd")))
-Description: This package uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime / latent time. Currently supports negative binomial GLMs, GEEs, & GLMMs. 
+Description: This package uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. 
+             The modeling architectures currently supported are negative binomial GLMs, GEEs, & GLMMs. 
+             Downstream analysis functionalities include model comparison, dynamic gene clustering, smoothed counts generation, gene set enrichment testing, & visualization. 
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.2.1
 Depends: 
-  magrittr, 
-  glm2
+  glm2, 
+  magrittr
 Imports: 
-    dplyr,
-    foreach,
-    doParallel,
-    parallel,
+    geeM,
     MASS,
-    ggplot2,
+    dplyr,
     stats,
-    gamlss,
-    scales,
-    splines, 
-    bigstatsr,
-    broom,
+    utils, 
+    withr, 
     purrr,
     tidyr,
-    geeM,
-    glmmTMB, 
-    broom.mixed, 
-    tidyselect,
     furrr, 
+    gamlss,
+    scales,
     future, 
+    ggplot2,
+    splines, 
+    foreach,
+    glmmTMB, 
+    parallel,
+    bigstatsr,
     RcppEigen, 
+    doParallel,
+    tidyselect,
+    broom.mixed, 
     Rcpp (>= 1.0.7)
 URL: https://github.com/jr-leary7/scLANE
 BugReports: https://github.com/jr-leary7/scLANE/issues
 Suggests: 
-    rmarkdown,
-    knitr,
-    testthat (>= 3.0.0),
     covr, 
+    ggh4x, 
+    knitr,
+    irlba, 
+    rlang, 
+    igraph, 
+    Seurat, 
     bluster, 
     cluster, 
-    clusterProfiler, 
-    igraph, 
-    irlba, 
-    slingshot, 
     msigdbr, 
+    rmarkdown,
+    slingshot, 
+    BiocGenerics, 
     BiocNeighbors, 
-    rlang, 
-    Seurat, 
+    clusterProfiler, 
+    testthat (>= 3.0.0),
     SingleCellExperiment, 
-    SummarizedExperiment, 
-    BiocGenerics
+    SummarizedExperiment
 VignetteBuilder: knitr
 Config/testthat/edition: 3
 LinkingTo: 
-    RcppEigen,
-    Rcpp
+    Rcpp, 
+    RcppEigen
+biocViews:
+    RNASeq,
+    Software,
+    TimeCourse,
+    Sequencing,
+    Regression,
+    SingleCell, 
+    Visualization, 
+    GeneExpression,
+    Transcriptomics,
+    DifferentialExpression
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(clusterGenes)
+export(createCellOffset)
 export(enrichDynamicGenes)
 export(extractBreakpoints)
 export(fitGLMM)
@@ -25,7 +26,6 @@ importFrom(MASS,negative.binomial)
 importFrom(MASS,theta.mm)
 importFrom(Rcpp,sourceCpp)
 importFrom(bigstatsr,as_FBM)
-importFrom(broom,tidy)
 importFrom(broom.mixed,tidy)
 importFrom(doParallel,registerDoParallel)
 importFrom(dplyr,across)
@@ -87,6 +87,8 @@ importFrom(parallel,stopCluster)
 importFrom(purrr,discard)
 importFrom(purrr,map)
 importFrom(purrr,map2)
+importFrom(purrr,map_chr)
+importFrom(purrr,map_dbl)
 importFrom(purrr,map_dfr)
 importFrom(purrr,pmap_dfc)
 importFrom(purrr,reduce)
@@ -104,6 +106,7 @@ importFrom(stats,hclust)
 importFrom(stats,kmeans)
 importFrom(stats,lm.fit)
 importFrom(stats,logLik)
+importFrom(stats,offset)
 importFrom(stats,p.adjust)
 importFrom(stats,p.adjust.methods)
 importFrom(stats,pchisq)
@@ -113,4 +116,6 @@ importFrom(stats,quantile)
 importFrom(stats,setNames)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyselect,everything)
+importFrom(utils,tail)
+importFrom(withr,with_output_sink)
 useDynLib(scLANE, .registration = TRUE)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# scLANE 0.6.3
+
+* Added a `NEWS.md` file to track changes to the package.
diff --git a/R/clusterGenes.R b/R/clusterGenes.R
@@ -7,32 +7,44 @@
 #' @importFrom purrr map discard map2 reduce
 #' @importFrom stats setNames hclust cutree kmeans dist
 #' @param test.dyn.res The list returned by \code{\link{testDynamic}} - no extra processing required. Defaults to NULL.
+#' @param pt A data.frame containing the pseudotime or latent time estimates for each cell. Defaults to NULL.
+#' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL.
 #' @param clust.algo The clustering method to use. Can be one of "hclust", "kmeans", "leiden". Defaults to "leiden".
 #' @param use.pca Should PCA be performed prior to clustering? Defaults to FALSE.
 #' @param n.PC The number of principal components to use when performing dimension reduction prior to clustering. Defaults to 15.
 #' @param lineages Should one or more lineages be isolated? If so, specify which one(s). Otherwise, all lineages will be clustered independently. Defaults to NULL.
+#' @details
+#' \itemize{
+#' \item Due to some peculiarities of how the fitted values (on the link scale) are generated for \code{geeM} models, it's not necessary to multiply them by the offset as this is done internally. For GLM & GEE models, the opposite is true, and \code{size.factor.offset} must be provided in order to rescale the fitted values correctly.
+#' }
 #' @return A data.frame of with three columns: \code{Gene}, \code{Lineage}, and \code{Cluster}.
 #' @seealso \code{\link{testDynamic}}
 #' @seealso \code{\link{plotClusteredGenes}}
 #' @export
 #' @examples
 #' \dontrun{
-#' clusterGenes(gene_stats, clust.algo = "leiden")
+#' clusterGenes(gene_stats, pt = pt_df)
 #' clusterGenes(gene_stats,
+#'              pt = pt_df,
+#'              size.factor.offset = createCellOffset(sce_obj),
 #'              clust.algo = "kmeans",
 #'              use.pca = TRUE,
 #'              n.PC = 10,
 #'              lineages = "B")
-#' clusterGenes(gene_stats, lineages = c("A", "C"))
+#' clusterGenes(gene_stats,
+#'              pt = pt_df,
+#'              lineages = c("A", "C"))
 #' }
 
 clusterGenes <- function(test.dyn.res = NULL,
+                         pt = NULL,
+                         size.factor.offset = NULL,
                          clust.algo = "leiden",
                          use.pca = FALSE,
                          n.PC = 15,
                          lineages = NULL) {
   # check inputs
-  if (is.null(test.dyn.res)) { stop("test.dyn.res must be supplied to clusterGenes().") }
+  if (is.null(test.dyn.res) || is.null(pt)) { stop("test.dyn.res & pt must be supplied to clusterGenes().") }
   clust.algo <- tolower(clust.algo)
   if (!clust.algo %in% c("hclust", "kmeans", "leiden")) { stop("clust.algo must be one of 'hclust', 'kmeans', or 'leiden'.") }
   if ((use.pca & is.null(n.PC)) || (use.pca & n.PC <= 0)) { stop("n.PC must be a non-zero integer when clustering on principal components.") }
@@ -48,8 +60,13 @@ clusterGenes <- function(test.dyn.res = NULL,
                        purrr::discard(rlang::is_na) %>%
                        purrr::discard(\(p) rlang::inherits_only(p, "try-error")) %>%
                        purrr::map2(.y = names(.), function(x, y) {
-                         t(as.data.frame(exp(x$marge_link_fit))) %>%
-                           magrittr::set_rownames(y)
+                         if (is.null(size.factor.offset)) {
+                           t(as.data.frame(exp(x$marge_link_fit))) %>%
+                             magrittr::set_rownames(y)
+                         } else {
+                           t(as.data.frame(exp(x$marge_link_fit)) * unname(size.factor.offset)[!is.na(pt[, l])]) %>%
+                             magrittr::set_rownames(y)
+                         }
                        }) %>%
                        purrr::reduce(rbind)
     if (use.pca) {
@@ -105,12 +122,12 @@ clusterGenes <- function(test.dyn.res = NULL,
       if (use.pca) {
         clust_res <- stats::kmeans(fitted_vals_pca$x,
                                    centers = k_to_use,
-                                   nstart = 5,
+                                   nstart = 10,
                                    algorithm = "Hartigan-Wong")
       } else {
         clust_res <- stats::kmeans(fitted_vals_mat,
                                    centers = k_to_use,
-                                   nstart = 5,
+                                   nstart = 10,
                                    algorithm = "Hartigan-Wong")
       }
       gene_clusters <- data.frame(Gene = rownames(fitted_vals_mat),

diff --git a/R/createCellOffset.R b/R/createCellOffset.R
@@ -0,0 +1,36 @@
+#' Create an offset vector before modeling
+#'
+#' @name createCellOffset
+#' @author Jack Leary
+#' @description Creates a vector of per-cell size factors to be used as input to \code{\link{testDynamic}} as a model offset given a variety of inputs.
+#' @param expr.mat Either a gene-by-cell (cells as columns) matrix of raw integer counts prior to any cell filtering, a \code{Seurat} object, or a \code{SingleCellExperiment} object. Defaults to NULL.
+#' @param scale.factor The scaling factor use to multiply the sequencing depth factor for each cell. The default value is 1e4, which returns counts-per-10k.
+#' @return A named numeric vector containing the computed size factor for each cell.
+#' @seealso \code{\link{testDynamic}}
+#' @seealso \code{\link{marge2}}
+#' @seealso \code{\link[Seurat]{LogNormalize}}
+#' @seealso \code{\link[scuttle]{computeLibraryFactors}}
+#' @export
+#' @examples
+#' \dontrun{
+#' createCellOffset(expr.mat = raw_counts)
+#' createCellOffset(expr.mat = raw_counts, scale.factor = 1e5)
+#' }
+
+createCellOffset <- function(expr.mat = NULL, scale.factor = 1e4) {
+  # check inputs
+  if (is.null(expr.mat)) { stop("Please provide expr.mat to createCellOffset().") }
+  if (inherits(expr.mat, "SingleCellExperiment")) {
+    expr.mat <- as.matrix(BiocGenerics::counts(expr.mat))
+  } else if (inherits(expr.mat, "Seurat")) {
+    expr.mat <- as.matrix(Seurat::GetAssayData(expr.mat,
+                                               slot = "counts",
+                                               assay = Seurat::DefaultAssay(expr.mat)))
+  }
+  # compute per-cell size factors
+  cell_names <- colnames(expr.mat)
+  seq_depths <- colSums(expr.mat)
+  lib_size_factors <- scale.factor / seq_depths
+  names(lib_size_factors) <- cell_names
+  return(lib_size_factors)
+}
diff --git a/R/createSlopeTestData.R b/R/createSlopeTestData.R
@@ -3,6 +3,7 @@
 #' @name createSlopeTestData
 #' @author Jack Leary
 #' @description Creates a data.frame of \code{marge} model breakpoints, \emph{p}-values, and other info.
+#' @importFrom purrr map_dbl
 #' @param marge.model A \code{marge} model object, like those returned from \code{\link{marge2}}. Defaults to NULL.
 #' @param pt A data.frame containing pseudotime or latent time values. Defaults to NULL.
 #' @param is.gee Was the GEE framework used? Defaults to FALSE.
@@ -33,7 +34,7 @@ createSlopeTestData <- function(marge.model = NULL,
     p_vals <- NA_real_
     mod_notes <- "MARGE model error"
   } else {
-    if (!is.glmm && length(coef(marge.model$final_mod)) == 1) {
+    if (!is.glmm && length(marge.model$marge_coef_names) == 1) {
       rounded_brkpts <- NA_real_
       brkpts <- NA_real_
       brkpt_dirs <- NA_character_
@@ -44,7 +45,7 @@ createSlopeTestData <- function(marge.model = NULL,
       model_breakpoints_rounded <- extractBreakpoints(marge.model, directions = TRUE)
       rounded_brkpts <- model_breakpoints_rounded$Breakpoint
       brkpt_dirs <- model_breakpoints_rounded$Direction
-      brkpts <- sapply(rounded_brkpts, \(x) pt[, 1][which.min(abs(pt[, 1] - x))])
+      brkpts <- purrr::map_dbl(rounded_brkpts, \(x) pt[, 1][which.min(abs(pt[, 1] - x))])
       # extract p-values for coefficients other than intercept
       if (is.gee) {
         p_vals <- summary(marge.model$final_mod)$p[-1]

diff --git a/R/extractBreakpoints.R b/R/extractBreakpoints.R
@@ -17,15 +17,14 @@ extractBreakpoints <- function(model = NULL, directions = TRUE) {
   if (is.null(model)) { stop("Model input is missing from extractBreakpoints().") }
   if (!inherits(model, "marge")) { stop("Model must be of class marge.") }
   # find changepoints -> need to parse them out of coefficient names generated by marge2()
-  coef_names <- model$marge_coef_names
-  coef_names <- gsub("B_final", "", coef_names)
+  coef_names <- model$coef_names
   coef_names <- coef_names[-which(coef_names == "Intercept")]
-  coef_names <- gsub("\\)", "", gsub("\\(", "", coef_names))
   coef_nums <- gsub("[A-z]|[a-z]", "", coef_names)
-  brkpts_char <- gsub("-", "", gsub("_", "", coef_nums))
-  brkpt_df <- data.frame(Breakpoint = as.numeric(brkpts_char))
+  brkpt_df <- data.frame(Breakpoint = as.numeric(coef_nums))
   if (directions) {
-    brkpt_dirs <- ifelse(grepl("_[A-Z]-.*", coef_names), "Right", "Left")
+    brkpt_dirs <- ifelse(grepl("h_[a-z].*_[0-9]", coef_names, ignore.case = TRUE),
+                         "Right",
+                         "Left")
     brkpt_df$Direction <- brkpt_dirs
   }
   return(brkpt_df)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# scLANE 0.6.3

		* Added a `NEWS.md` file to track changes to the package.