Merge pull request #59 from mayer79/midpoints

Switch grid strategy "uniform" from pretty() to seq()
ModelOriented · Sep 16, 2023 · 4850fa3 · 4850fa3
2 parents 7c1fdcf + 959d837
commit 4850fa3
Show file tree

Hide file tree

Showing 20 changed files with 1,255 additions and 1,401 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: hstats
 Title: Interaction Statistics
-Version: 0.2.1
+Version: 0.3.0
 Authors@R: 
     person("Michael", "Mayer", , "[email protected]", role = c("aut", "cre"))
 Description: Fast, model-agnostic implementation of different H-statistics

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,8 @@
-# hstats 0.2.1
+# hstats 0.3.0
+
+## Major user visible changes
+
+- Grid calculation: So far, the default grid strategy "uniform" used `pretty()` to generate the evaluation points. To provide more predictable grid sizes, and to be more in line with other implementations of partial dependence and ICE, we now use `seq()` to create the uniform grid. This affects `ice()`, `partial_dep()` and the exported helper functions `univariate_grid()` and `multivariate_grid()`.
 
 ## Bug fixes
 

diff --git a/R/grid.R b/R/grid.R
@@ -6,22 +6,24 @@
 #' For discrete `z` (non-numeric, or numeric with at most `grid_size` unique values), 
 #' this is simply `sort(unique(z))`.
 #' 
-#' Otherwise, if `strategy = "uniform"` (default), the evaluation points are the 
-#' result of [pretty()] over the trimmed range of `z`.
-#' If `strategy = "quantile"`, the evaluation points are computed
-#' as quantiles over a regular grid of probabilities from `trim[1]` to `trim[2]`. 
-#' Set `trim = c(0, 1)` for no trimming.
+#' Otherwise, if `strategy = "uniform"` (default), the evaluation points form a regular
+#' grid over the trimmed range of `z`. By trimmed range we mean the
+#' range of `z` after removing values outside the `trim[1]` and `trim[2]` quantiles.
+#' Set `trim = 0:1` for no trimming.
 #' 
-#' Quantiles are calculated based on the inverse of the ECDF, i.e., with
+#' If `strategy = "quantile"`, the evaluation points are quantiles over a regular grid 
+#' of probabilities from `trim[1]` to `trim[2]`.
+#' 
+#' All quantiles are calculated via the inverse of the ECDF, i.e., via
 #' `stats::quantile(..., type = 1)`.
 #' 
-#' @param z A vector/factor.
+#' @param z A vector or factor.
 #' @param grid_size Approximate grid size.
-#' @param trim A non-discrete numeric variable is trimmed at these quantile 
-#'   probabilities before calculations. Set to `c(0, 1)` for no trimming.
+#' @param trim The default `c(0.01, 0.99)` means that values outside the 
+#'   1% and 99% quantiles of a non-discrete numeric `z` are removed before calculating 
+#'   the grid values. Set to `0:1` for no trimming.
 #' @param strategy How to find evaluation points of non-discrete numeric columns? 
-#'   Either "uniform" (via [pretty()]) or "quantile", see description of 
-#'   [univariate_grid()].
+#'   Either "uniform" or "quantile", see description of [univariate_grid()].
 #' @returns A vector/factor of evaluation points.
 #' @seealso [multivariate_grid()]
 #' @export
@@ -31,7 +33,7 @@
 #' 
 #' x <- iris$Sepal.Width
 #' univariate_grid(x, grid_size = 5)                        # Uniform (default strategy)
-#' univariate_grid(x, grid_size = 3, strategy = "uniform")  # Uniform pretty
+#' univariate_grid(x, grid_size = 5, strategy = "uniform")  # Uniform
 univariate_grid <- function(z, grid_size = 49L, trim = c(0.01, 0.99), 
                             strategy = c("uniform", "quantile")) {
   strategy <- match.arg(strategy)
@@ -47,9 +49,10 @@ univariate_grid <- function(z, grid_size = 49L, trim = c(0.01, 0.99),
     return(unique(g))
   }
 
-  # strategy = "uniform" (should use range() if trim = c(0, 1)?)
+  # strategy = "uniform" (could use range() if trim = 0:1)
   r <- stats::quantile(z, probs = trim, names = FALSE, type = 1L, na.rm = TRUE)
-  pretty(r, n = grid_size)
+  # pretty(r, n = grid_size)  # Until version 0.2.0
+  seq(r[1L], r[2L], length.out = grid_size)
 }
 
 #' Multivariate Grid

diff --git a/R/ice.R b/R/ice.R
@@ -45,11 +45,7 @@
 #' plot(ic, center = TRUE)
 #'
 #' # MODEL 3: Gamma GLM -> pass options to predict() via ...
-#' fit <- glm(
-#'   Sepal.Length ~ . + Petal.Width:Species,
-#'   data = iris,
-#'   family = Gamma(link = log)
-#' )
+#' fit <- glm(Sepal.Length ~ ., data = iris, family = Gamma(link = log))
 #' plot(ice(fit, v = "Petal.Length", X = iris, BY = "Species"))
 #' plot(ice(fit, v = "Petal.Length", X = iris, type = "response", BY = "Species"))
 ice <- function(object, ...) {

diff --git a/R/partial_dep.R b/R/partial_dep.R
@@ -82,11 +82,7 @@
 #' pd
 #' 
 #' # MODEL 3: Gamma GLM -> pass options to predict() via ...
-#' fit <- glm(
-#'   Sepal.Length ~ . + Petal.Width:Species, 
-#'   data = iris, 
-#'   family = Gamma(link = log)
-#' )
+#' fit <- glm(Sepal.Length ~ ., data = iris, family = Gamma(link = log))
 #' plot(partial_dep(fit, v = "Petal.Length", X = iris))
 #' plot(partial_dep(fit, v = "Petal.Length", X = iris, type = "response"))
 partial_dep <- function(object, ...) {

diff --git a/README.md b/README.md
@@ -158,7 +158,7 @@ Let's study different plots to understand *how* the strong interaction between d
 They all reveal a substantial interaction between the two variables in the sense that the age effect gets weaker the closer to the ocean. Note that numeric `BY` features are automatically binned into quartile groups.
 
 ```r
-plot(partial_dep(fit, v = "age", X = X_train, BY = "log_ocean"))
+plot(partial_dep(fit, v = "age", X = X_train, BY = "log_ocean"), show_points = FALSE)
 ```
 
 ![](man/figures/pdp_ocean_age.svg)