Merge pull request #101 from mayer79/fdummy

Support factor predictions
ModelOriented · Nov 7, 2023 · 2d36df7 · 2d36df7
2 parents a4bc1fb + e03a082
commit 2d36df7
Show file tree

Hide file tree

Showing 27 changed files with 816 additions and 294 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: hstats
 Title: Interaction Statistics
-Version: 1.0.1
+Version: 1.1.0
 Authors@R: 
     person("Michael", "Mayer", , "[email protected]", role = c("aut", "cre"))
 Description: Fast, model-agnostic implementation of different H-statistics

diff --git a/NEWS.md b/NEWS.md
@@ -1,17 +1,21 @@
-# hstats 1.0.1
+# hstats 1.1.0
 
 ## Enhancements
 
-- The plot method of a two-dimensional PDP has recieved the option `d2_geom = "line"`. Instead of a heatmap of the two features, one of the features is moved to color grouping. This might give a better impression where the interaction happens. Combined with `swap_dim = TRUE`, you can swap the role of the two `v` variables without recalculating anything. The idea was proposed by [Roel Verbelen](https://github.com/RoelVerbelen) in [issue #91](https://github.com/mayer79/hstats/issues/91), see also [issue #94](https://github.com/mayer79/hstats/issues/94).
+- {hstats} now also work for factor predictions. The levels are represented by one-hot-encoded columns.
+- The plot method of a two-dimensional PDP has recieved the option `d2_geom = "line"`. Instead of a heatmap of the two features, one of the features is moved to color grouping. Combined with `swap_dim = TRUE`, you can swap the role of the two `v` variables without recalculating anything. The idea was proposed by [Roel Verbelen](https://github.com/RoelVerbelen) in [issue #91](https://github.com/mayer79/hstats/issues/91), see also [issue #94](https://github.com/mayer79/hstats/issues/94).
 
 ## Bug fixes
 
 - Using `BY` and `w` via column names would fail for tibbles. This problem was described in [#92](https://github.com/mayer79/hstats/issues/92) by [Roel Verbelen](https://github.com/RoelVerbelen). Thx!
 
 ## Other changes
 
+- Much faster one-hot-encoding, thanks to Mathias Ambühl.
+- Most functions are slightly faster.
 - Add unit tests to compare against {iml}.
 - Made all examples "tibble" and "data.table" friendly.
+- Revised input checks in loss functions (relevant for `perm_importance()` and `average_loss()`).
 
 # hstats 1.0.0
 

diff --git a/R/average_loss.R b/R/average_loss.R
@@ -20,7 +20,7 @@
 #'   a (n x K) matrix of probabilities (with row-sums 1).
 #'   The observed values `y` are either passed as (n x K) dummy matrix, 
 #'   or as discrete vector with corresponding levels. 
-#'   The latter case is turned into a dummy matrix via 
+#'   The latter case is turned into a dummy matrix by a fast version of
 #'   `model.matrix(~ as.factor(y) + 0)`.
 #' - "classification_error": Misclassification error. Both the 
 #'   observed values `y` and the predictions can be character/factor. This
@@ -37,8 +37,10 @@
 #'   can be provided that turns observed and predicted values into a numeric vector or 
 #'   matrix of unit losses of the same length as `X`.
 #'   For "mlogloss", the response `y` can either be a dummy matrix or a discrete vector. 
-#'   The latter case is handled via `model.matrix(~ as.factor(y) + 0)`.
+#'   The latter case is handled via a fast version of `model.matrix(~ as.factor(y) + 0)`.
 #'   For "classification_error", both predictions and responses can be non-numeric.
+#'   For "squared_error", both predictions and responses can be factors with identical
+#'   levels. In this case, squared error is evaulated for each one-hot-encoded column.
 #' @param agg_cols Should multivariate losses be summed up? Default is `FALSE`.
 #'   In combination with the squared error loss, `agg_cols = TRUE` gives
 #'   the Brier score for (probabilistic) classification.
@@ -92,8 +94,8 @@ average_loss.default <- function(object, X, y,
   }
 
   # Real work
-  L <- as.matrix(loss(y, pred_fun(object, X, ...)))
-  M <- gwColMeans(L, g = BY, w = w)[["M"]]
+  pred <- prepare_pred(pred_fun(object, X, ...))
+  M <- gwColMeans(loss(y, pred), g = BY, w = w)[["M"]]
 
   if (agg_cols && ncol(M) > 1L) {
     M <- cbind(rowSums(M))

diff --git a/R/hstats.R b/R/hstats.R
@@ -188,7 +188,7 @@ hstats.default <- function(object, X, v = NULL,
   }
 
   # Predictions ("F" in Friedman and Popescu) always calculated (cheap)
-  f <- wcenter(align_pred(pred_fun(object, X, ...)), w = w)
+  f <- wcenter(prepare_pred(pred_fun(object, X, ...), ohe = TRUE), w = w)
   mean_f2 <- wcolMeans(f^2, w = w)  # A vector
 
   # Initialize first progress bar

diff --git a/R/ice.R b/R/ice.R
@@ -109,9 +109,19 @@ ice.default <- function(object, v, X, pred_fun = stats::predict,
   }
 
   ice_out <- ice_raw(
-    object, v = v, X = X, grid = grid, pred_fun = pred_fun, pred_only = FALSE, ...
+    object, 
+    v = v, 
+    X = X, 
+    grid = grid, 
+    pred_fun = pred_fun, 
+    pred_only = FALSE,
+    ohe = TRUE,
+    ...
   )
   pred <- ice_out[["pred"]]
+  if (!is.matrix(pred)) {
+    pred <- as.matrix(pred)
+  }
   grid_pred <- ice_out[["grid_pred"]]
   K <- ncol(pred)
   if (is.null(colnames(pred))) {

diff --git a/R/losses.R b/R/losses.R
@@ -5,11 +5,13 @@
 #' @noRd
 #' @keywords internal
 #'
-#' @param actual A numeric vector/matrix.
-#' @param predicted A numeric vector/matrix.
+#' @param actual A numeric vector/matrix, or factor.
+#' @param predicted A numeric vector/matrix, or factor.
 #' @returns Vector or matrix of numeric losses.
 loss_squared_error <- function(actual, predicted) {
-  actual <- expand_actual(actual = actual, predicted = predicted)
+  actual <- drop(prepare_pred(actual, ohe = TRUE))
+  predicted <- prepare_pred(predicted, ohe = TRUE)
+
   (actual - predicted)^2
 }
 
@@ -24,7 +26,12 @@ loss_squared_error <- function(actual, predicted) {
 #' @param predicted A numeric vector/matrix.
 #' @returns Vector or matrix of numeric losses.
 loss_absolute_error <- function(actual, predicted) {
-  actual <- expand_actual(actual = actual, predicted = predicted)
+  actual <- drop(prepare_pred(actual))
+  predicted <- prepare_pred(predicted)
+  if (is.factor(actual) || is.factor(predicted)) {
+    stop("Absolute loss does not make sense for factors.")
+  }
+
   abs(actual - predicted)
 }
 
@@ -39,11 +46,16 @@ loss_absolute_error <- function(actual, predicted) {
 #' @param predicted A numeric vector/matrix with non-negative values.
 #' @returns Vector or matrix of numeric losses.
 loss_poisson <- function(actual, predicted) {
-  actual <- expand_actual(actual = actual, predicted = predicted)
+  actual <- drop(prepare_pred(actual))
+  predicted <- prepare_pred(predicted)
+  if (is.factor(actual) || is.factor(predicted)) {
+    stop("Poisson loss does not make sense for factors.")
+  }
   stopifnot(
     all(predicted >= 0),
     all(actual >= 0)
   )
+
   out <- predicted
   p <- actual > 0
   out[p] <- (actual * log(actual / predicted) - (actual  - predicted))[p]
@@ -61,11 +73,16 @@ loss_poisson <- function(actual, predicted) {
 #' @param predicted A numeric vector/matrix with positive values.
 #' @returns Vector or matrix of numeric losses.
 loss_gamma <- function(actual, predicted) {
-  actual <- expand_actual(actual = actual, predicted = predicted)
+  actual <- drop(prepare_pred(actual))
+  predicted <- prepare_pred(predicted)
+  if (is.factor(actual) || is.factor(predicted)) {
+    stop("Gamma loss does not make sense for factors.")
+  }
   stopifnot(
     all(predicted > 0), 
     all(actual > 0)
   )
+
   -2 * (log(actual / predicted) - (actual - predicted) / predicted)
 }
 
@@ -80,7 +97,9 @@ loss_gamma <- function(actual, predicted) {
 #' @param predicted A vector/factor/matrix with discrete values.
 #' @returns Vector or matrix of numeric losses.
 loss_classification_error <- function(actual, predicted) {
-  actual <- expand_actual(actual = actual, predicted = predicted)
+  actual <- drop(prepare_pred(actual))
+  predicted <- prepare_pred(predicted)
+
   (actual != predicted) * 1.0
 }
 
@@ -96,13 +115,18 @@ loss_classification_error <- function(actual, predicted) {
 #' @param predicted A numeric vector/matrix with values between 0 and 1.
 #' @returns Vector or matrix of numeric losses.
 loss_logloss <- function(actual, predicted) {
-  actual <- expand_actual(actual = actual, predicted = predicted)
+  actual <- drop(prepare_pred(actual))
+  predicted <- prepare_pred(predicted)
+  if (is.factor(actual) || is.factor(predicted)) {
+    stop("Log loss does not make sense for factors.")
+  }
   stopifnot(
     all(predicted >= 0), 
     all(predicted <= 1),
     all(actual >= 0),
     all(actual <= 1)
   )
+
   -xlogy(actual, predicted) - xlogy(1 - actual, 1 - predicted)
 }
 
@@ -115,61 +139,30 @@ loss_logloss <- function(actual, predicted) {
 #' @keywords internal
 #'
 #' @param actual A numeric matrix with values between 0 and 1, or a 
-#'   discrete vector that will be one-hot-encoded via 
+#'   discrete vector that will be one-hot-encoded by a fast version of
 #'   `model.matrix(~ as.factor(actual) + 0)`.
 #'   The column order of `predicted` must be in line with this!
 #' @param predicted A numeric matrix with values between 0 and 1.
 #' @returns `TRUE` (or an error message).
 loss_mlogloss <- function(actual, predicted) {
-  if (NCOL(actual) == 1L && NCOL(predicted) > 1L) {
-    actual <- stats::model.matrix(~ as.factor(actual) + 0)
+  actual <- prepare_pred(actual)
+  predicted <- prepare_pred(predicted)
+  if (NCOL(actual) == 1L) {  # not only for factors
+    actual <- fdummy(actual)
   }
   stopifnot(
-    NCOL(actual) == NCOL(predicted),
+    is.matrix(predicted),
+    ncol(predicted) >= 2L,
+    ncol(actual) == ncol(predicted),
     all(predicted >= 0), 
     all(predicted <= 1),
     all(actual >= 0),
     all(actual <= 1)
   )
+
   unname(-rowSums(xlogy(actual, predicted)))
 }
 
-#' Expands 1D actual to NCOL(predicted)
-#' 
-#' Internal function. Checks consistency of column counts. If `NCOL(actual) == 1`
-#' and `NCOL(predicted) > 1`, replicates `actual` into a matrix.
-#' 
-#' @noRd
-#' @keywords internal
-#'
-#' @param actual A vector/matrix/data.frame.
-#' @param predicted A vector/matrix/data.frame.
-#' @returns A matrix.
-expand_actual <- function(actual, predicted) {
-  pp <- NCOL(predicted)
-  pa <- NCOL(actual)
-  if (pa == pp) {
-    if (pa > 1L) {
-      nmp <- colnames(predicted)
-      nma <- colnames(actual)
-      if (!is.null(nmp) && !is.null(nma) && !identical(nmp, nma)) {
-        stop("Column names of multi-output response must correspond to predictions.")
-      }
-    }
-    return(actual)
-  }
-  if (pp > 1L && pa == 1L) {
-    if (is.data.frame(actual)) {
-      actual <- as.matrix(actual)
-    }
-    actual <- matrix(actual, nrow = NROW(actual), ncol = pp, byrow = FALSE)
-    colnames(actual) <- colnames(predicted)
-  } else {
-    stop("NCOL(actual) should be identical to NCOL(predicted), or equal to 1")
-  }
-  return(actual)
-}
-
 #' Calculates x*log(y)
 #' 
 #' Internal function. Returns 0 whenever x = 0 and y >= 0.

diff --git a/R/pd_raw.R b/R/pd_raw.R
@@ -50,14 +50,15 @@ pd_raw <- function(object, v, X, grid, pred_fun = stats::predict,
 #' @keywords internal
 #' 
 #' @inheritParams pd_raw
-#' @param pred_only Logical flag determining the output mode. If `TRUE`, then a matrix
-#'   of predictions. Otherwise, a list with two elements: `pred` (prediction matrix)
+#' @param pred_only Logical flag determining the output mode. If `TRUE`, just
+#'   predictions. Otherwise, a list with two elements: `pred` (predictions)
 #'   and `grid_pred` (the corresponding grid values in the same mode as the input, 
 #'   but replicated over `X`).
+#' @param ohe Should factor output be one-hot encoded? Default is `FALSE`.
 #' @returns 
-#'   Either a matrix of predictions or a list with predictions and grid.
+#'   Either a vector/matrix of predictions or a list with predictions and grid.
 ice_raw <- function(object, v, X, grid, pred_fun = stats::predict, 
-                    pred_only = TRUE, ...) {
+                    pred_only = TRUE, ohe = FALSE, ...) {
   D1 <- length(v) == 1L
   n <- nrow(X)
   n_grid <- NROW(grid)
@@ -77,8 +78,8 @@ ice_raw <- function(object, v, X, grid, pred_fun = stats::predict,
     X_pred[, v] <- grid_pred
   }
 
-  # Calculate matrix of predictions
-  pred <- align_pred(pred_fun(object, X_pred, ...))
+  # Calculate matrix/vector of predictions
+  pred <- prepare_pred(pred_fun(object, X_pred, ...), ohe = ohe)
 
   if (pred_only) {
     return(pred)

diff --git a/R/perm_importance.R b/R/perm_importance.R
@@ -31,6 +31,7 @@
 #' # MODEL 1: Linear regression
 #' fit <- lm(Sepal.Length ~ ., data = iris)
 #' s <- perm_importance(fit, X = iris, y = "Sepal.Length")
+
 #' s
 #' s$M
 #' s$SE  # Standard errors are available thanks to repeated shuffling
@@ -118,8 +119,8 @@ perm_importance.default <- function(object, X, y, v = NULL,
   }
 
   # Pre-shuffle performance
-  L <- as.matrix(loss(y, pred_fun(object, X, ...)))
-  perf <- wcolMeans(L, w = w)
+  pred <- prepare_pred(pred_fun(object, X, ...))
+  perf <- wcolMeans(loss(y, pred), w = w)
 
   # Stack y and X m times
   if (m_rep > 1L) {
@@ -133,10 +134,10 @@ perm_importance.default <- function(object, X, y, v = NULL,
   }
 
   shuffle_perf <- function(z, XX) {
-    ind <- c(replicate(m_rep, sample(seq_len(n))))
+    ind <- c(replicate(m_rep, sample(seq_len(n))))  # shuffle within n rows
     XX[, z] <- XX[ind, z]
-    L <- as.matrix(loss(y, pred_fun(object, XX, ...)))
-    t(wrowmean(L, ngroups = m_rep, w = w))
+    pred <- prepare_pred(pred_fun(object, XX, ...))
+    t(wrowmean(loss(y, pred), ngroups = m_rep, w = w))
   }
 
   # Step 0: Performance after shuffling (expensive)