Merge pull request #51 from mayer79/readme_example

Readme example
ModelOriented · Dec 17, 2022 · 5722446 · 5722446
2 parents 2b794e0 + b30d90f
commit 5722446
Show file tree

Hide file tree

Showing 20 changed files with 4,439 additions and 3,400 deletions.
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
@@ -1,3 +1,3 @@
-Version: 0.3.1
-Date: 2022-11-18 13:04:50 UTC
-SHA: 69cecaff0e7edcdf1fc4a9516d6b1ae2464f62c2
+Version: 0.3.2
+Date: 2022-12-17 09:49:19 UTC
+SHA: 95de1c9c09c75c43ee0407dc62a63bdfa78e2430
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: kernelshap
 Title: Kernel SHAP
-Version: 0.3.1
+Version: 0.3.2
 Authors@R: c(
     person("Michael", "Mayer", , "[email protected]", role = c("aut", "cre")),
     person("David", "Watson", , "[email protected]", role = "ctb")

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,13 @@
+# kernelshap 0.3.2
+
+## Documentation
+
+- Rewritten README and examples to better show the role of the background data.
+
+## Bug fixes
+
+- When `bg_X` contained more columns than `X`, unflexible prediction functions could fail when being applied to `bg_X`. 
+
 # kernelshap 0.3.1
 
 ## Changes

diff --git a/R/kernelshap.R b/R/kernelshap.R
@@ -52,22 +52,20 @@
 #' 
 #' @param object Fitted model object.
 #' @param X A (n x p) matrix, data.frame, tibble or data.table of rows to be explained. 
-#' Important: The columns should only represent model features, not the response.
+#' The columns should only represent model features, not the response.
 #' @param bg_X Background data used to integrate out "switched off" features, 
 #' often a subset of the training data (typically 50 to 500 rows)
 #' It should contain the same columns as \code{X}.
 #' In cases with a natural "off" value (like MNIST digits), 
 #' this can also be a single row with all values set to the off value.
 #' @param pred_fun Prediction function of the form \code{function(object, X, ...)},
 #' providing K >= 1 numeric predictions per row. Its first argument represents the
-#' model \code{object}, its second argument a data structure like \code{X} and \code{bg_X}. 
-#' (The names of the first two arguments do not matter.) Additional (named)
-#' arguments are passed via \code{...}. The default, \code{stats::predict}, will
-#' work in most cases. Some exceptions (classes "ranger" and mlr3 "Learner")
-#' are handled separately. In other cases, the function must be specified manually.
+#' model \code{object}, its second argument a data structure like \code{X}. 
+#' Additional (named) arguments are passed via \code{...}. 
+#' The default, \code{stats::predict}, will work in most cases. 
 #' @param feature_names Optional vector of column names in \code{X} used to calculate 
-#' SHAP values. By default, this equals \code{colnames(X)}. Not supported for matrix
-#' \code{X}.
+#' SHAP values. By default, this equals \code{colnames(X)}. Not supported if \code{X}
+#' is a matrix.
 #' @param bg_w Optional vector of case weights for each row of \code{bg_X}.
 #' @param exact If \code{TRUE}, the algorithm will produce exact Kernel SHAP values
 #' with respect to the background data. In this case, the arguments \code{hybrid_degree}, 
@@ -135,47 +133,33 @@
 #'}
 #' @export
 #' @examples
-#' # Linear regression
+#' # MODEL ONE: Linear regression
 #' fit <- stats::lm(Sepal.Length ~ ., data = iris)
-#' s <- kernelshap(fit, iris[1:2, -1], bg_X = iris)
+#' 
+#' # Select rows to explain (only feature columns)
+#' X_explain <- iris[1:2, -1]
+#' 
+#' # Select small background dataset (could use all rows here because iris is small) 
+#' set.seed(1)
+#' bg_X <- iris[sample(nrow(iris), 100), ]
+#' 
+#' # Calculate SHAP values
+#' s <- kernelshap(fit, X_explain, bg_X = bg_X)
 #' s
 #' 
-#' # Multivariate model
+#' # MODEL TWO: Multi-response linear regression
 #' fit <- stats::lm(
 #'   as.matrix(iris[1:2]) ~ Petal.Length + Petal.Width + Species, data = iris
 #' )
-#' s <- kernelshap(fit, iris[1:4, 3:5], bg_X = iris)
+#' s <- kernelshap(fit, iris[1:4, 3:5], bg_X = bg_X)
 #' summary(s)
-#'
-#' # Matrix input works as well, and pred_fun can be overwritten
-#' fit <- stats::lm(Sepal.Length ~ ., data = iris[1:4])
-#' pred_fun <- function(fit, X) stats::predict(fit, as.data.frame(X))
-#' X <- data.matrix(iris[2:4])
-#' s <- kernelshap(fit, X[1:3, ], bg_X = X, pred_fun = pred_fun)
-#' s
-#'
-#' # Logistic regression
-#' fit <- stats::glm(
-#'   I(Species == "virginica") ~ Sepal.Length + Sepal.Width, 
-#'   data = iris, 
-#'   family = binomial
-#' )
-#' 
-#' # On scale of linear predictor
-#' s <- kernelshap(fit, iris[1:2], bg_X = iris)
-#' s
-#' 
-#' # On scale of response (probability)
-#' s <- kernelshap(fit, iris[1:2], bg_X = iris, type = "response")
-#' s
 #' 
 #' # Non-feature columns can be dropped via 'feature_names'
-#' fit <- stats::lm(Sepal.Length ~ . - Species, data = iris)
 #' s <- kernelshap(
 #'   fit, 
-#'   iris[1:2, ], 
-#'   bg_X = iris, 
-#'   feature_names = c("Sepal.Width", "Petal.Length", "Petal.Width")
+#'   iris[1:4, ],
+#'   bg_X = bg_X, 
+#'   feature_names = c("Petal.Length", "Petal.Width", "Species")
 #' )
 #' s
 kernelshap <- function(object, ...){
@@ -202,7 +186,8 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict,
     !is.null(colnames(bg_X)),
     (p <- length(feature_names)) >= 1L,
     all(feature_names %in% colnames(X)),
-    all(feature_names %in% colnames(bg_X)),
+    all(feature_names %in% colnames(bg_X)),  # not necessary, but clearer
+    all(colnames(X) %in% colnames(bg_X)),
     is.function(pred_fun),
     exact %in% c(TRUE, FALSE),
     p == 1L || exact || hybrid_degree %in% 0:(p / 2),
@@ -218,10 +203,12 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict,
     stop("If X is a matrix, feature_names must equal colnames(X)")  
   }
 
-  # Calculate v0 and v1
-  bg_preds <- check_pred(pred_fun(object, bg_X, ...), n = bg_n)
-  v0 <- weighted_colMeans(bg_preds, bg_w)            # Average pred of bg data: 1 x K
+  # Calculate v1 and v0
   v1 <- check_pred(pred_fun(object, X, ...), n = n)  # Predictions on X:        n x K
+  bg_preds <- check_pred(
+    pred_fun(object, bg_X[, colnames(X), drop = FALSE], ...), n = bg_n
+  )
+  v0 <- weighted_colMeans(bg_preds, bg_w)            # Average pred of bg data: 1 x K
 
   # For p = 1, exact Shapley values are returned
   if (p == 1L) {
@@ -231,6 +218,7 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict,
   }
 
   # Drop unnecessary columns in bg_X. If X is matrix, also column order is relevant
+  # In what follows, predictions will never be applied directly to bg_X anymore
   if (!identical(colnames(bg_X), feature_names)) {
     bg_X <- bg_X[, feature_names, drop = FALSE]
   }