diff --git a/DESCRIPTION b/DESCRIPTION
index 69b1564..35860a9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -30,6 +30,7 @@ Suggests:
     gbm,
     jsonlite,
     lightgbm,
+    gpboost,
    randomForest,
    ranger,
    scales,
diff --git a/NAMESPACE b/NAMESPACE
index 9c3dc20..a65fee7 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -10,6 +10,7 @@ S3method(treeshap,model_unified_multioutput)
 S3method(unify,default)
 S3method(unify,gbm)
 S3method(unify,lgb.Booster)
+S3method(unify,gpb.Booster)
 S3method(unify,randomForest)
 S3method(unify,ranger)
 S3method(unify,xgb.Booster)
@@ -19,6 +20,7 @@ export(gbm.unify)
 export(is.model_unified)
 export(is.treeshap)
 export(lightgbm.unify)
+export(gpboost.unify)
 export(plot_contribution)
 export(plot_feature_dependence)
 export(plot_feature_importance)
diff --git a/R/treeshap.R b/R/treeshap.R
index ddc8117..04754a9 100644
--- a/R/treeshap.R
+++ b/R/treeshap.R
@@ -20,6 +20,7 @@
 #' @seealso
 #' \code{\link{xgboost.unify}} for \code{XGBoost models}
 #' \code{\link{lightgbm.unify}} for \code{LightGBM models}
+#' \code{\link{gpboost.unify}} for \code{GPBoost models}
 #' \code{\link{gbm.unify}} for \code{GBM models}
 #' \code{\link{randomForest.unify}} for \code{randomForest models}
 #' \code{\link{ranger.unify}} for \code{ranger models}
diff --git a/R/unify.R b/R/unify.R
index 10c84c2..f3035df 100644
--- a/R/unify.R
+++ b/R/unify.R
@@ -13,6 +13,8 @@
 #' @seealso
 #' \code{\link{lightgbm.unify}} for \code{\link[lightgbm:lightgbm]{LightGBM models}}
 #'
+#' \code{\link{gpboost.unify}} for \code{\link[gpboost:gpboost]{GPBoost models}}
+#'
 #' \code{\link{gbm.unify}} for \code{\link[gbm:gbm]{GBM models}}
 #'
 #' \code{\link{xgboost.unify}} for \code{\link[xgboost:xgboost]{XGBoost models}}
@@ -54,6 +56,11 @@ unify.lgb.Booster <- function(model, data, recalculate = FALSE, ...){
   lightgbm.unify(model, data, recalculate)
 }
 
+#' @export
+unify.gpb.Booster <- function(model, data, recalculate = FALSE, ...){
+  gpboost.unify(model, data, recalculate)
+}
+
 #' @export
 unify.randomForest <- function(model, data, ...){
   randomForest.unify(model, data)
diff --git a/R/unify_gpboost.R b/R/unify_gpboost.R
new file mode 100644
index 0000000..6f87a5c
--- /dev/null
+++ b/R/unify_gpboost.R
@@ -0,0 +1,114 @@
+# should be preceded with gpb.model.dt.tree
+#' Unify GPBoost model
+#'
+#' Convert your GPBoost model into a standardized representation.
+#' The returned representation is easy to be interpreted by the user and ready to be used as an argument in \code{treeshap()} function.
+#'
+#' @param gpb_model A gpboost model - object of class \code{gpb.Booster}
+#' @param data Reference dataset. A \code{data.frame} or \code{matrix} with the same columns as in the training set of the model. Usually dataset used to train model.
+#' @param recalculate logical indicating if covers should be recalculated according to the dataset given in data. Keep it \code{FALSE} if training data are used.
+#'
+#' @return a unified model representation - a \code{\link{model_unified.object}} object
+#'
+#' @export
+#'
+#' @import data.table
+#'
+#' @seealso
+#'
+#' \code{\link{gbm.unify}} for \code{\link[gbm:gbm]{GBM models}}
+#'
+#' \code{\link{xgboost.unify}} for \code{\link[xgboost:xgboost]{XGBoost models}}
+#'
+#' \code{\link{lightgbm.unify}} for \code{\link[lightgbm:lightgbm]{LightGBM models}}
+#'
+#' \code{\link{ranger.unify}} for \code{\link[ranger:ranger]{ranger models}}
+#'
+#' \code{\link{randomForest.unify}} for \code{\link[randomForest:randomForest]{randomForest models}}
+#'
+#' @examples
+#' \donttest{
+#' library(gpboost)
+#' param_gpb <- list(objective = "regression", max_depth = 2,
+#'                   force_row_wise = TRUE, num_iterations = 20)
+#' data_fifa <- fifa20$data[!colnames(fifa20$data) %in%
+#'                          c('work_rate', 'value_eur', 'gk_diving', 'gk_handling',
+#'                            'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]
+#' data <- na.omit(cbind(data_fifa, fifa20$target))
+#' sparse_data <- as.matrix(data[,-ncol(data)])
+#' x <- gpboost::gpb.Dataset(sparse_data, label = as.matrix(data[,ncol(data)]))
+#' gpb_data <- gpboost::gpb.Dataset.construct(x)
+#' gpb_model <- gpboost::gpboost(data = gpb_data, params = param_gpb,
+#'                               verbose = -1, num_threads = 0)
+#' unified_model <- gpboost.unify(gpb_model, sparse_data)
+#' shaps <- treeshap(unified_model, data[1:2, ])
+#' plot_contribution(shaps, obs = 1)
+#' }
+gpboost.unify <- function(gpb_model, data, recalculate = FALSE) {
+  if (!requireNamespace("gpboost", quietly = TRUE)) {
+    stop("Package \"gpboost\" needed for this function to work. Please install it.",
+         call. = FALSE)
+  }
+  df <- gpboost::gpb.model.dt.tree(gpb_model)
+  stopifnot(c("split_index", "split_feature", "node_parent", "leaf_index", "leaf_parent", "internal_value",
+              "internal_count", "leaf_value", "leaf_count", "decision_type") %in% colnames(df))
+  df <- data.table::as.data.table(df)
+  #convert node_parent and leaf_parent into one parent column
+  df[is.na(df$node_parent), "node_parent"] <- df[is.na(df$node_parent), "leaf_parent"]
+  #convert values into one column...
+  df[is.na(df$internal_value), "internal_value"] <- df[!is.na(df$leaf_value), "leaf_value"]
+  #...and counts
+  df[is.na(df$internal_count), "internal_count"] <- df[!is.na(df$leaf_count), "leaf_count"]
+  df[["internal_count"]] <- as.numeric(df[["internal_count"]])
+  #convert split_index and leaf_index into one column:
+  max_split_index <- df[, max(split_index, na.rm = TRUE), tree_index]
+  rep_max_split <- rep(max_split_index$V1, times = as.numeric(table(df$tree_index)))
+  new_leaf_index <- rep_max_split + df[, "leaf_index"] + 1
+  df[is.na(df$split_index), "split_index"] <- new_leaf_index[!is.na(new_leaf_index[["leaf_index"]]), 'leaf_index']
+  df[is.na(df$split_gain), "split_gain"] <- df[is.na(df$split_gain), "leaf_value"]
+  # On the basis of column 'Parent', create columns with children: 'Yes', 'No' and 'Missing' like in the xgboost df:
+  ret.first <- function(x) x[1]
+  ret.second <- function(x) x[2]
+  tmp <- data.table::merge.data.table(df[, .(node_parent, tree_index, split_index)], df[, .(tree_index, split_index, default_left, decision_type)],
+                                      by.x = c("tree_index", "node_parent"), by.y = c("tree_index", "split_index"))
+  y_n_m <- unique(tmp[, .(Yes = ifelse(decision_type %in% c("<=", "<"), ret.first(split_index),
+                                       ifelse(decision_type %in% c(">=", ">"), ret.second(split_index), stop("Unknown decision_type"))),
+                          No = ifelse(decision_type %in% c(">=", ">"), ret.first(split_index),
+                                      ifelse(decision_type %in% c("<=", "<"), ret.second(split_index), stop("Unknown decision_type"))),
+                          Missing = ifelse(default_left, ret.first(split_index), ret.second(split_index)),
+                          decision_type = decision_type),
+                    .(tree_index, node_parent)])
+  df <- data.table::merge.data.table(df[, c("tree_index", "depth", "split_index", "split_feature", "node_parent", "split_gain",
+                                            "threshold", "internal_value", "internal_count")],
+                                     y_n_m, by.x = c("tree_index", "split_index"),
+                                     by.y = c("tree_index", "node_parent"), all.x = TRUE)
+  df[decision_type == ">=", decision_type := "<"]
+  df[decision_type == ">", decision_type := "<="]
+  df$Decision.type <- factor(x = df$decision_type, levels = c("<=", "<"))
+  df[is.na(split_index), Decision.type := NA]
+  df <- df[, c("tree_index", "split_index", "split_feature", "Decision.type", "threshold", "Yes", "No", "Missing", "split_gain", "internal_count")]
+  colnames(df) <- c("Tree", "Node", "Feature", "Decision.type", "Split", "Yes", "No", "Missing", "Prediction", "Cover")
+  attr(df, "sorted") <- NULL
+
+  ID <- paste0(df$Node, "-", df$Tree)
+  df$Yes <- match(paste0(df$Yes, "-", df$Tree), ID)
+  df$No <- match(paste0(df$No, "-", df$Tree), ID)
+  df$Missing <- match(paste0(df$Missing, "-", df$Tree), ID)
+
+  # Here we lose "Quality" information
+  df$Prediction[!is.na(df$Feature)] <- NA
+
+  feature_names <- jsonlite::fromJSON(gpb_model$dump_model())$feature_names
+  data <- data[,colnames(data) %in% feature_names]
+
+  ret <- list(model = as.data.frame(df), data = as.data.frame(data), feature_names = feature_names)
+  class(ret) <- "model_unified"
+  attr(ret, "missing_support") <- TRUE
+  attr(ret, "model") <- "gpboost"
+
+  if (recalculate) {
+    ret <- set_reference_dataset(ret, as.data.frame(data))
+  }
+
+  return(ret)
+}
diff --git a/README.Rmd b/README.Rmd
index 39c90b8..e0bed48 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -22,7 +22,7 @@ set.seed(21)
 [![CRAN status](https://www.r-pkg.org/badges/version/treeshap)](https://CRAN.R-project.org/package=treeshap)
 
-In the era of complicated classifiers conquering their market, sometimes even the authors of algorithms do not know the exact manner of building a tree ensemble model. The difficulties in models' structures are one of the reasons why most users use them simply like black-boxes. But, how can they know whether the prediction made by the model is reasonable? `treeshap` is an efficient answer for this question. Due to implementing an optimized algorithm for tree ensemble models (called TreeSHAP), it calculates the SHAP values in polynomial (instead of exponential) time. Currently, `treeshap` supports models produced with `xgboost`, `lightgbm`, `gbm`, `ranger`, and `randomForest` packages. Support for `catboost` is available only in [`catboost` branch](https://github.com/ModelOriented/treeshap/tree/catboost) (see why [here](#catboost)).
+In the era of complicated classifiers conquering their market, sometimes even the authors of algorithms do not know the exact manner of building a tree ensemble model. The difficulties in models' structures are one of the reasons why most users use them simply like black-boxes. But, how can they know whether the prediction made by the model is reasonable? `treeshap` is an efficient answer for this question. Due to implementing an optimized algorithm for tree ensemble models (called TreeSHAP), it calculates the SHAP values in polynomial (instead of exponential) time. Currently, `treeshap` supports models produced with `xgboost`, `lightgbm`, `gpboost`, `gbm`, `ranger`, and `randomForest` packages. Support for `catboost` is available only in [`catboost` branch](https://github.com/ModelOriented/treeshap/tree/catboost) (see why [here](#catboost)).
 
 ## Installation
diff --git a/README.md b/README.md
index b3c1f1c..4616bb6 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ can they know whether the prediction made by the model is reasonable?
 an optimized algorithm for tree ensemble models (called TreeSHAP), it
 calculates the SHAP values in polynomial (instead of exponential) time.
 Currently, `treeshap` supports models produced with `xgboost`,
-`lightgbm`, `gbm`, `ranger`, and `randomForest` packages. Support for
+`lightgbm`, `gpboost`, `gbm`, `ranger`, and `randomForest` packages. Support for
 `catboost` is available only in [`catboost`
 branch](https://github.com/ModelOriented/treeshap/tree/catboost) (see
 why [here](#catboost)).
diff --git a/man/gpboost.unify.Rd b/man/gpboost.unify.Rd
new file mode 100644
index 0000000..2125e77
--- /dev/null
+++ b/man/gpboost.unify.Rd
@@ -0,0 +1,52 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/unify_gpboost.R
+\name{gpboost.unify}
+\alias{gpboost.unify}
+\title{Unify GPBoost model}
+\usage{
+gpboost.unify(gpb_model, data, recalculate = FALSE)
+}
+\arguments{
+\item{gpb_model}{A gpboost model - object of class \code{gpb.Booster}}
+
+\item{data}{Reference dataset. A \code{data.frame} or \code{matrix} with the same columns as in the training set of the model. Usually dataset used to train model.}
+
+\item{recalculate}{logical indicating if covers should be recalculated according to the dataset given in data. Keep it \code{FALSE} if training data are used.}
+}
+\value{
+a unified model representation - a \code{\link{model_unified.object}} object
+}
+\description{
+Convert your GPBoost model into a standardized representation.
+The returned representation is easy to be interpreted by the user and ready to be used as an argument in \code{treeshap()} function.
+}
+\examples{
+\donttest{
+library(gpboost)
+param_gpb <- list(objective = "regression", max_depth = 2,
+                  force_row_wise = TRUE, num_iterations = 20)
+data_fifa <- fifa20$data[!colnames(fifa20$data) \%in\%
+                         c('work_rate', 'value_eur', 'gk_diving', 'gk_handling',
+                           'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]
+data <- na.omit(cbind(data_fifa, fifa20$target))
+sparse_data <- as.matrix(data[,-ncol(data)])
+x <- gpboost::gpb.Dataset(sparse_data, label = as.matrix(data[,ncol(data)]))
+gpb_data <- gpboost::gpb.Dataset.construct(x)
+gpb_model <- gpboost::gpboost(data = gpb_data, params = param_gpb,
+                              verbose = -1, num_threads = 0)
+unified_model <- gpboost.unify(gpb_model, sparse_data)
+shaps <- treeshap(unified_model, data[1:2, ])
+plot_contribution(shaps, obs = 1)
+}
+}
+\seealso{
+\code{\link{gbm.unify}} for \code{\link[gbm:gbm]{GBM models}}
+
+\code{\link{xgboost.unify}} for \code{\link[xgboost:xgboost]{XGBoost models}}
+
+\code{\link{lightgbm.unify}} for \code{\link[lightgbm:lightgbm]{LightGBM models}}
+
+\code{\link{ranger.unify}} for \code{\link[ranger:ranger]{ranger models}}
+
+\code{\link{randomForest.unify}} for \code{\link[randomForest:randomForest]{randomForest models}}
+}
diff --git a/man/unify.Rd b/man/unify.Rd
index f1dcf56..2054022 100644
--- a/man/unify.Rd
+++ b/man/unify.Rd
@@ -41,6 +41,8 @@ The returned representation is easy to be interpreted by the user and ready to b
 \seealso{
 \code{\link{lightgbm.unify}} for \code{\link[lightgbm:lightgbm]{LightGBM models}}
 
+\code{\link{gpboost.unify}} for \code{\link[gpboost:gpboost]{GPBoost models}}
+
 \code{\link{gbm.unify}} for \code{\link[gbm:gbm]{GBM models}}
 
 \code{\link{xgboost.unify}} for \code{\link[xgboost:xgboost]{XGBoost models}}
diff --git a/tests/testthat/test_gpboost_unify.R b/tests/testthat/test_gpboost_unify.R
new file mode 100644
index 0000000..99d42c3
--- /dev/null
+++ b/tests/testthat/test_gpboost_unify.R
@@ -0,0 +1,181 @@
+library(treeshap)
+param_gpboost <- list(objective = "regression",
+                      max_depth = 3,
+                      force_row_wise = TRUE,
+                      learning.rate = 0.1)
+
+data_fifa <- fifa20$data[!colnames(fifa20$data)%in%c('work_rate', 'value_eur', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]
+data <- as.matrix(na.omit(data.table::as.data.table(cbind(data_fifa, fifa20$target))))
+sparse_data <- data[,-ncol(data)]
+x <- gpboost::gpb.Dataset(sparse_data, label = data[,ncol(data)])
+gpb_data <- gpboost::gpb.Dataset.construct(x)
+gpb_fifa <- gpboost::gpboost(data = gpb_data,
+                             params = param_gpboost,
+                             verbose = -1,
+                             num_threads = 0)
+gpbtree <- gpboost::gpb.model.dt.tree(gpb_fifa)
+
+test_that('gpboost.unify returns an object with correct attributes', {
+  unified_model <- gpboost.unify(gpb_fifa, sparse_data)
+
+  expect_equal(attr(unified_model, "missing_support"), TRUE)
+  expect_equal(attr(unified_model, "model"), "gpboost")
+})
+
+test_that('Columns after gpboost.unify are of appropriate type', {
+  unified_model <- gpboost.unify(gpb_fifa, sparse_data)$model
+  expect_true(is.integer(unified_model$Tree))
+  expect_true(is.character(unified_model$Feature))
+  expect_true(is.factor(unified_model$Decision.type))
+  expect_true(is.numeric(unified_model$Split))
+  expect_true(is.integer(unified_model$Yes))
+  expect_true(is.integer(unified_model$No))
+  expect_true(is.integer(unified_model$Missing))
+  expect_true(is.numeric(unified_model$Prediction))
+  expect_true(is.numeric(unified_model$Cover))
+})
+
+test_that('gpboost.unify creates an object of the appropriate class', {
+  expect_true(is.model_unified(gpboost.unify(gpb_fifa, sparse_data)))
+  expect_true(is.model_unified(unify(gpb_fifa, sparse_data)))
+})
+
+test_that('basic columns after gpboost.unify are correct', {
+  unified_model <- gpboost.unify(gpb_fifa, sparse_data)$model
+  expect_equal(gpbtree$tree_index, unified_model$Tree)
+  to_test_features <- gpbtree[order(gpbtree$split_index), .(split_feature,split_index, threshold, leaf_count, internal_count),tree_index]
+  expect_equal(to_test_features[!is.na(to_test_features$split_index),][['split_index']], unified_model[!is.na(unified_model$Feature),][['Node']])
+  expect_equal(to_test_features[['split_feature']], unified_model[['Feature']])
+  expect_equal(to_test_features[['threshold']], unified_model[['Split']])
+  expect_equal(to_test_features[!is.na(internal_count),][['internal_count']], unified_model[!is.na(unified_model$Feature),][['Cover']])
+})
+
+test_that('connections between nodes and leaves after gpboost.unify are correct', {
+  test_object <- as.data.table(gpboost.unify(gpb_fifa, sparse_data)$model)
+  #Check if the sums of children's covers are correct
+  expect_equal(test_object[test_object[!is.na(test_object$Yes)][['Yes']]][['Cover']] +
+                 test_object[test_object[!is.na(test_object$No)][['No']]][['Cover']], test_object[!is.na(Feature)][['Cover']])
+  #check if default_left information is correctly used
+  df_default_left <- gpbtree[default_left == "TRUE", c('tree_index', 'split_index')]
+  test_object_actual_default_left <- test_object[Yes == Missing, c('Tree', 'Node')]
+  colnames(test_object_actual_default_left) <- c('tree_index', 'split_index')
+  attr(test_object_actual_default_left, 'model') <- NULL
+  expect_equal(test_object_actual_default_left[order(tree_index, split_index)], df_default_left[order(tree_index, split_index)])
+  #and default_left = FALSE analogically:
+  df_default_right <- gpbtree[default_left != 'TRUE', c('tree_index', 'split_index')]
+  test_object_actual_default_right <- test_object[No == Missing, c('Tree', 'Node')]
+  colnames(test_object_actual_default_right) <- c('tree_index', 'split_index')
+  attr(test_object_actual_default_right, 'model') <- NULL
+  expect_equal(test_object_actual_default_right[order(tree_index, split_index)], df_default_right[order(tree_index, split_index)])
+  #One more test with checking the usage of 'decision_type' column needed
+})
+
+# Function that returns the predictions for sample observations indicated by a vector containing values -1, 0, 1, where -1 means
+# going to the 'Yes' node, 1 - to the 'No' node and 0 - to the missing node. The vectors are randomly produced while executing
+# the function and should be passed to prepare_original_preds_gpb to keep the comparison consistent. Later we can compare the 'predicted' values
+prepare_test_preds <- function(unify_out){
+  stopifnot(all(c("Tree", "Node", "Feature", "Split", "Yes", "No", "Missing", "Prediction", "Cover") %in% colnames(unify_out)))
+  test_tree <- unify_out[unify_out$Tree %in% 0:9,]
+  test_tree[['node_row_id']] <- seq_len(nrow(test_tree))
+  test_obs <- lapply(table(test_tree$Tree), function(y) sample(c(-1, 0, 1), y, replace = T))
+  test_tree <- split(test_tree, test_tree$Tree)
+  determine_val <- function(obs, tree){
+    root_id <- tree[['node_row_id']][1]
+    tree[,c('Yes', 'No', 'Missing')] <- tree[,c('Yes', 'No', 'Missing')] - root_id + 1
+    i <- 1
+    indx <- 1
+    while(!is.na(tree$Feature[indx])) {
+      indx <- ifelse(obs[i] == 0, tree$Missing[indx], ifelse(obs[i] < 0, tree$Yes[indx], tree$No[indx]))
+      #if(length(is.na(tree$Feature[indx]))>1) {print(paste(indx, i)); print(tree); print(obs)}
+      i <- i + 1
+    }
+    return(tree[['Prediction']][indx])
+  }
+  x = numeric()
+  for(i in seq_along(test_obs)) {
+    x[i] <- determine_val(test_obs[[i]], test_tree[[i]])
+
+  }
+  return(list(preds = x, test_obs = test_obs))
+}
+
+prepare_original_preds_gpb <- function(orig_tree, test_obs){
+  test_tree <- orig_tree[orig_tree$tree_index %in% 0:9,]
+  test_tree <- split(test_tree, test_tree$tree_index)
+  stopifnot(length(test_tree) == length(test_obs))
+  determine_val <- function(obs, tree){
+    i <- 1
+    indx <- 1
+    while(!is.na(tree$split_feature[indx])) {
+      children <- ifelse(is.na(tree$node_parent), tree$leaf_parent, tree$node_parent)
+      if((obs[i] < 0) | (tree$default_left[indx] == 'TRUE' & obs[i] == 0)){
+        indx <- which(tree$split_index[indx] == children)[1]
+      }
+      else if((obs[i] > 0) | (tree$default_left[indx] == 'FALSE' & obs[i] == 0)) {
+        indx <- which(tree$split_index[indx] == children)[2]
+      }
+      else{
+        stop('Error in the connections')
+        indx <- 0
+      }
+      i <- i + 1
+    }
+    return(tree[['leaf_value']][indx])
+  }
+  y = numeric()
+  for(i in seq_along(test_obs)) {
+    y[i] <- determine_val(test_obs[[i]], test_tree[[i]])
+  }
+  return(y)
+}
+
+test_that('the connections between the nodes are correct', {
+  # The test is passed only if the predictions for sample observations are equal in the first 10 trees of the ensemble
+  x <- prepare_test_preds(gpboost.unify(gpb_fifa, sparse_data)$model)
+  preds <- x[['preds']]
+  test_obs <- x[['test_obs']]
+  original_preds <- prepare_original_preds_gpb(gpbtree, test_obs)
+  expect_equal(preds, original_preds)
+})
+
+test_that("gpboost: predictions from unified == original predictions", {
+  unifier <- gpboost.unify(gpb_fifa, sparse_data)
+  obs <- c(1:16000)
+  original <- stats::predict(gpb_fifa, sparse_data[obs, ])
+  from_unified <- predict(unifier, sparse_data[obs, ])
+  expect_equal(from_unified, original)
+  #expect_true(all(abs((from_unified - original) / original) < 10**(-14))) #not needed
+})
+
+test_that("gpboost: mean prediction calculated using predict == using covers", {
+  unifier <- gpboost.unify(gpb_fifa, sparse_data)
+
+  intercept_predict <- mean(predict(unifier, sparse_data))
+
+  ntrees <- sum(unifier$model$Node == 0)
+  leaves <- unifier$model[is.na(unifier$model$Feature), ]
+  intercept_covers <- sum(leaves$Prediction * leaves$Cover) / sum(leaves$Cover) * ntrees
+
+  #expect_true(all(abs((intercept_predict - intercept_covers) / intercept_predict) < 10**(-14)))
+  expect_equal(intercept_predict, intercept_covers)
+})
+
+test_that("gpboost: covers correctness", {
+  unifier <- gpboost.unify(gpb_fifa, sparse_data)
+
+  roots <- unifier$model[unifier$model$Node == 0, ]
+  expect_true(all(roots$Cover == nrow(sparse_data)))
+
+  internals <- unifier$model[!is.na(unifier$model$Feature), ]
+  yes_child_cover <- unifier$model[internals$Yes, ]$Cover
+  no_child_cover <- unifier$model[internals$No, ]$Cover
+  if (all(is.na(internals$Missing))) {
+    children_cover <- yes_child_cover + no_child_cover
+  } else {
+    missing_child_cover <- unifier$model[internals$Missing, ]$Cover
+    missing_child_cover[is.na(missing_child_cover)] <- 0
+    missing_child_cover[internals$Missing == internals$Yes | internals$Missing == internals$No] <- 0
+    children_cover <- yes_child_cover + no_child_cover + missing_child_cover
+  }
+  expect_true(all(internals$Cover == children_cover))
+})
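
For reviewers, a minimal usage sketch of the new support, adapted from the roxygen example added in R/unify_gpboost.R above (it assumes the gpboost package is installed and uses the fifa20 data shipped with treeshap; it is not a separate tested script):

library(treeshap)
library(gpboost)

# prepare the FIFA data as in the documented example: drop goalkeeper and
# non-numeric columns, keep fifa20$target as the last column
data_fifa <- fifa20$data[!colnames(fifa20$data) %in%
                           c('work_rate', 'value_eur', 'gk_diving', 'gk_handling',
                             'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning')]
data <- na.omit(cbind(data_fifa, fifa20$target))
sparse_data <- as.matrix(data[, -ncol(data)])

# fit a small GPBoost regression model
dtrain <- gpboost::gpb.Dataset(sparse_data, label = as.matrix(data[, ncol(data)]))
gpboost::gpb.Dataset.construct(dtrain)
gpb_model <- gpboost::gpboost(data = dtrain,
                              params = list(objective = "regression", max_depth = 2,
                                            num_iterations = 20),
                              verbose = -1)

# unify() now dispatches to gpboost.unify() for gpb.Booster objects;
# the unified representation is then passed to treeshap() as for any other backend
unified <- unify(gpb_model, sparse_data)
shaps <- treeshap(unified, data[1:2, ])
plot_contribution(shaps, obs = 1)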