From 53259f54f09d1c9802769644e7e43ce9535f5f9d Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Tue, 3 Dec 2024 19:38:53 -0800 Subject: [PATCH] Update FIL tests to use XGBoost UBJSON instead of binary (#6153) Starting from 2.1, XGBoost uses UBJSON format to serialize models. Replace all uses of the legacy model format with UBJSON. Also make `xgboost` a test dependency of cuML so that the FIL tests run in the CI pipelines. Authors: - Philip Hyunsu Cho (https://github.com/hcho3) Approvers: - William Hicks (https://github.com/wphicks) - Robert Maynard (https://github.com/robertmaynard) - https://github.com/jakirkham URL: https://github.com/rapidsai/cuml/pull/6153 --- .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-125_arch-x86_64.yaml | 1 + dependencies.yaml | 3 ++- .../cuml/tests/experimental/test_filex.py | 20 +++++++++---------- python/cuml/cuml/tests/test_fil.py | 12 +++++------ python/cuml/pyproject.toml | 1 + 6 files changed, 21 insertions(+), 17 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 48f91abaf7..df54cc05c6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -77,4 +77,5 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 +- xgboost>=2.1.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index d401902b41..ec38b5b51e 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -73,4 +73,5 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 +- xgboost>=2.1.0 name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 21f9361b57..e4e4ab141b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -523,6 +523,7 @@ dependencies: - pytest-xdist - seaborn - *scikit_learn + - &xgboost xgboost>=2.1.0 - statsmodels - umap-learn==0.5.6 - pynndescent @@ -537,4 +538,4 @@ dependencies: - pandas - *scikit_learn - seaborn - - xgboost + - *xgboost diff --git a/python/cuml/cuml/tests/experimental/test_filex.py b/python/cuml/cuml/tests/experimental/test_filex.py index 128d2fa512..1b26a939fe 100644 --- a/python/cuml/cuml/tests/experimental/test_filex.py +++ b/python/cuml/cuml/tests/experimental/test_filex.py @@ -163,7 +163,7 @@ def test_fil_classification( X, y, train_size=0.8, random_state=0 ) - model_path = os.path.join(tmp_path, "xgb_class.model") + model_path = os.path.join(tmp_path, "xgb_class.ubj") bst = _build_and_save_xgboost( model_path, @@ -258,7 +258,7 @@ def test_fil_regression( X, y, train_size=train_size, random_state=0 ) - model_path = os.path.join(tmp_path, "xgb_reg.model") + model_path = os.path.join(tmp_path, "xgb_reg.ubj") bst = _build_and_save_xgboost( model_path, X_train, @@ -490,12 +490,12 @@ def test_fil_skl_regression( np.testing.assert_allclose(fil_preds_opt, fil_preds, atol=1.2e-3) -@pytest.fixture(scope="session", params=["binary", "json"]) +@pytest.fixture(scope="session", params=["ubjson", "json"]) def small_classifier_and_preds(tmpdir_factory, request): X, y = simulate_data(500, 10, random_state=43210, classification=True) - ext = "json" if request.param == "json" else "model" - model_type = "xgboost_json" if request.param == "json" else "xgboost" + ext = "json" if request.param == "json" else "ubj" + model_type = "xgboost_json" if request.param == "json" else "xgboost_ubj" model_path = str( tmpdir_factory.mktemp("models").join(f"small_class.{ext}") ) @@ -738,7 +738,7 @@ def test_predict_per_tree( classification=True, ) - model_path = os.path.join(tmp_path, "xgb_class.model") + model_path = os.path.join(tmp_path, "xgb_class.ubj") xgboost_params = {"base_score": (0.5 if n_classes == 2 else 0.0)} bst = _build_and_save_xgboost( @@ -751,7 +751,7 @@ def test_predict_per_tree( xgboost_params=xgboost_params, ) fm = ForestInference.load(model_path, output_class=True) - tl_model = treelite.Model.from_xgboost(bst) + tl_model = treelite.frontend.from_xgboost(bst) pred_per_tree_tl = treelite.gtil.predict_per_tree(tl_model, X) with using_device_type(infer_device): @@ -773,7 +773,7 @@ def test_predict_per_tree( assert pred_per_tree.shape == expected_shape np.testing.assert_almost_equal(sum_by_class, margin_pred, decimal=3) np.testing.assert_almost_equal( - pred_per_tree, pred_per_tree_tl, decimal=3 + pred_per_tree.reshape((n_rows, -1, 1)), pred_per_tree_tl, decimal=3 ) np.testing.assert_almost_equal( pred_per_tree_opt, pred_per_tree, decimal=3 @@ -844,7 +844,7 @@ def test_apply(train_device, infer_device, n_classes, tmp_path): classification=True, ) - model_path = os.path.join(tmp_path, "xgb_class.model") + model_path = os.path.join(tmp_path, "xgb_class.ubj") xgboost_params = {"base_score": (0.5 if n_classes == 2 else 0.0)} bst = _build_and_save_xgboost( @@ -858,7 +858,7 @@ def test_apply(train_device, infer_device, n_classes, tmp_path): ) fm = ForestInference.load( - model_path, output_class=True, model_type="xgboost" + model_path, output_class=True, model_type="xgboost_ubj" ) with using_device_type(infer_device): diff --git a/python/cuml/cuml/tests/test_fil.py b/python/cuml/cuml/tests/test_fil.py index f6b47f09dc..a0e1b6c3f6 100644 --- a/python/cuml/cuml/tests/test_fil.py +++ b/python/cuml/cuml/tests/test_fil.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -155,7 +155,7 @@ def test_fil_classification( X, y, train_size=train_size, random_state=0 ) - model_path = os.path.join(tmp_path, "xgb_class.model") + model_path = os.path.join(tmp_path, "xgb_class.ubj") bst = _build_and_save_xgboost( model_path, @@ -226,7 +226,7 @@ def test_fil_regression(n_rows, n_columns, num_rounds, tmp_path, max_depth): X, y, train_size=train_size, random_state=0 ) - model_path = os.path.join(tmp_path, "xgb_reg.model") + model_path = os.path.join(tmp_path, "xgb_reg.ubj") bst = _build_and_save_xgboost( model_path, X_train, @@ -447,12 +447,12 @@ def test_fil_skl_regression( assert np.allclose(fil_preds, skl_preds, 1.2e-2) -@pytest.fixture(scope="session", params=["binary", "json"]) +@pytest.fixture(scope="session", params=["ubjson", "json"]) def small_classifier_and_preds(tmpdir_factory, request): X, y = simulate_data(500, 10, random_state=43210, classification=True) - ext = "json" if request.param == "json" else "model" - model_type = "xgboost_json" if request.param == "json" else "xgboost" + ext = "json" if request.param == "json" else "ubj" + model_type = "xgboost_json" if request.param == "json" else "xgboost_ubj" model_path = str( tmpdir_factory.mktemp("models").join(f"small_class.{ext}") ) diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml index 6561a0dbb6..4f64be6233 100644 --- a/python/cuml/pyproject.toml +++ b/python/cuml/pyproject.toml @@ -137,6 +137,7 @@ test = [ "seaborn", "statsmodels", "umap-learn==0.5.6", + "xgboost>=2.1.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls]