From 24e19e781366565ae392432a0b5d0f3a24ce46e0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 19 Dec 2024 12:23:39 +0800 Subject: [PATCH] Add datagen for testing string-based categorical data. (#11114) --- python-package/xgboost/data.py | 6 +- python-package/xgboost/testing/__init__.py | 84 +---------- python-package/xgboost/testing/data.py | 135 +++++++++++++++--- .../xgboost/testing/quantile_dmatrix.py | 35 ++++- .../test_device_quantile_dmatrix.py | 8 +- tests/python-gpu/test_from_cudf.py | 4 +- tests/python/test_data_iterator.py | 4 +- tests/python/test_quantile_dmatrix.py | 8 +- 8 files changed, 176 insertions(+), 108 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index d49ff5e43899..9ce2b6b17e1c 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -995,11 +995,7 @@ def _from_uri( _warn_unused_missing(data, missing) handle = ctypes.c_void_p() data = os.fspath(os.path.expanduser(data)) - args = { - "uri": str(data), - "data_split_mode": int(data_split_mode), - } - config = bytes(json.dumps(args), "utf-8") + config = make_jcargs(uri=str(data), data_split_mode=int(data_split_mode)) _check_call(_LIB.XGDMatrixCreateFromURI(config, ctypes.byref(handle))) return handle, feature_names, feature_types diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 53779403917b..29feac569b61 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -37,20 +37,19 @@ import xgboost as xgb from xgboost import RabitTracker from xgboost.core import ArrayLike -from xgboost.data import is_pd_cat_dtype from xgboost.sklearn import SklObjective -from xgboost.testing.data import ( + +from .._typing import PathLike +from .data import ( get_california_housing, get_cancer, get_digits, get_sparse, make_batches, + make_categorical, make_sparse_regression, - memory, ) -from .._typing import PathLike - hypothesis = pytest.importorskip("hypothesis") # pylint:disable=wrong-import-position,wrong-import-order @@ -377,81 +376,6 @@ def __repr__(self) -> str: return self.name -# pylint: disable=too-many-arguments,too-many-locals -@memory.cache -def make_categorical( - n_samples: int, - n_features: int, - n_categories: int, - *, - onehot: bool, - sparsity: float = 0.0, - cat_ratio: float = 1.0, - shuffle: bool = False, - random_state: int = 1994, -) -> Tuple[ArrayLike, np.ndarray]: - """Generate categorical features for test. - - Parameters - ---------- - n_categories: - Number of categories for categorical features. - onehot: - Should we apply one-hot encoding to the data? - sparsity: - The ratio of the amount of missing values over the number of all entries. - cat_ratio: - The ratio of features that are categorical. - shuffle: - Whether we should shuffle the columns. - - Returns - ------- - X, y - """ - import pandas as pd - - rng = np.random.RandomState(random_state) - - pd_dict = {} - for i in range(n_features + 1): - c = rng.randint(low=0, high=n_categories, size=n_samples) - pd_dict[str(i)] = pd.Series(c, dtype=np.int64) - - df = pd.DataFrame(pd_dict) - label = df.iloc[:, 0] - df = df.iloc[:, 1:] - for i in range(0, n_features): - label += df.iloc[:, i] - label += 1 - - categories = np.arange(0, n_categories) - for col in df.columns: - if rng.binomial(1, cat_ratio, size=1)[0] == 1: - df[col] = df[col].astype("category") - df[col] = df[col].cat.set_categories(categories) - - if sparsity > 0.0: - for i in range(n_features): - index = rng.randint( - low=0, high=n_samples - 1, size=int(n_samples * sparsity) - ) - df.iloc[index, i] = np.nan - if is_pd_cat_dtype(df.dtypes.iloc[i]): - assert n_categories == np.unique(df.dtypes.iloc[i].categories).size - - assert df.shape[1] == n_features - if onehot: - df = pd.get_dummies(df) - - if shuffle: - columns = list(df.columns) - rng.shuffle(columns) - df = df[columns] - - return df, label - - def make_ltr( n_samples: int, n_features: int, diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index 34f55c077a85..4ad5915aad88 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -1,7 +1,8 @@ -# pylint: disable=invalid-name +# pylint: disable=invalid-name, too-many-lines """Utilities for data generation.""" import multiprocessing import os +import string import zipfile from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass @@ -14,6 +15,7 @@ List, NamedTuple, Optional, + Set, Tuple, Type, Union, @@ -26,8 +28,10 @@ from numpy.random import Generator as RNG from scipy import sparse -import xgboost -from xgboost.data import pandas_pyarrow_mapper +from ..core import DMatrix, QuantileDMatrix +from ..data import is_pd_cat_dtype, pandas_pyarrow_mapper +from ..sklearn import ArrayLike, XGBRanker +from ..training import train as train_fn if TYPE_CHECKING: from ..compat import DataFrame as DataFrameT @@ -42,7 +46,7 @@ def np_dtypes( n_samples: int, n_features: int ) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]: """Enumerate all supported dtypes from numpy.""" - import pandas as pd + pd = pytest.importorskip("pandas") rng = np.random.RandomState(1994) # Integer and float. @@ -99,7 +103,7 @@ def np_dtypes( def pd_dtypes() -> Generator: """Enumerate all supported pandas extension types.""" - import pandas as pd + pd = pytest.importorskip("pandas") # Integer dtypes = [ @@ -162,8 +166,8 @@ def pd_dtypes() -> Generator: def pd_arrow_dtypes() -> Generator: """Pandas DataFrame with pyarrow backed type.""" - import pandas as pd - import pyarrow as pa + pd = pytest.importorskip("pandas") + pa = pytest.importorskip("pyarrow") # Integer dtypes = pandas_pyarrow_mapper @@ -225,10 +229,10 @@ def check_inf(rng: RNG) -> None: X[5, 2] = np.inf with pytest.raises(ValueError, match="Input data contains `inf`"): - xgboost.QuantileDMatrix(X, y) + QuantileDMatrix(X, y) with pytest.raises(ValueError, match="Input data contains `inf`"): - xgboost.DMatrix(X, y) + DMatrix(X, y) @memory.cache @@ -288,8 +292,10 @@ def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]: Number of categorical features: 10 Number of numerical features: 10 """ - pytest.importorskip("pandas") - import pandas as pd + if TYPE_CHECKING: + import pandas as pd + else: + pd = pytest.importorskip("pandas") rng = np.random.default_rng(1994) n_samples = 1460 @@ -664,7 +670,7 @@ def init_rank_score( y_train = y_train[sorted_idx] qid_train = qid_train[sorted_idx] - ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist") + ltr = XGBRanker(objective="rank:ndcg", tree_method="hist") ltr.fit(X_train, y_train, qid=qid_train) # Use the original order of the data. @@ -799,9 +805,7 @@ def sort_ltr_samples( return data -def run_base_margin_info( - DType: Callable, DMatrixT: Type[xgboost.DMatrix], device: str -) -> None: +def run_base_margin_info(DType: Callable, DMatrixT: Type[DMatrix], device: str) -> None: """Run tests for base margin.""" rng = np.random.default_rng() X = DType(rng.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2)) @@ -814,7 +818,7 @@ def run_base_margin_info( Xy = DMatrixT(X, y, base_margin=base_margin) # Error at train, caused by check in predictor. with pytest.raises(ValueError, match=r".*base_margin.*"): - xgboost.train({"tree_method": "hist", "device": device}, Xy) + train_fn({"tree_method": "hist", "device": device}, Xy) if not hasattr(X, "iloc"): # column major matrix @@ -932,3 +936,102 @@ def random_csc(t_id: int) -> sparse.csc_matrix: return arr, y return csr, y + + +def unique_random_strings(n_strings: int, seed: int) -> List[str]: + """Generate n unique strings.""" + name_len = 8 # hardcoded, should be more than enough + unique_strings: Set[str] = set() + rng = np.random.default_rng(seed) + + while len(unique_strings) < n_strings: + random_str = "".join( + rng.choice(list(string.ascii_letters), size=name_len, replace=True) + ) + unique_strings.add(random_str) + + return list(unique_strings) + + +# pylint: disable=too-many-arguments,too-many-locals,too-many-branches +def make_categorical( + n_samples: int, + n_features: int, + n_categories: int, + *, + onehot: bool, + sparsity: float = 0.0, + cat_ratio: float = 1.0, + shuffle: bool = False, + random_state: int = 1994, + cat_dtype: np.typing.DTypeLike = np.int64, +) -> Tuple[ArrayLike, np.ndarray]: + """Generate categorical features for test. + + Parameters + ---------- + n_categories: + Number of categories for categorical features. + onehot: + Should we apply one-hot encoding to the data? + sparsity: + The ratio of the amount of missing values over the number of all entries. + cat_ratio: + The ratio of features that are categorical. + shuffle: + Whether we should shuffle the columns. + cat_dtype : + The dtype for categorical features, might be string or numeric. + + Returns + ------- + X, y + """ + pd = pytest.importorskip("pandas") + + rng = np.random.RandomState(random_state) + + df = pd.DataFrame() + for i in range(n_features): + choice = rng.binomial(1, cat_ratio, size=1)[0] + if choice == 1: + if np.issubdtype(cat_dtype, np.str_): + categories = np.array(unique_random_strings(n_categories, i)) + c = rng.choice(categories, size=n_samples, replace=True) + else: + categories = np.arange(0, n_categories) + c = rng.randint(low=0, high=n_categories, size=n_samples) + + df[str(i)] = pd.Series(c, dtype="category") + df[str(i)] = df[str(i)].cat.set_categories(categories) + else: + num = rng.randint(low=0, high=n_categories, size=n_samples) + df[str(i)] = pd.Series(num, dtype=num.dtype) + + label = np.zeros(shape=(n_samples,)) + for col in df.columns: + if isinstance(df[col].dtype, pd.CategoricalDtype): + label += df[col].cat.codes + else: + label += df[col] + label += 1 + + if sparsity > 0.0: + for i in range(n_features): + index = rng.randint( + low=0, high=n_samples - 1, size=int(n_samples * sparsity) + ) + df.iloc[index, i] = np.nan + if is_pd_cat_dtype(df.dtypes.iloc[i]): + assert n_categories == np.unique(df.dtypes.iloc[i].categories).size + + assert df.shape[1] == n_features + if onehot: + df = pd.get_dummies(df) + + if shuffle: + columns = list(df.columns) + rng.shuffle(columns) + df = df[columns] + + return df, label diff --git a/python-package/xgboost/testing/quantile_dmatrix.py b/python-package/xgboost/testing/quantile_dmatrix.py index b06cb550198f..f5e861c50098 100644 --- a/python-package/xgboost/testing/quantile_dmatrix.py +++ b/python-package/xgboost/testing/quantile_dmatrix.py @@ -1,11 +1,12 @@ """QuantileDMatrix related tests.""" import numpy as np +import pytest from sklearn.model_selection import train_test_split import xgboost as xgb -from .data import make_batches +from .data import make_batches, make_categorical def check_ref_quantile_cut(device: str) -> None: @@ -33,3 +34,35 @@ def check_ref_quantile_cut(device: str) -> None: Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid) cut_valid = Xy_valid.get_quantile_cut() assert not np.allclose(cut_train[1], cut_valid[1]) + + +def check_categorical_strings(device: str) -> None: + """Check string inputs.""" + if device == "cpu": + pd = pytest.importorskip("pandas") + else: + pd = pytest.importorskip("cudf") + + n_categories = 32 + X, y = make_categorical( + 1024, + 8, + n_categories, + onehot=False, + cat_dtype=np.str_, + cat_ratio=0.5, + shuffle=True, + ) + X = pd.DataFrame(X) + + Xy = xgb.QuantileDMatrix(X, y, enable_categorical=True) + assert Xy.num_col() == 8 + cuts = Xy.get_quantile_cut() + indptr = cuts[0] + values = cuts[1] + for i in range(1, len(indptr)): + f_idx = i - 1 + if isinstance(X[X.columns[f_idx]].dtype, pd.CategoricalDtype): + beg, end = indptr[f_idx], indptr[i] + col = values[beg:end] + np.testing.assert_allclose(col, np.arange(0, n_categories)) diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py index 2f2e6545bf2d..5e3bd79625e5 100644 --- a/tests/python-gpu/test_device_quantile_dmatrix.py +++ b/tests/python-gpu/test_device_quantile_dmatrix.py @@ -8,7 +8,10 @@ from xgboost import testing as tm from xgboost.testing.data import check_inf from xgboost.testing.data_iter import run_mixed_sparsity -from xgboost.testing.quantile_dmatrix import check_ref_quantile_cut +from xgboost.testing.quantile_dmatrix import ( + check_categorical_strings, + check_ref_quantile_cut, +) sys.path.append("tests/python") import test_quantile_dmatrix as tqd @@ -33,6 +36,9 @@ def test_dmatrix_feature_weights(self) -> None: feature_weights.astype(np.float32), ) + def test_categorical_strings(self) -> None: + check_categorical_strings("cuda") + @pytest.mark.skipif(**tm.no_cupy()) def test_dmatrix_cupy_init(self) -> None: import cupy as cp diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index 37826f35cc34..0d138f14b29a 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -210,8 +210,8 @@ def test_cudf_categorical(self) -> None: assert all(t == "c" for t in Xy.feature_types) # mixed dtypes - X["1"] = X["1"].astype(np.int64) - X["3"] = X["3"].astype(np.int64) + X["0"] = X["0"].astype(np.int64) + X["2"] = X["2"].astype(np.int64) df, cat_codes, _, _ = xgb.data._transform_cudf_df( X, None, None, enable_categorical=True ) diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index 545b849b4bdb..6a980f967a97 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -259,13 +259,13 @@ def test_cat_check() -> None: batches = [] for i in range(n_batches): - X, y = tm.make_categorical( + X_df, y_arr = tm.make_categorical( n_samples=n_samples_per_batch, n_features=n_features, n_categories=3, onehot=False, ) - batches.append((X, y)) + batches.append((X_df, y_arr)) X, y = list(zip(*batches)) it = tm.IteratorForTest(X, y, None, cache=None, on_host=False) diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index e1152370732b..e64212265212 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -17,7 +17,10 @@ ) from xgboost.testing.data import check_inf, np_dtypes from xgboost.testing.data_iter import run_mixed_sparsity -from xgboost.testing.quantile_dmatrix import check_ref_quantile_cut +from xgboost.testing.quantile_dmatrix import ( + check_categorical_strings, + check_ref_quantile_cut, +) class TestQuantileDMatrix: @@ -57,6 +60,9 @@ def test_basic(self) -> None: r = np.arange(1.0, n_samples) np.testing.assert_allclose(Xy.get_data().toarray()[1:, 0], r) + def test_categorical_strings(self) -> None: + check_categorical_strings("cpu") + def test_error(self): from sklearn.model_selection import train_test_split