diff --git a/CHANGES.rst b/CHANGES.rst
index 835e7770b..be3829e42 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -30,6 +30,11 @@ New features
 - :meth:`DataOp.skb.full_report` now accepts a new parameter, title, that is
   displayed in the html report. :pr:`1654` by :user:`Marie Sacksick `.
 
+- :class:`DropUninformative` now replaces columns with many null values (but not
+  entirely null) with missing indicator columns instead of dropping them, preserving
+  information about whether values were present while avoiding spending feature
+  dimensions on encoding the actual values. Updated in :pr:`1723` by
+  :user:`Hanh Tran `.
+
 Changes
 -------
diff --git a/skrub/_data_ops/_choosing.py b/skrub/_data_ops/_choosing.py
index d46892fd6..3a1c2e5b1 100644
--- a/skrub/_data_ops/_choosing.py
+++ b/skrub/_data_ops/_choosing.py
@@ -109,8 +109,8 @@ def __repr__(self):
 class Choice(BaseChoice):
     """A choice among an enumerated set of outcomes."""
 
-    outcomes: list[typing.Any]
-    outcome_names: typing.Optional[list[str]]
+    outcomes: typing.List[typing.Any]
+    outcome_names: typing.Optional[typing.List[str]]
     name: typing.Optional[str] = None
     chosen_outcome_idx: typing.Optional[int] = None
 
diff --git a/skrub/_drop_uninformative.py b/skrub/_drop_uninformative.py
index 3c0e580c0..603547ad6 100644
--- a/skrub/_drop_uninformative.py
+++ b/skrub/_drop_uninformative.py
@@ -14,6 +14,12 @@ class DropUninformative(SingleColumnTransformer):
     Columns are considered "uninformative" if the fraction of missing values is
     larger than a threshold, if they contain one unique value, or if all values
     are unique.
 
+    When ``drop_null_fraction`` is below ``1.0`` and a column has too many null values
+    (but is not entirely null), it is replaced by a missing indicator column instead
+    of being dropped. This preserves information about whether values were present
+    while avoiding spending feature dimensions on encoding the actual values.
+    Columns that are entirely null are still dropped (the indicator would be constant).
+
     Parameters
     ----------
     drop_if_constant : bool, default=False
@@ -26,8 +32,14 @@
         lead to dropping columns that contain free-flowing text.
 
     drop_null_fraction : float or None, default=1.0
-        Drop columns with a fraction of missing values larger than threshold. If None,
-        keep the column even if all its values are missing.
+        Fraction of missing values above which the column is replaced by a missing
+        indicator (instead of being dropped), or dropped if entirely null. If
+        ``drop_null_fraction`` is set to ``1.0``, only entirely null columns are
+        dropped. If ``drop_null_fraction`` is a number in ``[0.0, 1.0)``, columns
+        with a fraction of nulls strictly larger than this threshold are replaced by
+        a missing indicator (or dropped if entirely null). If ``drop_null_fraction``
+        is ``None``, this check is disabled: no columns are replaced or dropped based
+        on the number of null values they contain.
 
     See Also
     --------
@@ -41,8 +53,10 @@
     A column is considered to be "uninformative" if one or more of the following
     issues are found:
 
-    - The fraction of missing values is larger than a certain fraction (by default,
-      all values must be null for the column to be dropped).
+    - The fraction of missing values is larger than a certain threshold. In this
+      case, if the column is entirely null, it is dropped. Otherwise, it is
+      replaced by a missing indicator column (a float32 column with ``1.0`` where
+      values were missing and ``0.0`` elsewhere).
    - The column includes only one unique value (the column is constant). Missing
      values are considered a separate value.
    - The number of unique values in the column is equal to the length of the
@@ -56,19 +70,44 @@
     >>> import pandas as pd
     >>> df = pd.DataFrame({"col1": [None, None, None]})
 
-    By default, only null columns are dropped:
+    By default, only entirely null columns are dropped. Columns with some nulls
+    are kept as-is:
 
     >>> du = DropUninformative()
     >>> du.fit_transform(df["col1"])
     []
 
-    It is also possible to drop constant columns, or specify a lower null fraction
-    threshold:
+    >>> df = pd.DataFrame({"col1": [1, None, None, None]})
+    >>> result = du.fit_transform(df["col1"])
+    >>> result
+    0    1.0
+    1    NaN
+    2    NaN
+    3    NaN
+    Name: col1, dtype: float64
+
+    Columns with many nulls (but not entirely null) are replaced by missing
+    indicators when using a lower threshold:
+
+    >>> df = pd.DataFrame({"col1": [1, None, None, None]})
+    >>> du = DropUninformative(drop_null_fraction=0.5)
+    >>> result = du.fit_transform(df["col1"])
+    >>> result
+    0    0.0
+    1    1.0
+    2    1.0
+    3    1.0
+    Name: col1, dtype: float32
+
+    Constant columns can be dropped with ``drop_if_constant``, and an even lower
+    threshold replaces more columns:
 
     >>> df = pd.DataFrame({"col1": [1, 2, None], "col2": ["const", "const", "const"]})
    >>> du = DropUninformative(drop_if_constant=True, drop_null_fraction=0.1)
     >>> du.fit_transform(df["col1"])
-    []
+    0    0.0
+    1    0.0
+    2    1.0
+    Name: col1, dtype: float32
     >>> du.fit_transform(df["col2"])
     []
@@ -111,13 +150,32 @@ def _check_params(self):
                 " should be a number in the range [0, 1], or None."
             )
 
+    def _should_replace_with_missing_indicator(self, column):
+        """Check if the column should be replaced with a missing indicator
+        instead of being dropped."""
+        if self.drop_null_fraction is None:
+            return False
+        if self.drop_null_fraction == 1.0:
+            # With the default threshold, only drop entirely null columns,
+            # don't replace others
+            return False
+        # For other thresholds, replace if the fraction exceeds the threshold
+        # but the column is not entirely null
+        if self._null_count == 0:
+            return False
+        null_fraction = self._null_count / len(column)
+        return null_fraction > self.drop_null_fraction and self._null_count < len(
+            column
+        )
+
     def _drop_if_too_many_nulls(self, column):
         if self.drop_null_fraction == 1.0:
             return self._null_count == len(column)
         # No nulls found, or no threshold
         if self._null_count == 0 or self.drop_null_fraction is None:
             return False
-        return self._null_count / len(column) > self.drop_null_fraction
+        # Only drop if entirely null (otherwise we replace with a missing indicator)
+        return self._null_count == len(column)
 
     def _drop_if_constant(self, column):
         if self.drop_if_constant:
@@ -147,8 +205,8 @@ def fit_transform(self, column, y=None):
 
         Returns
         -------
         column
-            The input column, or an empty list if the column is chosen to be
-            dropped.
+            The input column, a float32 missing indicator column, or an empty
+            list if the column is chosen to be dropped.
""" del y @@ -157,6 +215,10 @@ def fit_transform(self, column, y=None): # Count nulls self._null_count = sum(sbd.is_null(column)) + self.replace_with_indicator_ = self._should_replace_with_missing_indicator( + column + ) + self.drop_ = any( check(column) for check in [ @@ -166,7 +228,12 @@ def fit_transform(self, column, y=None): ] ) - self.all_outputs_ = [] if self.drop_ else [sbd.name(column)] + if self.replace_with_indicator_: + # Store original column name for the missing indicator + self.original_column_name_ = sbd.name(column) + self.all_outputs_ = [sbd.name(column)] + else: + self.all_outputs_ = [] if self.drop_ else [sbd.name(column)] return self.transform(column) @@ -181,11 +248,23 @@ def transform(self, column): Returns ------- column - The input column, or an empty list if the column is chosen to be - dropped. + The input column, a missing indicator column (boolean), or an empty list + if the column is chosen to be dropped. """ check_is_fitted(self, "all_outputs_") if self.drop_: return [] + + if self.replace_with_indicator_: + # Return a boolean column indicating missing values + # (1.0 for missing, 0.0 for present) + missing_mask = sbd.is_null(column) + # Convert boolean to float32 column + missing_indicator_values = sbd.to_float32(missing_mask) + missing_indicator = sbd.make_column_like( + column, missing_indicator_values, sbd.name(column) + ) + return missing_indicator + return column diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 3f48d5b38..872fd5d0d 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -162,12 +162,14 @@ class Cleaner(TransformerMixin, BaseEstimator): Parameters ---------- drop_null_fraction : float or None, default=1.0 - Fraction of null above which the column is dropped. If ``drop_null_fraction`` - is set to ``1.0``, the column is dropped if it contains only - nulls or NaNs (this is the default behavior). If ``drop_null_fraction`` is a - number in ``[0.0, 1.0)``, the column is dropped if the fraction of nulls - is strictly larger than ``drop_null_fraction``. If ``drop_null_fraction`` is - ``None``, this selection is disabled: no columns are dropped based on the + Fraction of null above which the column is replaced by a missing indicator + (instead of being dropped), or dropped if entirely null. If + ``drop_null_fraction`` is set to ``1.0``, only entirely null columns are + dropped (this is the default behavior). If ``drop_null_fraction`` is a + number in ``[0.0, 1.0)``, columns with a fraction of nulls strictly larger + than ``drop_null_fraction`` are replaced by a missing indicator (or + dropped if entirely null). If ``drop_null_fraction`` is ``None``, this + selection is disabled: no columns are replaced or dropped based on the number of null values they contain. drop_if_constant : bool, default=False @@ -216,9 +218,11 @@ class Cleaner(TransformerMixin, BaseEstimator): with NA markers. - ``DropUninformative()``: drop the column if it is considered to be - "uninformative". A column is considered to be "uninformative" if it contains - only missing values (``drop_null_fraction``), only a constant value - (``drop_if_constant``), or if all values are distinct (``drop_if_unique``). + "uninformative", or replace it with a missing indicator. A column is considered + to be "uninformative" if it contains only missing values (``drop_null_fraction``), + only a constant value (``drop_if_constant``), or if all values are distinct + (``drop_if_unique``). 
+      When ``drop_null_fraction`` is set below ``1.0`` and a column has too many
+      null values (but is not entirely null), it is replaced by a missing indicator
+      column instead of being dropped.
 
     By default, the ``Cleaner`` keeps all columns, unless they contain only missing
     values.
     Note that setting ``drop_if_unique`` to ``True`` may lead to dropping columns
@@ -447,12 +451,13 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         :class:`~sklearn.compose.ColumnTransformer`.
 
     drop_null_fraction : float or None, default=1.0
-        Fraction of null above which the column is dropped. If `drop_null_fraction` is
-        set to ``1.0``, the column is dropped if it contains only
-        nulls or NaNs (this is the default behavior). If `drop_null_fraction` is a
-        number in ``[0.0, 1.0)``, the column is dropped if the fraction of nulls
-        is strictly larger than `drop_null_fraction`. If `drop_null_fraction` is ``None``,
-        this selection is disabled: no columns are dropped based on the number
+        Fraction of nulls above which the column is replaced by a missing indicator
+        (instead of being dropped), or dropped if entirely null. If
+        `drop_null_fraction` is set to ``1.0``, only entirely null columns are
+        dropped (this is the default behavior). If `drop_null_fraction` is a number
+        in ``[0.0, 1.0)``, columns with a fraction of nulls strictly larger than
+        `drop_null_fraction` are replaced by a missing indicator (or dropped if
+        entirely null). If `drop_null_fraction` is ``None``, this selection is
+        disabled: no columns are replaced or dropped based on the number
         of null values they contain.
 
     drop_if_constant : bool, default=False
@@ -625,7 +630,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     Before applying the main transformer, the ``TableVectorizer`` applies
     several preprocessing steps, for example to detect numbers or dates that are
     represented as strings. By default, columns that contain only null values are
-    dropped. Moreover, a final post-processing step is applied to all
+    dropped. With `drop_null_fraction` below ``1.0``, columns with many nulls (but not
+    entirely null) are replaced by missing indicators instead of being dropped,
+    preserving which values were present. Moreover, a final post-processing step is applied to all
     non-categorical columns in the encoder's output to cast them to float32.
     If ``datetime_format`` is provided, it will be used to parse all datetime
     columns.
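For illustration, the null-handling rule these docstrings describe can be sketched in plain pandas. This is a simplified sketch, not skrub's implementation: the helper name `replace_or_drop` is made up for this example, and the real code goes through skrub's dataframe-agnostic `sbd` helpers shown in `_drop_uninformative.py`.

```python
import numpy as np
import pandas as pd


def replace_or_drop(column: pd.Series, drop_null_fraction=1.0):
    """Sketch: drop if entirely null, replace with a float32 missing
    indicator if the null fraction exceeds the threshold, else keep."""
    null_count = int(column.isna().sum())
    if drop_null_fraction is not None and null_count == len(column):
        return []  # entirely null: drop
    if drop_null_fraction is None or drop_null_fraction == 1.0:
        return column  # check disabled, or only entirely-null columns dropped
    if null_count / len(column) > drop_null_fraction:
        # 1.0 where the value was missing, 0.0 where it was present
        return column.isna().astype("float32")
    return column


col = pd.Series([1.0, np.nan, np.nan, np.nan], name="col1")
print(replace_or_drop(col, drop_null_fraction=0.5))
# 0    0.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: col1, dtype: float32
```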
diff --git a/skrub/datasets/_ken_embeddings.py b/skrub/datasets/_ken_embeddings.py
index 0dbb28067..aa293e863 100644
--- a/skrub/datasets/_ken_embeddings.py
+++ b/skrub/datasets/_ken_embeddings.py
@@ -4,6 +4,7 @@
 # Required for ignoring lines too long in the docstrings
 # flake8: noqa: E501
 
+import typing
 import urllib.request
 import warnings
 from dataclasses import dataclass
@@ -58,7 +59,7 @@ class DatasetAll:
     X: pd.DataFrame
     y: pd.Series
     path: Path
-    read_csv_kwargs: dict[str]
+    read_csv_kwargs: typing.Dict[str, typing.Any]
 
     def __eq__(self, other):
         """
@@ -96,7 +97,7 @@ class DatasetInfoOnly:
     source: str
     target: str
     path: Path
-    read_csv_kwargs: dict[str]
+    read_csv_kwargs: typing.Dict[str, typing.Any]
 
 
 def fetch_figshare(
diff --git a/skrub/tests/test_drop_uninformative.py b/skrub/tests/test_drop_uninformative.py
index 161df46c8..ba05cf15c 100644
--- a/skrub/tests/test_drop_uninformative.py
+++ b/skrub/tests/test_drop_uninformative.py
@@ -61,8 +61,10 @@ def drop_null_table(df_module):
         (dict(drop_null_fraction=0.5), "idx", [1, 2, 3]),
         (dict(drop_null_fraction=0.5), "value_nan", []),
         (dict(drop_null_fraction=0.5), "value_null", []),
-        (dict(drop_null_fraction=0.5), "value_almost_nan", []),
-        (dict(drop_null_fraction=0.5), "value_almost_null", []),
+        # value_almost_nan: 2/3 nulls (66.7% > 0.5), replaced with missing indicator
+        (dict(drop_null_fraction=0.5), "value_almost_nan", [0.0, 1.0, 1.0]),
+        # value_almost_null: 2/3 nulls (66.7% > 0.5), replaced with missing indicator
+        (dict(drop_null_fraction=0.5), "value_almost_null", [0.0, 1.0, 1.0]),
         (dict(drop_null_fraction=0.5), "value_mostly_not_nan", [2.5, 2.5, np.nan]),
         (
             dict(drop_null_fraction=0.5),
@@ -77,7 +79,26 @@ def test_drop_nulls(df_module, drop_null_table, params, column, result):
     if result == []:
         assert res == result
     else:
-        df_module.assert_column_equal(res, df_module.make_column(column, result))
+        # For missing indicators (float32), check values directly
+        if (
+            params.get("drop_null_fraction", 1.0) != 1.0
+            and params.get("drop_null_fraction") is not None
+            and isinstance(result, list)
+            and all(isinstance(x, (int, float)) and 0 <= x <= 1 for x in result)
+        ):
+            # This is a missing indicator - check values match
+            res_values = sbd.to_list(res)
+            assert len(res_values) == len(result)
+            for r_val, expected_val in zip(res_values, result):
+                assert abs(r_val - expected_val) < 1e-6
+            # Check dtype is float32
+            dtype_obj = sbd.dtype(res)
+            dtype_name = getattr(dtype_obj, "name", None)
+            assert (dtype_name is not None and dtype_name.lower() == "float32") or str(
+                dtype_obj
+            ).lower() == "float32"
+        else:
+            df_module.assert_column_equal(res, df_module.make_column(column, result))
 
 
 def test_do_not_drop_nulls(df_module, drop_null_table):
diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py
index f802a1b8e..83e44fe42 100644
--- a/skrub/tests/test_table_vectorizer.py
+++ b/skrub/tests/test_table_vectorizer.py
@@ -975,6 +975,42 @@ def test_drop_null_column(df_module):
     assert sbd.shape(transformed) == (sbd.shape(X)[0], 1)
 
 
+def test_missing_indicator_replacement(df_module):
+    """Check that columns with too many nulls are replaced with missing indicators."""
+    pytest.importorskip("pyarrow")
+
+    # Create a dataframe with a column that has many nulls but not entirely null
+    X = df_module.make_dataframe(
+        {
+            "normal": [1, 2, 3, 4, 5],
+            "mostly_null": [1.0, np.nan, np.nan, np.nan, np.nan],  # 80% null
+            "some_null": [1, 2, None, None, 5],  # 40% null
+        }
+    )
+
+    # With threshold 0.5, mostly_null should be replaced with a missing indicator
+    tv = TableVectorizer(drop_null_fraction=0.5)
+    transformed = tv.fit_transform(X)
+
+    # Check that mostly_null was replaced with a missing indicator
+    assert "mostly_null" in sbd.column_names(transformed)
+    mostly_null_col = sbd.col(transformed, "mostly_null")
+    # Should be float32 with values [0.0, 1.0, 1.0, 1.0, 1.0]
+    dtype_obj = sbd.dtype(mostly_null_col)
+    dtype_str = str(dtype_obj)
+    assert (
+        getattr(dtype_obj, "name", None) == "float32" or dtype_str.lower() == "float32"
+    )
+    values = sbd.to_list(mostly_null_col)
+    assert values == [0.0, 1.0, 1.0, 1.0, 1.0]
+
+    # some_null should be kept as-is (40% < 50% threshold)
+    assert "some_null" in sbd.column_names(transformed)
+
+    # normal should be kept as-is
+    assert "normal" in sbd.column_names(transformed)
+
+
 def test_date_format(df_module):
     # Test that the date format is correctly inferred
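End to end, the patch surfaces through the public API as exercised by the new test above. A minimal usage sketch, assuming this patch is applied and a pandas input:

```python
import numpy as np
import pandas as pd
from skrub import TableVectorizer

X = pd.DataFrame(
    {
        "normal": [1, 2, 3, 4, 5],
        "mostly_null": [1.0, np.nan, np.nan, np.nan, np.nan],  # 80% null
    }
)

tv = TableVectorizer(drop_null_fraction=0.5)
out = tv.fit_transform(X)

# "mostly_null" survives as a float32 missing indicator instead of being dropped
print(list(out["mostly_null"]))  # expected: [0.0, 1.0, 1.0, 1.0, 1.0]
```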