5 changes: 5 additions & 0 deletions CHANGES.rst
@@ -30,6 +30,11 @@ New features
- :meth:`DataOp.skb.full_report` now accepts a new parameter, ``title``, which is
displayed in the HTML report.
:pr:`1654` by :user:`Marie Sacksick <MarieSacksick>`.
- :class:`DropUninformative` now replaces columns with many null values (but not
entirely null) with missing indicator columns instead of dropping them, preserving
information about whether values were present while avoiding spending feature
dimensions on encoding the actual values. Updated in :pr:`1723` by
:user:`Hanh Tran <honghanhh>`.

Changes
-------
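To make the changelog entry above concrete, here is a minimal sketch of the new behavior (assuming ``DropUninformative`` is importable from the top-level ``skrub`` namespace, as the doctests further down suggest):

```python
import pandas as pd
from skrub import DropUninformative

# A column that is mostly, but not entirely, null (3 of 4 values missing).
col = pd.DataFrame({"col1": [1, None, None, None]})["col1"]

# With a threshold below the null fraction, the column is no longer dropped:
# it is replaced by a float32 indicator, 1.0 where the value was missing.
du = DropUninformative(drop_null_fraction=0.5)
print(du.fit_transform(col))
# 0    0.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: col1, dtype: float32
```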
4 changes: 2 additions & 2 deletions skrub/_data_ops/_choosing.py
@@ -109,8 +109,8 @@ def __repr__(self):
class Choice(BaseChoice):
"""A choice among an enumerated set of outcomes."""

outcomes: list[typing.Any]
outcome_names: typing.Optional[list[str]]
outcomes: typing.List[typing.Any]
outcome_names: typing.Optional[typing.List[str]]
name: typing.Optional[str] = None
chosen_outcome_idx: typing.Optional[int] = None

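The swap from built-in generics to ``typing`` aliases above is presumably for compatibility with Python versions before 3.9, where subscripting built-in types in an annotation that is evaluated at class-definition time raises a ``TypeError``. A quick illustration of the difference:

```python
import typing

# On Python 3.8, evaluating the old annotation fails at definition time:
#     outcomes: list[typing.Any]
#     TypeError: 'type' object is not subscriptable
# The typing aliases behave identically but work on all supported versions:
outcomes: typing.List[typing.Any] = [1, "a", None]
outcome_names: typing.Optional[typing.List[str]] = None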
107 changes: 93 additions & 14 deletions skrub/_drop_uninformative.py
@@ -14,6 +14,12 @@ class DropUninformative(SingleColumnTransformer):
Columns are considered "uninformative" if the fraction of missing values is larger
than a threshold, if they contain one unique value, or if all values are unique.

When a column has too many null values (but not entirely null), it is replaced
by a missing indicator column instead of being dropped. This preserves information
about whether values were present or not while avoiding spending feature dimensions
on encoding the actual values. Columns that are entirely null are still dropped
(as the missing indicator would be constant).

Parameters
----------
drop_if_constant : bool, default=False
@@ -26,8 +32,14 @@ class DropUninformative(SingleColumnTransformer):
lead to dropping columns that contain free-flowing text.

drop_null_fraction : float or None, default=1.0
Drop columns with a fraction of missing values larger than threshold. If None,
keep the column even if all its values are missing.
Fraction of missing values above which the column is replaced by a missing
indicator (instead of being dropped), or dropped if entirely null. If
``drop_null_fraction`` is set to ``1.0``, only entirely null columns are
dropped. If ``drop_null_fraction`` is a number in ``[0.0, 1.0)``, columns
with a fraction of nulls strictly larger than this threshold are replaced by
a missing indicator (or dropped if entirely null). If ``drop_null_fraction``
is ``None``, this check is disabled: no columns are replaced or dropped based
on the number of null values they contain.

See Also
--------
@@ -41,8 +53,10 @@
A column is considered to be "uninformative" if one or more of the following
issues are found:

- The fraction of missing values is larger than a certain fraction (by default,
all values must be null for the column to be dropped).
- The fraction of missing values is larger than a certain fraction. In this case,
if the column is entirely null, it is dropped. Otherwise, it is replaced by
a missing indicator column (a float32 column with ``1.0`` where values were
missing and ``0.0`` elsewhere).
- The column includes only one unique value (the column is constant). Missing
values are considered a separate value.
- The number of unique values in the column is equal to the length of the
@@ -56,19 +70,44 @@
>>> import pandas as pd
>>> df = pd.DataFrame({"col1": [None, None, None]})

By default, only null columns are dropped:
By default, only entirely null columns are dropped. Columns with some nulls
are kept as-is:

>>> du = DropUninformative()
>>> du.fit_transform(df["col1"])
[]

It is also possible to drop constant columns, or specify a lower null fraction
threshold:
>>> df = pd.DataFrame({"col1": [1, None, None, None]})
>>> result = du.fit_transform(df["col1"])
>>> result
0 1.0
1 NaN
2 NaN
3 NaN
Name: col1, dtype: float64

Columns with many nulls (but not entirely null) are replaced by missing indicators
when using a lower threshold:

>>> df = pd.DataFrame({"col1": [1, None, None, None]})
>>> du = DropUninformative(drop_null_fraction=0.5)
>>> result = du.fit_transform(df["col1"])
>>> result
0 0.0
1 1.0
2 1.0
3 1.0
Name: col1, dtype: float32

With a lower threshold, more columns are replaced:

>>> df = pd.DataFrame({"col1": [1, 2, None], "col2": ["const", "const", "const"]})
>>> du = DropUninformative(drop_if_constant=True, drop_null_fraction=0.1)
>>> du.fit_transform(df["col1"])
[]
0 0.0
1 0.0
2 1.0
Name: col1, dtype: float32
>>> du.fit_transform(df["col2"])
[]

@@ -111,13 +150,32 @@ def _check_params(self):
" should be a number in the range [0, 1], or None."
)

def _should_replace_with_missing_indicator(self, column):
"""Check if column should be replaced with missing indicator
instead of dropped."""
if self.drop_null_fraction is None:
return False
if self.drop_null_fraction == 1.0:
# With default threshold, only drop entirely null columns,
# don't replace others
return False
# For other thresholds, replace if fraction exceeds threshold
# but not entirely null
if self._null_count == 0:
return False
null_fraction = self._null_count / len(column)
return null_fraction > self.drop_null_fraction and self._null_count < len(
column
)

def _drop_if_too_many_nulls(self, column):
if self.drop_null_fraction == 1.0:
return self._null_count == len(column)
# No nulls found, or no threshold
if self._null_count == 0 or self.drop_null_fraction is None:
return False
return self._null_count / len(column) > self.drop_null_fraction
# Only drop if entirely null (otherwise we replace with missing indicator)
return self._null_count == len(column)

def _drop_if_constant(self, column):
if self.drop_if_constant:
@@ -147,8 +205,8 @@ def fit_transform(self, column, y=None):
Returns
-------
column
The input column, or an empty list if the column is chosen to be
dropped.
The input column, a missing indicator column (float32), or an empty list
if the column is chosen to be dropped.
"""
del y

@@ -157,6 +215,10 @@ def fit_transform(self, column, y=None):
# Count nulls
self._null_count = sum(sbd.is_null(column))

self.replace_with_indicator_ = self._should_replace_with_missing_indicator(
column
)

self.drop_ = any(
check(column)
for check in [
@@ -166,7 +228,12 @@ def transform(self, column):
]
)

self.all_outputs_ = [] if self.drop_ else [sbd.name(column)]
if self.replace_with_indicator_:
# Store original column name for the missing indicator
self.original_column_name_ = sbd.name(column)
self.all_outputs_ = [sbd.name(column)]
else:
self.all_outputs_ = [] if self.drop_ else [sbd.name(column)]

return self.transform(column)

@@ -181,11 +248,23 @@ def transform(self, column):
Returns
-------
column
The input column, or an empty list if the column is chosen to be
dropped.
The input column, a missing indicator column (float32), or an empty list
if the column is chosen to be dropped.
"""
check_is_fitted(self, "all_outputs_")

if self.drop_:
return []

if self.replace_with_indicator_:
# Build a float32 missing indicator (1.0 for missing, 0.0 for present)
missing_mask = sbd.is_null(column)
# Convert boolean to float32 column
missing_indicator_values = sbd.to_float32(missing_mask)
missing_indicator = sbd.make_column_like(
column, missing_indicator_values, sbd.name(column)
)
return missing_indicator

return column
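The fit/transform pair above implements a three-way decision: keep the column, replace it with an indicator, or drop it. Below is a condensed, self-contained restatement in plain pandas; the ``clean_column`` helper is hypothetical and for illustration only:

```python
import numpy as np
import pandas as pd


def clean_column(col: pd.Series, drop_null_fraction=1.0):
    """Illustrative restatement of the null-handling logic above."""
    null_mask = col.isna()
    n_null, n = int(null_mask.sum()), len(col)
    if drop_null_fraction is None:
        return col  # null-based check disabled: always keep
    if n_null == n:
        return None  # entirely null: drop (the indicator would be constant)
    if drop_null_fraction < 1.0 and n_null / n > drop_null_fraction:
        # Too many nulls: keep only the missingness pattern, as float32.
        return null_mask.astype(np.float32)
    return col  # informative enough: keep as-is


print(clean_column(pd.Series([1.0, None, None, None], name="col1"), 0.5))
# 0    0.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: col1, dtype: float32
```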
39 changes: 23 additions & 16 deletions skrub/_table_vectorizer.py
@@ -162,12 +162,14 @@ class Cleaner(TransformerMixin, BaseEstimator):
Parameters
----------
drop_null_fraction : float or None, default=1.0
Fraction of null above which the column is dropped. If ``drop_null_fraction``
is set to ``1.0``, the column is dropped if it contains only
nulls or NaNs (this is the default behavior). If ``drop_null_fraction`` is a
number in ``[0.0, 1.0)``, the column is dropped if the fraction of nulls
is strictly larger than ``drop_null_fraction``. If ``drop_null_fraction`` is
``None``, this selection is disabled: no columns are dropped based on the
Fraction of nulls above which the column is replaced by a missing indicator
(instead of being dropped), or dropped if entirely null. If
``drop_null_fraction`` is set to ``1.0``, only entirely null columns are
dropped (this is the default behavior). If ``drop_null_fraction`` is a
number in ``[0.0, 1.0)``, columns with a fraction of nulls strictly larger
than ``drop_null_fraction`` are replaced by a missing indicator (or
dropped if entirely null). If ``drop_null_fraction`` is ``None``, this
selection is disabled: no columns are replaced or dropped based on the
number of null values they contain.

drop_if_constant : bool, default=False
@@ -216,9 +218,11 @@ class Cleaner(TransformerMixin, BaseEstimator):
with NA markers.

- ``DropUninformative()``: drop the column if it is considered to be
"uninformative". A column is considered to be "uninformative" if it contains
only missing values (``drop_null_fraction``), only a constant value
(``drop_if_constant``), or if all values are distinct (``drop_if_unique``).
"uninformative", or replace it with a missing indicator. A column is considered
to be "uninformative" if it contains only missing values (``drop_null_fraction``),
only a constant value (``drop_if_constant``), or if all values are distinct
(``drop_if_unique``). When a column has too many null values (but not entirely
null), it is replaced by a missing indicator column instead of being dropped.
By default, the ``Cleaner`` keeps all columns, unless they contain only
missing values.
Note that setting ``drop_if_unique`` to ``True`` may lead to dropping columns
@@ -447,12 +451,13 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
:class:`~sklearn.compose.ColumnTransformer`.

drop_null_fraction : float or None, default=1.0
Fraction of null above which the column is dropped. If `drop_null_fraction` is
set to ``1.0``, the column is dropped if it contains only
nulls or NaNs (this is the default behavior). If `drop_null_fraction` is a
number in ``[0.0, 1.0)``, the column is dropped if the fraction of nulls
is strictly larger than `drop_null_fraction`. If `drop_null_fraction` is ``None``,
this selection is disabled: no columns are dropped based on the number
Fraction of nulls above which the column is replaced by a missing indicator
(instead of being dropped), or dropped if entirely null. If `drop_null_fraction` is
set to ``1.0``, only entirely null columns are dropped (this is the default
behavior). If `drop_null_fraction` is a number in ``[0.0, 1.0)``, columns
with a fraction of nulls strictly larger than `drop_null_fraction` are replaced
by a missing indicator (or dropped if entirely null). If `drop_null_fraction` is ``None``,
this selection is disabled: no columns are replaced or dropped based on the number
of null values they contain.

drop_if_constant : bool, default=False
@@ -625,7 +630,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
Before applying the main transformer, the ``TableVectorizer`` applies
several preprocessing steps, for example to detect numbers or dates that are
represented as strings. By default, columns that contain only null values are
dropped. Moreover, a final post-processing step is applied to all
dropped. Columns with many nulls (but not entirely null) are replaced by missing
indicator columns instead of being dropped, preserving information about whether
values were present or not. Moreover, a final post-processing step is applied to all
non-categorical columns in the encoder's output to cast them to float32.
If ``datetime_format`` is provided, it will be used to parse all datetime
columns.
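End to end, the ``TableVectorizer`` behavior described above can be exercised as in the new test further down. A sketch, which assumes the default numeric handling passes the untouched columns through (cast to float32):

```python
import numpy as np
import pandas as pd
from skrub import TableVectorizer

X = pd.DataFrame(
    {
        "normal": [1, 2, 3, 4, 5],
        "mostly_null": [1.0, np.nan, np.nan, np.nan, np.nan],  # 80% null
        "some_null": [1, 2, None, None, 5],                    # 40% null
    }
)

# With a 0.5 threshold, "mostly_null" (80% null) is replaced by an indicator,
# while "some_null" (40% null) and "normal" are kept.
tv = TableVectorizer(drop_null_fraction=0.5)
out = tv.fit_transform(X)
print(out["mostly_null"].tolist())  # [0.0, 1.0, 1.0, 1.0, 1.0]
```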
5 changes: 3 additions & 2 deletions skrub/datasets/_ken_embeddings.py
@@ -4,6 +4,7 @@
# Required for ignoring lines too long in the docstrings
# flake8: noqa: E501

import typing
import urllib.request
import warnings
from dataclasses import dataclass
@@ -58,7 +59,7 @@ class DatasetAll:
X: pd.DataFrame
y: pd.Series
path: Path
read_csv_kwargs: dict[str]
read_csv_kwargs: typing.Dict[str, typing.Any]

def __eq__(self, other):
"""
@@ -96,7 +97,7 @@ class DatasetInfoOnly:
source: str
target: str
path: Path
read_csv_kwargs: dict[str]
read_csv_kwargs: typing.Dict[str, typing.Any]


def fetch_figshare(
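The annotation fixed above was invalid in two ways: ``dict`` takes two type parameters (key and value), and subscripting the built-in ``dict`` fails outright on Python 3.8. A brief sketch of the corrected form (the example value is hypothetical):

```python
import typing

# ``dict[str]`` omits the value type; the corrected annotation is explicit
# and evaluates on every supported Python version.
read_csv_kwargs: typing.Dict[str, typing.Any] = {"sep": ",", "na_values": ["NA"]}
```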
27 changes: 24 additions & 3 deletions skrub/tests/test_drop_uninformative.py
@@ -61,8 +61,10 @@ def drop_null_table(df_module):
(dict(drop_null_fraction=0.5), "idx", [1, 2, 3]),
(dict(drop_null_fraction=0.5), "value_nan", []),
(dict(drop_null_fraction=0.5), "value_null", []),
(dict(drop_null_fraction=0.5), "value_almost_nan", []),
(dict(drop_null_fraction=0.5), "value_almost_null", []),
# value_almost_nan: 2/3 nulls (66.7% > 0.5), replaced with missing indicator
(dict(drop_null_fraction=0.5), "value_almost_nan", [0.0, 1.0, 1.0]),
# value_almost_null: 2/3 nulls (66.7% > 0.5), replaced with missing indicator
(dict(drop_null_fraction=0.5), "value_almost_null", [0.0, 1.0, 1.0]),
(dict(drop_null_fraction=0.5), "value_mostly_not_nan", [2.5, 2.5, np.nan]),
(
dict(drop_null_fraction=0.5),
@@ -77,7 +79,26 @@ def test_drop_nulls(df_module, drop_null_table, params, column, result):
if result == []:
assert res == result
else:
df_module.assert_column_equal(res, df_module.make_column(column, result))
# For missing indicators (float32), check values directly
if (
params.get("drop_null_fraction", 1.0) != 1.0
and params.get("drop_null_fraction") is not None
and isinstance(result, list)
and all(isinstance(x, (int, float)) and 0 <= x <= 1 for x in result)
):
# This is a missing indicator - check values match
res_values = sbd.to_list(res)
assert len(res_values) == len(result)
for r_val, expected_val in zip(res_values, result):
assert abs(r_val - expected_val) < 1e-6
# Check dtype is float32
dtype_obj = sbd.dtype(res)
dtype_name = getattr(dtype_obj, "name", None)
assert (dtype_name is not None and dtype_name.lower() == "float32") or str(
dtype_obj
).lower() == "float32"
else:
df_module.assert_column_equal(res, df_module.make_column(column, result))


def test_do_not_drop_nulls(df_module, drop_null_table):
Expand Down
36 changes: 36 additions & 0 deletions skrub/tests/test_table_vectorizer.py
@@ -975,6 +975,42 @@ def test_drop_null_column(df_module):
assert sbd.shape(transformed) == (sbd.shape(X)[0], 1)


def test_missing_indicator_replacement(df_module):
"""Check that columns with too many nulls are replaced with missing indicators."""
pytest.importorskip("pyarrow")
[Review comment from a maintainer: Why was this skip added?]


# Create a dataframe with a column that has many nulls but not entirely null
X = df_module.make_dataframe(
{
"normal": [1, 2, 3, 4, 5],
"mostly_null": [1.0, np.nan, np.nan, np.nan, np.nan], # 80% null
"some_null": [1, 2, None, None, 5], # 40% null
}
)

# With threshold 0.5, mostly_null should be replaced with missing indicator
tv = TableVectorizer(drop_null_fraction=0.5)
transformed = tv.fit_transform(X)

# Check that mostly_null was replaced with a missing indicator
assert "mostly_null" in sbd.column_names(transformed)
mostly_null_col = sbd.col(transformed, "mostly_null")
# Should be float32 with values [0.0, 1.0, 1.0, 1.0, 1.0]
dtype_obj = sbd.dtype(mostly_null_col)
dtype_str = str(dtype_obj)
assert (
getattr(dtype_obj, "name", None) == "float32" or dtype_str.lower() == "float32"
)
values = sbd.to_list(mostly_null_col)
assert values == [0.0, 1.0, 1.0, 1.0, 1.0]

# some_null should be kept as-is (40% < 50% threshold)
assert "some_null" in sbd.column_names(transformed)

# normal should be kept as-is
assert "normal" in sbd.column_names(transformed)


def test_date_format(df_module):
# Test that the date format is correctly inferred
