5 changes: 5 additions & 0 deletions CHANGES.rst
@@ -30,6 +30,11 @@ New features
- :meth:`DataOp.skb.full_report` now accepts a new parameter, ``title``, which is
displayed in the HTML report.
:pr:`1654` by :user:`Marie Sacksick <MarieSacksick>`.
- :class:`DropUninformative` now replaces columns with many null values (but not
entirely null) with missing indicator columns instead of dropping them, preserving
information about whether values were present while avoiding spending feature
dimensions on encoding the actual values. Updated in :pr:`1723` by
:user:`Hanh Tran <honghanhh>`.

Changes
-------
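To make the changelog entry above concrete, here is a minimal sketch of the new behavior (assuming ``DropUninformative`` is importable from the top-level ``skrub`` namespace, as the doctests further down suggest):

```python
import pandas as pd
from skrub import DropUninformative

# A column that is mostly, but not entirely, null (3 of 4 values missing).
col = pd.DataFrame({"col1": [1, None, None, None]})["col1"]

# With a threshold below the null fraction, the column is no longer dropped:
# it is replaced by a float32 indicator, 1.0 where the value was missing.
du = DropUninformative(drop_null_fraction=0.5)
print(du.fit_transform(col))
# 0    0.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: col1, dtype: float32
```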
4 changes: 2 additions & 2 deletions skrub/_data_ops/_choosing.py
@@ -109,8 +109,8 @@ def __repr__(self):
class Choice(BaseChoice):
"""A choice among an enumerated set of outcomes."""

outcomes: list[typing.Any]
outcome_names: typing.Optional[list[str]]
outcomes: typing.List[typing.Any]
outcome_names: typing.Optional[typing.List[str]]
name: typing.Optional[str] = None
chosen_outcome_idx: typing.Optional[int] = None

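The swap from built-in generics to ``typing`` aliases above is presumably for compatibility with Python versions before 3.9, where subscripting built-in types in an annotation that is evaluated at class-definition time raises a ``TypeError``. A quick illustration of the difference:

```python
import typing

# On Python 3.8, evaluating the old annotation fails at definition time:
#     outcomes: list[typing.Any]
#     TypeError: 'type' object is not subscriptable
# The typing aliases behave identically but work on all supported versions:
outcomes: typing.List[typing.Any] = [1, "a", None]
outcome_names: typing.Optional[typing.List[str]] = None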
107 changes: 93 additions & 14 deletions skrub/_drop_uninformative.py
@@ -14,6 +14,12 @@ class DropUninformative(SingleColumnTransformer):
Columns are considered "uninformative" if the fraction of missing values is larger
than a threshold, if they contain one unique value, or if all values are unique.

When a column has too many null values (but not entirely null), it is replaced
by a missing indicator column instead of being dropped. This preserves information
about whether values were present or not while avoiding spending feature dimensions
on encoding the actual values. Columns that are entirely null are still dropped
(as the missing indicator would be constant).

Parameters
----------
drop_if_constant : bool, default=False
@@ -26,8 +32,14 @@ class DropUninformative(SingleColumnTransformer):
lead to dropping columns that contain free-flowing text.

drop_null_fraction : float or None, default=1.0
Drop columns with a fraction of missing values larger than threshold. If None,
keep the column even if all its values are missing.
Fraction of missing values above which the column is replaced by a missing
indicator (instead of being dropped), or dropped if entirely null. If
``drop_null_fraction`` is set to ``1.0``, only entirely null columns are
dropped. If ``drop_null_fraction`` is a number in ``[0.0, 1.0)``, columns
with a fraction of nulls strictly larger than this threshold are replaced by
a missing indicator (or dropped if entirely null). If ``drop_null_fraction``
is ``None``, this check is disabled: no columns are replaced or dropped based
on the number of null values they contain.

See Also
--------
@@ -41,8 +53,10 @@
A column is considered to be "uninformative" if one or more of the following
issues are found:

- The fraction of missing values is larger than a certain fraction (by default,
all values must be null for the column to be dropped).
- The fraction of missing values is larger than a certain fraction. In this case,
if the column is entirely null, it is dropped. Otherwise, it is replaced by
a missing indicator column (a float32 column with ``1.0`` where values were
missing and ``0.0`` elsewhere).
- The column includes only one unique value (the column is constant). Missing
values are considered a separate value.
- The number of unique values in the column is equal to the length of the
@@ -56,19 +70,44 @@
>>> import pandas as pd
>>> df = pd.DataFrame({"col1": [None, None, None]})

By default, only null columns are dropped:
By default, only entirely null columns are dropped. Columns with some nulls
are kept as-is:

>>> du = DropUninformative()
>>> du.fit_transform(df["col1"])
[]

It is also possible to drop constant columns, or specify a lower null fraction
threshold:
>>> df = pd.DataFrame({"col1": [1, None, None, None]})
>>> result = du.fit_transform(df["col1"])
>>> result
0 1.0
1 NaN
2 NaN
3 NaN
Name: col1, dtype: float64

Columns with many nulls (but not entirely null) are replaced by missing indicators
when using a lower threshold:

>>> df = pd.DataFrame({"col1": [1, None, None, None]})
>>> du = DropUninformative(drop_null_fraction=0.5)
>>> result = du.fit_transform(df["col1"])
>>> result
0 0.0
1 1.0
2 1.0
3 1.0
Name: col1, dtype: float32

With a lower threshold, more columns are replaced:

>>> df = pd.DataFrame({"col1": [1, 2, None], "col2": ["const", "const", "const"]})
>>> du = DropUninformative(drop_if_constant=True, drop_null_fraction=0.1)
>>> du.fit_transform(df["col1"])
[]
0 0.0
1 0.0
2 1.0
Name: col1, dtype: float32
>>> du.fit_transform(df["col2"])
[]

@@ -111,13 +150,32 @@ def _check_params(self):
" should be a number in the range [0, 1], or None."
)

def _should_replace_with_missing_indicator(self, column):
"""Check if column should be replaced with missing indicator
instead of dropped."""
if self.drop_null_fraction is None:
return False
if self.drop_null_fraction == 1.0:
# With default threshold, only drop entirely null columns,
# don't replace others
return False
# For other thresholds, replace if fraction exceeds threshold
# but not entirely null
if self._null_count == 0:
return False
null_fraction = self._null_count / len(column)
return null_fraction > self.drop_null_fraction and self._null_count < len(
column
)

def _drop_if_too_many_nulls(self, column):
if self.drop_null_fraction == 1.0:
return self._null_count == len(column)
# No nulls found, or no threshold
if self._null_count == 0 or self.drop_null_fraction is None:
return False
return self._null_count / len(column) > self.drop_null_fraction
# Only drop if entirely null (otherwise we replace with missing indicator)
return self._null_count == len(column)

def _drop_if_constant(self, column):
if self.drop_if_constant:
@@ -147,8 +205,8 @@ def fit_transform(self, column, y=None):
Returns
-------
column
The input column, or an empty list if the column is chosen to be
dropped.
The input column, a missing indicator column (float32), or an empty list
if the column is chosen to be dropped.
"""
del y

@@ -157,6 +215,10 @@ def fit_transform(self, column, y=None):
# Count nulls
self._null_count = sum(sbd.is_null(column))

self.replace_with_indicator_ = self._should_replace_with_missing_indicator(
column
)

self.drop_ = any(
check(column)
for check in [
@@ -166,7 +228,12 @@ def transform(self, column):
]
)

self.all_outputs_ = [] if self.drop_ else [sbd.name(column)]
if self.replace_with_indicator_:
# Store original column name for the missing indicator
self.original_column_name_ = sbd.name(column)
self.all_outputs_ = [sbd.name(column)]
else:
self.all_outputs_ = [] if self.drop_ else [sbd.name(column)]

return self.transform(column)

@@ -181,11 +248,23 @@ def transform(self, column):
Returns
-------
column
The input column, or an empty list if the column is chosen to be
dropped.
The input column, a missing indicator column (float32), or an empty list
if the column is chosen to be dropped.
"""
check_is_fitted(self, "all_outputs_")

if self.drop_:
return []

if self.replace_with_indicator_:
# Build a float32 missing indicator (1.0 for missing, 0.0 for present)
missing_mask = sbd.is_null(column)
# Convert boolean to float32 column
missing_indicator_values = sbd.to_float32(missing_mask)
missing_indicator = sbd.make_column_like(
column, missing_indicator_values, sbd.name(column)
)
return missing_indicator

return column
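The fit/transform pair above implements a three-way decision: keep the column, replace it with an indicator, or drop it. Below is a condensed, self-contained restatement in plain pandas; the ``clean_column`` helper is hypothetical and for illustration only:

```python
import numpy as np
import pandas as pd


def clean_column(col: pd.Series, drop_null_fraction=1.0):
    """Illustrative restatement of the null-handling logic above."""
    null_mask = col.isna()
    n_null, n = int(null_mask.sum()), len(col)
    if drop_null_fraction is None:
        return col  # null-based check disabled: always keep
    if n_null == n:
        return None  # entirely null: drop (the indicator would be constant)
    if drop_null_fraction < 1.0 and n_null / n > drop_null_fraction:
        # Too many nulls: keep only the missingness pattern, as float32.
        return null_mask.astype(np.float32)
    return col  # informative enough: keep as-is


print(clean_column(pd.Series([1.0, None, None, None], name="col1"), 0.5))
# 0    0.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: col1, dtype: float32
```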
39 changes: 23 additions & 16 deletions skrub/_table_vectorizer.py
@@ -162,12 +162,14 @@ class Cleaner(TransformerMixin, BaseEstimator):
Parameters
----------
drop_null_fraction : float or None, default=1.0
Fraction of null above which the column is dropped. If ``drop_null_fraction``
is set to ``1.0``, the column is dropped if it contains only
nulls or NaNs (this is the default behavior). If ``drop_null_fraction`` is a
number in ``[0.0, 1.0)``, the column is dropped if the fraction of nulls
is strictly larger than ``drop_null_fraction``. If ``drop_null_fraction`` is
``None``, this selection is disabled: no columns are dropped based on the
Fraction of nulls above which the column is replaced by a missing indicator
(instead of being dropped), or dropped if entirely null. If
``drop_null_fraction`` is set to ``1.0``, only entirely null columns are
dropped (this is the default behavior). If ``drop_null_fraction`` is a
number in ``[0.0, 1.0)``, columns with a fraction of nulls strictly larger
than ``drop_null_fraction`` are replaced by a missing indicator (or
dropped if entirely null). If ``drop_null_fraction`` is ``None``, this
selection is disabled: no columns are replaced or dropped based on the
number of null values they contain.

drop_if_constant : bool, default=False
@@ -216,9 +218,11 @@ class Cleaner(TransformerMixin, BaseEstimator):
with NA markers.

- ``DropUninformative()``: drop the column if it is considered to be
"uninformative". A column is considered to be "uninformative" if it contains
only missing values (``drop_null_fraction``), only a constant value
(``drop_if_constant``), or if all values are distinct (``drop_if_unique``).
"uninformative", or replace it with a missing indicator. A column is considered
to be "uninformative" if it contains only missing values (``drop_null_fraction``),
only a constant value (``drop_if_constant``), or if all values are distinct
(``drop_if_unique``). When a column has too many null values (but not entirely
null), it is replaced by a missing indicator column instead of being dropped.
By default, the ``Cleaner`` keeps all columns, unless they contain only
missing values.
Note that setting ``drop_if_unique`` to ``True`` may lead to dropping columns
@@ -447,12 +451,13 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
:class:`~sklearn.compose.ColumnTransformer`.

drop_null_fraction : float or None, default=1.0
Fraction of null above which the column is dropped. If `drop_null_fraction` is
set to ``1.0``, the column is dropped if it contains only
nulls or NaNs (this is the default behavior). If `drop_null_fraction` is a
number in ``[0.0, 1.0)``, the column is dropped if the fraction of nulls
is strictly larger than `drop_null_fraction`. If `drop_null_fraction` is ``None``,
this selection is disabled: no columns are dropped based on the number
Fraction of nulls above which the column is replaced by a missing indicator
(instead of being dropped), or dropped if entirely null. If `drop_null_fraction` is
set to ``1.0``, only entirely null columns are dropped (this is the default
behavior). If `drop_null_fraction` is a number in ``[0.0, 1.0)``, columns
with a fraction of nulls strictly larger than `drop_null_fraction` are replaced
by a missing indicator (or dropped if entirely null). If `drop_null_fraction` is ``None``,
this selection is disabled: no columns are replaced or dropped based on the number
of null values they contain.

drop_if_constant : bool, default=False
@@ -625,7 +630,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
Before applying the main transformer, the ``TableVectorizer`` applies
several preprocessing steps, for example to detect numbers or dates that are
represented as strings. By default, columns that contain only null values are
dropped. Moreover, a final post-processing step is applied to all
dropped. Columns with many nulls (but not entirely null) are replaced by missing
indicator columns instead of being dropped, preserving information about whether
values were present or not. Moreover, a final post-processing step is applied to all
non-categorical columns in the encoder's output to cast them to float32.
If ``datetime_format`` is provided, it will be used to parse all datetime
columns.
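End to end, the ``TableVectorizer`` behavior described above can be exercised as in the new test further down. A sketch, which assumes the default numeric handling passes the untouched columns through (cast to float32):

```python
import numpy as np
import pandas as pd
from skrub import TableVectorizer

X = pd.DataFrame(
    {
        "normal": [1, 2, 3, 4, 5],
        "mostly_null": [1.0, np.nan, np.nan, np.nan, np.nan],  # 80% null
        "some_null": [1, 2, None, None, 5],                    # 40% null
    }
)

# With a 0.5 threshold, "mostly_null" (80% null) is replaced by an indicator,
# while "some_null" (40% null) and "normal" are kept.
tv = TableVectorizer(drop_null_fraction=0.5)
out = tv.fit_transform(X)
print(out["mostly_null"].tolist())  # [0.0, 1.0, 1.0, 1.0, 1.0]
```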
5 changes: 3 additions & 2 deletions skrub/datasets/_ken_embeddings.py
@@ -4,6 +4,7 @@
# Required for ignoring lines too long in the docstrings
# flake8: noqa: E501

import typing
import urllib.request
import warnings
from dataclasses import dataclass
@@ -58,7 +59,7 @@ class DatasetAll:
X: pd.DataFrame
y: pd.Series
path: Path
read_csv_kwargs: dict[str]
read_csv_kwargs: typing.Dict[str, typing.Any]

def __eq__(self, other):
"""
@@ -96,7 +97,7 @@ class DatasetInfoOnly:
source: str
target: str
path: Path
read_csv_kwargs: dict[str]
read_csv_kwargs: typing.Dict[str, typing.Any]


def fetch_figshare(
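The annotation fixed above was invalid in two ways: ``dict`` takes two type parameters (key and value), and subscripting the built-in ``dict`` fails outright on Python 3.8. A brief sketch of the corrected form (the example value is hypothetical):

```python
import typing

# ``dict[str]`` omits the value type; the corrected annotation is explicit
# and evaluates on every supported Python version.
read_csv_kwargs: typing.Dict[str, typing.Any] = {"sep": ",", "na_values": ["NA"]}
```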
27 changes: 24 additions & 3 deletions skrub/tests/test_drop_uninformative.py
@@ -61,8 +61,10 @@ def drop_null_table(df_module):
(dict(drop_null_fraction=0.5), "idx", [1, 2, 3]),
(dict(drop_null_fraction=0.5), "value_nan", []),
(dict(drop_null_fraction=0.5), "value_null", []),
(dict(drop_null_fraction=0.5), "value_almost_nan", []),
(dict(drop_null_fraction=0.5), "value_almost_null", []),
# value_almost_nan: 2/3 nulls (66.7% > 0.5), replaced with missing indicator
(dict(drop_null_fraction=0.5), "value_almost_nan", [0.0, 1.0, 1.0]),
# value_almost_null: 2/3 nulls (66.7% > 0.5), replaced with missing indicator
(dict(drop_null_fraction=0.5), "value_almost_null", [0.0, 1.0, 1.0]),
(dict(drop_null_fraction=0.5), "value_mostly_not_nan", [2.5, 2.5, np.nan]),
(
dict(drop_null_fraction=0.5),
@@ -77,7 +79,26 @@ def test_drop_nulls(df_module, drop_null_table, params, column, result):
if result == []:
assert res == result
else:
df_module.assert_column_equal(res, df_module.make_column(column, result))
# For missing indicators (float32), check values directly
if (
params.get("drop_null_fraction", 1.0) != 1.0
and params.get("drop_null_fraction") is not None
and isinstance(result, list)
and all(isinstance(x, (int, float)) and 0 <= x <= 1 for x in result)
):
# This is a missing indicator - check values match
res_values = sbd.to_list(res)
assert len(res_values) == len(result)
for r_val, expected_val in zip(res_values, result):
assert abs(r_val - expected_val) < 1e-6
# Check dtype is float32
dtype_obj = sbd.dtype(res)
dtype_name = getattr(dtype_obj, "name", None)
assert (dtype_name is not None and dtype_name.lower() == "float32") or str(
dtype_obj
).lower() == "float32"
else:
df_module.assert_column_equal(res, df_module.make_column(column, result))


def test_do_not_drop_nulls(df_module, drop_null_table):
Expand Down
36 changes: 36 additions & 0 deletions skrub/tests/test_table_vectorizer.py
@@ -975,6 +975,42 @@ def test_drop_null_column(df_module):
assert sbd.shape(transformed) == (sbd.shape(X)[0], 1)


def test_missing_indicator_replacement(df_module):
"""Check that columns with too many nulls are replaced with missing indicators."""
pytest.importorskip("pyarrow")
[Review comment from a maintainer: Why was this skip added?]


# Create a dataframe with a column that has many nulls but not entirely null
X = df_module.make_dataframe(
{
"normal": [1, 2, 3, 4, 5],
"mostly_null": [1.0, np.nan, np.nan, np.nan, np.nan], # 80% null
"some_null": [1, 2, None, None, 5], # 40% null
}
)

# With threshold 0.5, mostly_null should be replaced with missing indicator
tv = TableVectorizer(drop_null_fraction=0.5)
transformed = tv.fit_transform(X)

# Check that mostly_null was replaced with a missing indicator
assert "mostly_null" in sbd.column_names(transformed)
mostly_null_col = sbd.col(transformed, "mostly_null")
# Should be float32 with values [0.0, 1.0, 1.0, 1.0, 1.0]
dtype_obj = sbd.dtype(mostly_null_col)
dtype_str = str(dtype_obj)
assert (
getattr(dtype_obj, "name", None) == "float32" or dtype_str.lower() == "float32"
)
values = sbd.to_list(mostly_null_col)
assert values == [0.0, 1.0, 1.0, 1.0, 1.0]

# some_null should be kept as-is (40% < 50% threshold)
assert "some_null" in sbd.column_names(transformed)

# normal should be kept as-is
assert "normal" in sbd.column_names(transformed)


def test_date_format(df_module):
# Test that the date format is correctly inferred
