skrub-data
diff --git a/build_tools/generate_data_ops_stub.py b/build_tools/generate_data_ops_stub.py
Lines changed: 4 additions & 2 deletions
diff --git a/doc/generate_data_ops_example_for_index.py b/doc/generate_data_ops_example_for_index.py
Lines changed: 1 addition & 0 deletions
diff --git a/doc/modules/multi_column_operations/multi_column_operations.rst b/doc/modules/multi_column_operations/multi_column_operations.rst
Lines changed: 1 addition & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 0 deletions
diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py
Lines changed: 5 additions & 207 deletions
diff --git a/skrub/_clean_categories.py b/skrub/_clean_categories.py
Lines changed: 2 additions & 2 deletions
diff --git a/skrub/_clean_null_strings.py b/skrub/_clean_null_strings.py
Lines changed: 5 additions & 5 deletions
diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
Lines changed: 2 additions & 2 deletions
diff --git a/skrub/_dispatch.py b/skrub/_dispatch.py
Lines changed: 4 additions & 2 deletions
diff --git a/skrub/_drop_uninformative.py b/skrub/_drop_uninformative.py
Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,8 @@
 file = io.StringIO()
 p = file.write
 
-p("""\
+p(
+    """\
 # fmt: off
 # ruff: noqa
 
@@ -63,7 +64,8 @@
 
 class DataOp:
     skb: SkrubNamespace
-""")
+"""
+)
 
 for name in sorted(skrub.DataOp.__dict__):
     if name not in [
 
@@ -5,6 +5,7 @@
 
 The other cells are used to generate the HTML snippets for the index page.
 """
+
 # We want manual control over the formatting as those snippets are shown in the home page
 # fmt: off
 # ruff: noqa
 
@@ -98,7 +98,7 @@ Name: birthday, dtype: datetime64[...]
 >>> ToDatetime().fit_transform(df["city"])
 Traceback (most recent call last):
     ...
-skrub._apply_to_cols.RejectColumn: Could not find a datetime format for column 'city'.
+skrub._single_column_transformer.RejectColumn: Could not find a datetime format for column 'city'.
 
 It is possible to change how rejected columns are handled through the ``allow_reject``
 parameter.
 
@@ -294,6 +294,8 @@ ignore = [
 # folder.
 "examples/*" = ["E402"]
 "doc/conf.py" = ["E402"]
+# Long exception messages in docstrings
+"skrub/_clean_null_strings.py" = ["E501"]
 
 [tool.pytest.ini_options]
 filterwarnings = [
 
@@ -1,7 +1,4 @@
-import functools
 import itertools
-import re
-import textwrap
 
 from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator, TransformerMixin, clone
@@ -10,213 +7,14 @@
 from . import _dataframe as sbd
 from . import _utils, selectors
 from ._join_utils import pick_column_names
+from ._single_column_transformer import RejectColumn
 
-__all__ = ["ApplyToCols", "SingleColumnTransformer", "RejectColumn"]
+__all__ = ["ApplyToCols"]
 
-_SINGLE_COL_LINE = (
-    "``{class_name}`` is a type of single-column transformer. Unlike most scikit-learn"
-    " estimators, its ``fit``, ``transform`` and ``fit_transform`` methods expect a"
-    " single column (a pandas or polars Series) rather than a full dataframe. To apply"
-    " this transformer to one or more columns in a dataframe, use it as a parameter in"
-    " a ``skrub.ApplyToCols`` or a ``skrub.TableVectorizer``."
-    " To apply to all columns::\n\n"
-    "   ApplyToCols({class_name}())\n\n"
-    "To apply to selected columns::\n\n"
-    "   ApplyToCols({class_name}(), cols=['col_name_1', 'col_name_2'])"
-)
-_SINGLE_COL_PARAGRAPH = textwrap.fill(
-    _SINGLE_COL_LINE, initial_indent="    ", subsequent_indent="    "
-)
-_SINGLE_COL_NOTE = f".. note::\n\n{_SINGLE_COL_PARAGRAPH}\n"
 
-
-class RejectColumn(ValueError):
-    """Used by single-column transformers to indicate they do not apply to a column.
-
-    >>> import pandas as pd
-    >>> from skrub import ToDatetime
-    >>> df = pd.DataFrame(dict(a=['2020-02-02'], b=[12.5]))
-    >>> ToDatetime().fit_transform(df['a'])
-    0   2020-02-02
-    Name: a, dtype: datetime64[...]
-    >>> ToDatetime().fit_transform(df['b'])
-    Traceback (most recent call last):
-        ...
-    skrub._apply_to_cols.RejectColumn: Column 'b' does not contain strings.
+class ApplyToCols(BaseEstimator, TransformerMixin):
     """
-
-    pass
-
-
-class SingleColumnTransformer(BaseEstimator):
-    """Base class for single-column transformers.
-
-    Such transformers are applied independently to each column by
-    ``ApplyToCols``; see the docstring of ``ApplyToCols`` for more
-    information.
-
-    Single-column transformers are not required to inherit from this class in
-    order to work with ``ApplyToCols``, however doing so avoids some
-    boilerplate:
-
-        - The required ``__single_column_transformer__`` attribute is set.
-        - ``fit`` is defined (calls ``fit_transform`` and discards the result).
-        - ``fit``, ``transform`` and ``fit_transform`` are wrapped to check
-          that the input is a single column and raise a ``ValueError`` with a
-          helpful message when it is not.
-        - A note about single-column transformers (vs dataframe transformers)
-          is added after the summary line of the docstring.
-
-    Subclasses must define ``fit_transform`` and ``transform`` (or inherit them
-    from another superclass).
-    """
-
-    __single_column_transformer__ = True
-
-    def fit(self, column, y=None, **kwargs):
-        """Fit the transformer.
-
-        This default implementation simply calls ``fit_transform()`` and
-        returns ``self``.
-
-        Subclasses should implement ``fit_transform`` and ``transform``.
-
-        Parameters
-        ----------
-        column : a pandas or polars Series
-            Unlike most scikit-learn transformers, single-column transformers
-            transform a single column, not a whole dataframe.
-
-        y : column or dataframe
-            Prediction targets.
-
-        **kwargs
-            Extra named arguments are passed to ``self.fit_transform()``.
-
-        Returns
-        -------
-        self
-            The fitted transformer.
-        """
-        self.fit_transform(column, y=y, **kwargs)
-        return self
-
-    def _check_single_column(self, column, function_name):
-        class_name = self.__class__.__name__
-        if sbd.is_dataframe(column):
-            raise ValueError(
-                f"``{class_name}.{function_name}`` should be passed a single column,"
-                " not a dataframe. " + _SINGLE_COL_LINE.format(class_name=class_name)
-            )
-        if not sbd.is_column(column):
-            raise ValueError(
-                f"``{class_name}.{function_name}`` expects the first argument X "
-                "to be a column (a pandas or polars Series). "
-                f"Got X with type: {column.__class__.__name__}."
-            )
-        return column
-
-    def __init_subclass__(subclass, **kwargs):
-        super().__init_subclass__(**kwargs)
-        if subclass.__doc__ is not None:
-            subclass.__doc__ = _insert_after_first_paragraph(
-                subclass.__doc__,
-                _SINGLE_COL_NOTE.format(class_name=subclass.__name__),
-            )
-        for method in "fit", "fit_transform", "transform", "partial_fit":
-            if method in subclass.__dict__:
-                wrapped = _wrap_add_check_single_column(getattr(subclass, method))
-                setattr(subclass, method, wrapped)
-
-    def get_feature_names_out(self, input_features=None):
-        """Get the output feature names.
-
-        Parameters
-        -----------
-        input_features : array-like of str, default=None
-            Input feature names. Ignored.
-
-        Returns
-        --------
-        all_outputs_
-            The names of the output features.
-        """
-        check_is_fitted(self, "all_outputs_")
-        return self.all_outputs_
-
-
-def _wrap_add_check_single_column(f):
-    # as we have only a few predefined functions to handle, using their exact
-    # name and signature in the wrapper definition gives better tracebacks and
-    # autocompletion than just functools.wraps / setting __name__ and
-    # __signature__
-    if f.__name__ == "fit":
-
-        @functools.wraps(f)
-        def fit(self, X, y=None, **kwargs):
-            self._check_single_column(X, f.__name__)
-            return f(self, X, y=y, **kwargs)
-
-        return fit
-    elif f.__name__ == "partial_fit":
-
-        @functools.wraps(f)
-        def partial_fit(self, X, y=None, **kwargs):
-            self._check_single_column(X, f.__name__)
-            return f(self, X, y=y, **kwargs)
-
-        return partial_fit
-
-    elif f.__name__ == "fit_transform":
-
-        @functools.wraps(f)
-        def fit_transform(self, X, y=None, **kwargs):
-            self._check_single_column(X, f.__name__)
-            return f(self, X, y=y, **kwargs)
-
-        return fit_transform
-    else:
-        assert f.__name__ == "transform", f.__name__
-
-        @functools.wraps(f)
-        def transform(self, X, **kwargs):
-            self._check_single_column(X, f.__name__)
-            return f(self, X, **kwargs)
-
-        return transform
-
-
-def _insert_after_first_paragraph(document, text_to_insert):
-    split_doc = document.splitlines(True)
-    indent = min(
-        (
-            len(m.group(1))
-            for line in split_doc[1:]
-            if (m := re.match(r"^( *)\S", line)) is not None
-        ),
-        default=0,
-    )
-    doc_lines = iter(split_doc)
-    output_lines = []
-    for line in doc_lines:
-        output_lines.append(line)
-        if line.strip():
-            break
-    for line in doc_lines:
-        output_lines.append(line)
-        if not line.strip():
-            break
-    else:
-        output_lines.append("\n")
-    for line in text_to_insert.splitlines(True):
-        output_lines.append(line if not line.strip() else " " * indent + line)
-    output_lines.append("\n")
-    output_lines.extend(doc_lines)
-    return "".join(output_lines)
-
-
-class ApplyToCols(TransformerMixin, BaseEstimator):
-    """Map a transformer to columns in a dataframe.
+    Map a transformer to columns in a dataframe.
 
     A separate clone of the transformer is applied to each column separately.
 
@@ -371,7 +169,7 @@ class ApplyToCols(TransformerMixin, BaseEstimator):
     >>> ToDatetime().fit_transform(df["city"])
     Traceback (most recent call last):
         ...
-    skrub._apply_to_cols.RejectColumn: Could not find a datetime format for column 'city'.
+    skrub._single_column_transformer.RejectColumn: Could not find a datetime format for column 'city'.
 
     How these rejections are handled depends on the ``allow_reject`` parameter.
     By default, no special handling is performed and rejections are considered
 
@@ -1,8 +1,8 @@
 import numpy as np
 
 from . import _dataframe as sbd
-from ._apply_to_cols import RejectColumn, SingleColumnTransformer
 from ._dispatch import dispatch, raise_dispatch_unregistered_type
+from ._single_column_transformer import RejectColumn, SingleColumnTransformer
 
 __all__ = ["CleanCategories"]
 
@@ -150,7 +150,7 @@ class CleanCategories(SingleColumnTransformer):
     >>> cleaner.fit_transform(s)
     Traceback (most recent call last):
         ...
-    skrub._apply_to_cols.RejectColumn: Column 'c' is not categorical.
+    skrub._single_column_transformer.RejectColumn: Column 'c' is not categorical.
 
     However once a column has been accepted, the output of ``transform`` will
     always have a categorical dtype:
 
@@ -1,6 +1,6 @@
 from . import _dataframe as sbd
-from ._apply_to_cols import RejectColumn, SingleColumnTransformer
 from ._dispatch import dispatch, raise_dispatch_unregistered_type
+from ._single_column_transformer import RejectColumn, SingleColumnTransformer
 
 __all__ = ["CleanNullStrings"]
 
@@ -162,7 +162,7 @@ class CleanNullStrings(SingleColumnTransformer):
     >>> ToFloat().fit_transform(s)
     Traceback (most recent call last):
         ...
-    skrub._apply_to_cols.RejectColumn: Could not convert column 's' to numbers.
+    skrub._single_column_transformer.RejectColumn: Could not convert column 's' to numbers.
     >>> ToFloat().fit_transform(cleaner.fit_transform(s))
     0    1.1
     1    2.2
@@ -177,7 +177,7 @@ class CleanNullStrings(SingleColumnTransformer):
     >>> cleaner.fit_transform(s)
     Traceback (most recent call last):
         ...
-    skrub._apply_to_cols.RejectColumn: Column 's' does not contain strings.
+    skrub._single_column_transformer.RejectColumn: Column 's' does not contain strings.
 
     In particular, Categorical columns, although they contain strings, do not
     have the ``string`` or ``object`` ``dtype``:
@@ -186,7 +186,7 @@ class CleanNullStrings(SingleColumnTransformer):
     >>> cleaner.fit_transform(s)
     Traceback (most recent call last):
         ...
-    skrub._apply_to_cols.RejectColumn: Column None does not contain strings.
+    skrub._single_column_transformer.RejectColumn: Column None does not contain strings.
 
     Note however that ``object`` columns are accepted even if they do not
     contain any strings. They will not be modified but they will still be
@@ -229,7 +229,7 @@ class CleanNullStrings(SingleColumnTransformer):
     >>> cleaner.fit_transform(s)
     Traceback (most recent call last):
         ...
-    skrub._apply_to_cols.RejectColumn: Column 's' does not contain strings.
+    skrub._single_column_transformer.RejectColumn: Column 's' does not contain strings.
     """
 
     def fit_transform(self, column, y=None):
 
@@ -6,8 +6,8 @@
 from sklearn.utils.validation import check_is_fitted
 
 from . import _dataframe as sbd
-from ._apply_to_cols import RejectColumn, SingleColumnTransformer
 from ._dispatch import dispatch
+from ._single_column_transformer import RejectColumn, SingleColumnTransformer
 from ._sklearn_compat import TransformerTags
 
 __all__ = ["DatetimeEncoder"]
@@ -257,7 +257,7 @@ class DatetimeEncoder(SingleColumnTransformer):
     >>> DatetimeEncoder().fit_transform(s)
     Traceback (most recent call last):
         ...
-    skrub._apply_to_cols.RejectColumn: Column 'birthday' does not have Date or Datetime dtype.
+    skrub._single_column_transformer.RejectColumn: Column 'birthday' does not have Date or Datetime dtype.
 
     :class:`ToDatetime`: can be used for converting strings to datetimes.
 
 
@@ -253,13 +253,15 @@ def raise_dispatch_unregistered_type(obj, kind="object"):
     from ._data_ops import DataOp
 
     if isinstance(obj, DataOp):
-        raise TypeError(f"""Expected a Pandas or Polars {kind}, but got a skrub DataOp.
+        raise TypeError(
+            f"""Expected a Pandas or Polars {kind}, but got a skrub DataOp.
 A function that expects an actual value cannot be applied directly to a DataOp;
 you may want to (i) use op.skb.eval() or op.skb.preview() to evaluate the
 dataop and turn it into an actual value or (ii) use op.skb.apply_func() or
 op.skb.apply() to schedule the operation for later execution (when the dataop is
 evaluated) rather than computing it immediately.
- """)
+ """
+        )
     raise TypeError(
         "Operation not supported on this object. Expecting a Pandas or Polars "
         f"{kind}, but got an object of type {type(obj)}."
 
@@ -3,7 +3,7 @@
 from sklearn.utils.validation import check_is_fitted
 
 from . import _dataframe as sbd
-from ._apply_to_cols import SingleColumnTransformer
+from ._single_column_transformer import SingleColumnTransformer
 
 __all__ = ["DropUninformative"]