Skip to content

Commit e1727e9

Browse files
techy4shri and rcap107 authored
chore: Refactor SingleColumnTransformer so it's in its own file (#1820)
Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
1 parent 05213dc commit e1727e9

32 files changed

+276
-257
lines changed

build_tools/generate_data_ops_stub.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050
file = io.StringIO()
5151
p = file.write
5252

53-
p("""\
53+
p(
54+
"""\
5455
# fmt: off
5556
# ruff: noqa
5657
@@ -63,7 +64,8 @@
6364
6465
class DataOp:
6566
skb: SkrubNamespace
66-
""")
67+
"""
68+
)
6769

6870
for name in sorted(skrub.DataOp.__dict__):
6971
if name not in [

doc/generate_data_ops_example_for_index.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
66
The other cells are used to generate the HTML snippets for the index page.
77
"""
8+
89
# We want manual control over the formatting as those snippets are shown in the home page
910
# fmt: off
1011
# ruff: noqa

doc/modules/multi_column_operations/multi_column_operations.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ Name: birthday, dtype: datetime64[...]
9898
>>> ToDatetime().fit_transform(df["city"])
9999
Traceback (most recent call last):
100100
...
101-
skrub._apply_to_cols.RejectColumn: Could not find a datetime format for column 'city'.
101+
skrub._single_column_transformer.RejectColumn: Could not find a datetime format for column 'city'.
102102

103103
It is possible to change how rejected columns are handled through the ``allow_reject``
104104
parameter.

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,8 @@ ignore = [
294294
# folder.
295295
"examples/*" = ["E402"]
296296
"doc/conf.py" = ["E402"]
297+
# Long exception messages in docstrings
298+
"skrub/_clean_null_strings.py" = ["E501"]
297299

298300
[tool.pytest.ini_options]
299301
filterwarnings = [

skrub/_apply_to_cols.py

Lines changed: 5 additions & 207 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
import functools
21
import itertools
3-
import re
4-
import textwrap
52

63
from joblib import Parallel, delayed
74
from sklearn.base import BaseEstimator, TransformerMixin, clone
@@ -10,213 +7,14 @@
107
from . import _dataframe as sbd
118
from . import _utils, selectors
129
from ._join_utils import pick_column_names
10+
from ._single_column_transformer import RejectColumn
1311

14-
__all__ = ["ApplyToCols", "SingleColumnTransformer", "RejectColumn"]
12+
__all__ = ["ApplyToCols"]
1513

16-
_SINGLE_COL_LINE = (
17-
"``{class_name}`` is a type of single-column transformer. Unlike most scikit-learn"
18-
" estimators, its ``fit``, ``transform`` and ``fit_transform`` methods expect a"
19-
" single column (a pandas or polars Series) rather than a full dataframe. To apply"
20-
" this transformer to one or more columns in a dataframe, use it as a parameter in"
21-
" a ``skrub.ApplyToCols`` or a ``skrub.TableVectorizer``."
22-
" To apply to all columns::\n\n"
23-
" ApplyToCols({class_name}())\n\n"
24-
"To apply to selected columns::\n\n"
25-
" ApplyToCols({class_name}(), cols=['col_name_1', 'col_name_2'])"
26-
)
27-
_SINGLE_COL_PARAGRAPH = textwrap.fill(
28-
_SINGLE_COL_LINE, initial_indent=" ", subsequent_indent=" "
29-
)
30-
_SINGLE_COL_NOTE = f".. note::\n\n{_SINGLE_COL_PARAGRAPH}\n"
3114

32-
33-
class RejectColumn(ValueError):
34-
"""Used by single-column transformers to indicate they do not apply to a column.
35-
36-
>>> import pandas as pd
37-
>>> from skrub import ToDatetime
38-
>>> df = pd.DataFrame(dict(a=['2020-02-02'], b=[12.5]))
39-
>>> ToDatetime().fit_transform(df['a'])
40-
0 2020-02-02
41-
Name: a, dtype: datetime64[...]
42-
>>> ToDatetime().fit_transform(df['b'])
43-
Traceback (most recent call last):
44-
...
45-
skrub._apply_to_cols.RejectColumn: Column 'b' does not contain strings.
15+
class ApplyToCols(BaseEstimator, TransformerMixin):
4616
"""
47-
48-
pass
49-
50-
51-
class SingleColumnTransformer(BaseEstimator):
52-
"""Base class for single-column transformers.
53-
54-
Such transformers are applied independently to each column by
55-
``ApplyToCols``; see the docstring of ``ApplyToCols`` for more
56-
information.
57-
58-
Single-column transformers are not required to inherit from this class in
59-
order to work with ``ApplyToCols``, however doing so avoids some
60-
boilerplate:
61-
62-
- The required ``__single_column_transformer__`` attribute is set.
63-
- ``fit`` is defined (calls ``fit_transform`` and discards the result).
64-
- ``fit``, ``transform`` and ``fit_transform`` are wrapped to check
65-
that the input is a single column and raise a ``ValueError`` with a
66-
helpful message when it is not.
67-
- A note about single-column transformers (vs dataframe transformers)
68-
is added after the summary line of the docstring.
69-
70-
Subclasses must define ``fit_transform`` and ``transform`` (or inherit them
71-
from another superclass).
72-
"""
73-
74-
__single_column_transformer__ = True
75-
76-
def fit(self, column, y=None, **kwargs):
77-
"""Fit the transformer.
78-
79-
This default implementation simply calls ``fit_transform()`` and
80-
returns ``self``.
81-
82-
Subclasses should implement ``fit_transform`` and ``transform``.
83-
84-
Parameters
85-
----------
86-
column : a pandas or polars Series
87-
Unlike most scikit-learn transformers, single-column transformers
88-
transform a single column, not a whole dataframe.
89-
90-
y : column or dataframe
91-
Prediction targets.
92-
93-
**kwargs
94-
Extra named arguments are passed to ``self.fit_transform()``.
95-
96-
Returns
97-
-------
98-
self
99-
The fitted transformer.
100-
"""
101-
self.fit_transform(column, y=y, **kwargs)
102-
return self
103-
104-
def _check_single_column(self, column, function_name):
105-
class_name = self.__class__.__name__
106-
if sbd.is_dataframe(column):
107-
raise ValueError(
108-
f"``{class_name}.{function_name}`` should be passed a single column,"
109-
" not a dataframe. " + _SINGLE_COL_LINE.format(class_name=class_name)
110-
)
111-
if not sbd.is_column(column):
112-
raise ValueError(
113-
f"``{class_name}.{function_name}`` expects the first argument X "
114-
"to be a column (a pandas or polars Series). "
115-
f"Got X with type: {column.__class__.__name__}."
116-
)
117-
return column
118-
119-
def __init_subclass__(subclass, **kwargs):
120-
super().__init_subclass__(**kwargs)
121-
if subclass.__doc__ is not None:
122-
subclass.__doc__ = _insert_after_first_paragraph(
123-
subclass.__doc__,
124-
_SINGLE_COL_NOTE.format(class_name=subclass.__name__),
125-
)
126-
for method in "fit", "fit_transform", "transform", "partial_fit":
127-
if method in subclass.__dict__:
128-
wrapped = _wrap_add_check_single_column(getattr(subclass, method))
129-
setattr(subclass, method, wrapped)
130-
131-
def get_feature_names_out(self, input_features=None):
132-
"""Get the output feature names.
133-
134-
Parameters
135-
-----------
136-
input_features : array-like of str, default=None
137-
Input feature names. Ignored.
138-
139-
Returns
140-
--------
141-
all_outputs_
142-
The names of the output features.
143-
"""
144-
check_is_fitted(self, "all_outputs_")
145-
return self.all_outputs_
146-
147-
148-
def _wrap_add_check_single_column(f):
149-
# as we have only a few predefined functions to handle, using their exact
150-
# name and signature in the wrapper definition gives better tracebacks and
151-
# autocompletion than just functools.wraps / setting __name__ and
152-
# __signature__
153-
if f.__name__ == "fit":
154-
155-
@functools.wraps(f)
156-
def fit(self, X, y=None, **kwargs):
157-
self._check_single_column(X, f.__name__)
158-
return f(self, X, y=y, **kwargs)
159-
160-
return fit
161-
elif f.__name__ == "partial_fit":
162-
163-
@functools.wraps(f)
164-
def partial_fit(self, X, y=None, **kwargs):
165-
self._check_single_column(X, f.__name__)
166-
return f(self, X, y=y, **kwargs)
167-
168-
return partial_fit
169-
170-
elif f.__name__ == "fit_transform":
171-
172-
@functools.wraps(f)
173-
def fit_transform(self, X, y=None, **kwargs):
174-
self._check_single_column(X, f.__name__)
175-
return f(self, X, y=y, **kwargs)
176-
177-
return fit_transform
178-
else:
179-
assert f.__name__ == "transform", f.__name__
180-
181-
@functools.wraps(f)
182-
def transform(self, X, **kwargs):
183-
self._check_single_column(X, f.__name__)
184-
return f(self, X, **kwargs)
185-
186-
return transform
187-
188-
189-
def _insert_after_first_paragraph(document, text_to_insert):
190-
split_doc = document.splitlines(True)
191-
indent = min(
192-
(
193-
len(m.group(1))
194-
for line in split_doc[1:]
195-
if (m := re.match(r"^( *)\S", line)) is not None
196-
),
197-
default=0,
198-
)
199-
doc_lines = iter(split_doc)
200-
output_lines = []
201-
for line in doc_lines:
202-
output_lines.append(line)
203-
if line.strip():
204-
break
205-
for line in doc_lines:
206-
output_lines.append(line)
207-
if not line.strip():
208-
break
209-
else:
210-
output_lines.append("\n")
211-
for line in text_to_insert.splitlines(True):
212-
output_lines.append(line if not line.strip() else " " * indent + line)
213-
output_lines.append("\n")
214-
output_lines.extend(doc_lines)
215-
return "".join(output_lines)
216-
217-
218-
class ApplyToCols(TransformerMixin, BaseEstimator):
219-
"""Map a transformer to columns in a dataframe.
17+
Map a transformer to columns in a dataframe.
22018
22119
A separate clone of the transformer is applied to each column separately.
22220
@@ -371,7 +169,7 @@ class ApplyToCols(TransformerMixin, BaseEstimator):
371169
>>> ToDatetime().fit_transform(df["city"])
372170
Traceback (most recent call last):
373171
...
374-
skrub._apply_to_cols.RejectColumn: Could not find a datetime format for column 'city'.
172+
skrub._single_column_transformer.RejectColumn: Could not find a datetime format for column 'city'.
375173
376174
How these rejections are handled depends on the ``allow_reject`` parameter.
377175
By default, no special handling is performed and rejections are considered

skrub/_clean_categories.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import numpy as np
22

33
from . import _dataframe as sbd
4-
from ._apply_to_cols import RejectColumn, SingleColumnTransformer
54
from ._dispatch import dispatch, raise_dispatch_unregistered_type
5+
from ._single_column_transformer import RejectColumn, SingleColumnTransformer
66

77
__all__ = ["CleanCategories"]
88

@@ -150,7 +150,7 @@ class CleanCategories(SingleColumnTransformer):
150150
>>> cleaner.fit_transform(s)
151151
Traceback (most recent call last):
152152
...
153-
skrub._apply_to_cols.RejectColumn: Column 'c' is not categorical.
153+
skrub._single_column_transformer.RejectColumn: Column 'c' is not categorical.
154154
155155
However once a column has been accepted, the output of ``transform`` will
156156
always have a categorical dtype:

skrub/_clean_null_strings.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from . import _dataframe as sbd
2-
from ._apply_to_cols import RejectColumn, SingleColumnTransformer
32
from ._dispatch import dispatch, raise_dispatch_unregistered_type
3+
from ._single_column_transformer import RejectColumn, SingleColumnTransformer
44

55
__all__ = ["CleanNullStrings"]
66

@@ -162,7 +162,7 @@ class CleanNullStrings(SingleColumnTransformer):
162162
>>> ToFloat().fit_transform(s)
163163
Traceback (most recent call last):
164164
...
165-
skrub._apply_to_cols.RejectColumn: Could not convert column 's' to numbers.
165+
skrub._single_column_transformer.RejectColumn: Could not convert column 's' to numbers.
166166
>>> ToFloat().fit_transform(cleaner.fit_transform(s))
167167
0 1.1
168168
1 2.2
@@ -177,7 +177,7 @@ class CleanNullStrings(SingleColumnTransformer):
177177
>>> cleaner.fit_transform(s)
178178
Traceback (most recent call last):
179179
...
180-
skrub._apply_to_cols.RejectColumn: Column 's' does not contain strings.
180+
skrub._single_column_transformer.RejectColumn: Column 's' does not contain strings.
181181
182182
In particular, Categorical columns, although they contain strings, do not
183183
have the ``string`` or ``object`` ``dtype``:
@@ -186,7 +186,7 @@ class CleanNullStrings(SingleColumnTransformer):
186186
>>> cleaner.fit_transform(s)
187187
Traceback (most recent call last):
188188
...
189-
skrub._apply_to_cols.RejectColumn: Column None does not contain strings.
189+
skrub._single_column_transformer.RejectColumn: Column None does not contain strings.
190190
191191
Note however that ``object`` columns are accepted even if they do not
192192
contain any strings. They will not be modified but they will still be
@@ -229,7 +229,7 @@ class CleanNullStrings(SingleColumnTransformer):
229229
>>> cleaner.fit_transform(s)
230230
Traceback (most recent call last):
231231
...
232-
skrub._apply_to_cols.RejectColumn: Column 's' does not contain strings.
232+
skrub._single_column_transformer.RejectColumn: Column 's' does not contain strings.
233233
"""
234234

235235
def fit_transform(self, column, y=None):

skrub/_datetime_encoder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
from sklearn.utils.validation import check_is_fitted
77

88
from . import _dataframe as sbd
9-
from ._apply_to_cols import RejectColumn, SingleColumnTransformer
109
from ._dispatch import dispatch
10+
from ._single_column_transformer import RejectColumn, SingleColumnTransformer
1111
from ._sklearn_compat import TransformerTags
1212

1313
__all__ = ["DatetimeEncoder"]
@@ -257,7 +257,7 @@ class DatetimeEncoder(SingleColumnTransformer):
257257
>>> DatetimeEncoder().fit_transform(s)
258258
Traceback (most recent call last):
259259
...
260-
skrub._apply_to_cols.RejectColumn: Column 'birthday' does not have Date or Datetime dtype.
260+
skrub._single_column_transformer.RejectColumn: Column 'birthday' does not have Date or Datetime dtype.
261261
262262
:class:`ToDatetime`: can be used for converting strings to datetimes.
263263

skrub/_dispatch.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,13 +253,15 @@ def raise_dispatch_unregistered_type(obj, kind="object"):
253253
from ._data_ops import DataOp
254254

255255
if isinstance(obj, DataOp):
256-
raise TypeError(f"""Expected a Pandas or Polars {kind}, but got a skrub DataOp.
256+
raise TypeError(
257+
f"""Expected a Pandas or Polars {kind}, but got a skrub DataOp.
257258
A function that expects an actual value cannot be applied directly to a DataOp;
258259
you may want to (i) use op.skb.eval() or op.skb.preview() to evaluate the
259260
dataop and turn it into an actual value or (ii) use op.skb.apply_func() or
260261
op.skb.apply() to schedule the operation for later execution (when the dataop is
261262
evaluated) rather than computing it immediately.
262-
""")
263+
"""
264+
)
263265
raise TypeError(
264266
"Operation not supported on this object. Expecting a Pandas or Polars "
265267
f"{kind}, but got an object of type {type(obj)}."

skrub/_drop_uninformative.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from sklearn.utils.validation import check_is_fitted
44

55
from . import _dataframe as sbd
6-
from ._apply_to_cols import SingleColumnTransformer
6+
from ._single_column_transformer import SingleColumnTransformer
77

88
__all__ = ["DropUninformative"]
99

0 commit comments

Comments
 (0)