|
1 | | -import functools |
2 | 1 | import itertools |
3 | | -import re |
4 | | -import textwrap |
5 | 2 |
|
6 | 3 | from joblib import Parallel, delayed |
7 | 4 | from sklearn.base import BaseEstimator, TransformerMixin, clone |
|
10 | 7 | from . import _dataframe as sbd |
11 | 8 | from . import _utils, selectors |
12 | 9 | from ._join_utils import pick_column_names |
| 10 | +from ._single_column_transformer import RejectColumn |
13 | 11 |
|
14 | | -__all__ = ["ApplyToCols", "SingleColumnTransformer", "RejectColumn"] |
| 12 | +__all__ = ["ApplyToCols"] |
15 | 13 |
|
16 | | -_SINGLE_COL_LINE = ( |
17 | | - "``{class_name}`` is a type of single-column transformer. Unlike most scikit-learn" |
18 | | - " estimators, its ``fit``, ``transform`` and ``fit_transform`` methods expect a" |
19 | | - " single column (a pandas or polars Series) rather than a full dataframe. To apply" |
20 | | - " this transformer to one or more columns in a dataframe, use it as a parameter in" |
21 | | - " a ``skrub.ApplyToCols`` or a ``skrub.TableVectorizer``." |
22 | | - " To apply to all columns::\n\n" |
23 | | - " ApplyToCols({class_name}())\n\n" |
24 | | - "To apply to selected columns::\n\n" |
25 | | - " ApplyToCols({class_name}(), cols=['col_name_1', 'col_name_2'])" |
26 | | -) |
27 | | -_SINGLE_COL_PARAGRAPH = textwrap.fill( |
28 | | - _SINGLE_COL_LINE, initial_indent=" ", subsequent_indent=" " |
29 | | -) |
30 | | -_SINGLE_COL_NOTE = f".. note::\n\n{_SINGLE_COL_PARAGRAPH}\n" |
31 | 14 |
|
32 | | - |
33 | | -class RejectColumn(ValueError): |
34 | | - """Used by single-column transformers to indicate they do not apply to a column. |
35 | | -
|
36 | | - >>> import pandas as pd |
37 | | - >>> from skrub import ToDatetime |
38 | | - >>> df = pd.DataFrame(dict(a=['2020-02-02'], b=[12.5])) |
39 | | - >>> ToDatetime().fit_transform(df['a']) |
40 | | - 0 2020-02-02 |
41 | | - Name: a, dtype: datetime64[...] |
42 | | - >>> ToDatetime().fit_transform(df['b']) |
43 | | - Traceback (most recent call last): |
44 | | - ... |
45 | | - skrub._apply_to_cols.RejectColumn: Column 'b' does not contain strings. |
| 15 | +class ApplyToCols(BaseEstimator, TransformerMixin): |
46 | 16 | """ |
47 | | - |
48 | | - pass |
49 | | - |
50 | | - |
51 | | -class SingleColumnTransformer(BaseEstimator): |
52 | | - """Base class for single-column transformers. |
53 | | -
|
54 | | - Such transformers are applied independently to each column by |
55 | | - ``ApplyToCols``; see the docstring of ``ApplyToCols`` for more |
56 | | - information. |
57 | | -
|
58 | | - Single-column transformers are not required to inherit from this class in |
59 | | - order to work with ``ApplyToCols``, however doing so avoids some |
60 | | - boilerplate: |
61 | | -
|
62 | | - - The required ``__single_column_transformer__`` attribute is set. |
63 | | - - ``fit`` is defined (calls ``fit_transform`` and discards the result). |
64 | | - - ``fit``, ``transform`` and ``fit_transform`` are wrapped to check |
65 | | - that the input is a single column and raise a ``ValueError`` with a |
66 | | - helpful message when it is not. |
67 | | - - A note about single-column transformers (vs dataframe transformers) |
68 | | - is added after the summary line of the docstring. |
69 | | -
|
70 | | - Subclasses must define ``fit_transform`` and ``transform`` (or inherit them |
71 | | - from another superclass). |
72 | | - """ |
73 | | - |
74 | | - __single_column_transformer__ = True |
75 | | - |
76 | | - def fit(self, column, y=None, **kwargs): |
77 | | - """Fit the transformer. |
78 | | -
|
79 | | - This default implementation simply calls ``fit_transform()`` and |
80 | | - returns ``self``. |
81 | | -
|
82 | | - Subclasses should implement ``fit_transform`` and ``transform``. |
83 | | -
|
84 | | - Parameters |
85 | | - ---------- |
86 | | - column : a pandas or polars Series |
87 | | - Unlike most scikit-learn transformers, single-column transformers |
88 | | - transform a single column, not a whole dataframe. |
89 | | -
|
90 | | - y : column or dataframe |
91 | | - Prediction targets. |
92 | | -
|
93 | | - **kwargs |
94 | | - Extra named arguments are passed to ``self.fit_transform()``. |
95 | | -
|
96 | | - Returns |
97 | | - ------- |
98 | | - self |
99 | | - The fitted transformer. |
100 | | - """ |
101 | | - self.fit_transform(column, y=y, **kwargs) |
102 | | - return self |
103 | | - |
104 | | - def _check_single_column(self, column, function_name): |
105 | | - class_name = self.__class__.__name__ |
106 | | - if sbd.is_dataframe(column): |
107 | | - raise ValueError( |
108 | | - f"``{class_name}.{function_name}`` should be passed a single column," |
109 | | - " not a dataframe. " + _SINGLE_COL_LINE.format(class_name=class_name) |
110 | | - ) |
111 | | - if not sbd.is_column(column): |
112 | | - raise ValueError( |
113 | | - f"``{class_name}.{function_name}`` expects the first argument X " |
114 | | - "to be a column (a pandas or polars Series). " |
115 | | - f"Got X with type: {column.__class__.__name__}." |
116 | | - ) |
117 | | - return column |
118 | | - |
119 | | - def __init_subclass__(subclass, **kwargs): |
120 | | - super().__init_subclass__(**kwargs) |
121 | | - if subclass.__doc__ is not None: |
122 | | - subclass.__doc__ = _insert_after_first_paragraph( |
123 | | - subclass.__doc__, |
124 | | - _SINGLE_COL_NOTE.format(class_name=subclass.__name__), |
125 | | - ) |
126 | | - for method in "fit", "fit_transform", "transform", "partial_fit": |
127 | | - if method in subclass.__dict__: |
128 | | - wrapped = _wrap_add_check_single_column(getattr(subclass, method)) |
129 | | - setattr(subclass, method, wrapped) |
130 | | - |
131 | | - def get_feature_names_out(self, input_features=None): |
132 | | - """Get the output feature names. |
133 | | -
|
134 | | - Parameters |
135 | | - ----------- |
136 | | - input_features : array-like of str, default=None |
137 | | - Input feature names. Ignored. |
138 | | -
|
139 | | - Returns |
140 | | - -------- |
141 | | - all_outputs_ |
142 | | - The names of the output features. |
143 | | - """ |
144 | | - check_is_fitted(self, "all_outputs_") |
145 | | - return self.all_outputs_ |
146 | | - |
147 | | - |
148 | | -def _wrap_add_check_single_column(f): |
149 | | - # as we have only a few predefined functions to handle, using their exact |
150 | | - # name and signature in the wrapper definition gives better tracebacks and |
151 | | - # autocompletion than just functools.wraps / setting __name__ and |
152 | | - # __signature__ |
153 | | - if f.__name__ == "fit": |
154 | | - |
155 | | - @functools.wraps(f) |
156 | | - def fit(self, X, y=None, **kwargs): |
157 | | - self._check_single_column(X, f.__name__) |
158 | | - return f(self, X, y=y, **kwargs) |
159 | | - |
160 | | - return fit |
161 | | - elif f.__name__ == "partial_fit": |
162 | | - |
163 | | - @functools.wraps(f) |
164 | | - def partial_fit(self, X, y=None, **kwargs): |
165 | | - self._check_single_column(X, f.__name__) |
166 | | - return f(self, X, y=y, **kwargs) |
167 | | - |
168 | | - return partial_fit |
169 | | - |
170 | | - elif f.__name__ == "fit_transform": |
171 | | - |
172 | | - @functools.wraps(f) |
173 | | - def fit_transform(self, X, y=None, **kwargs): |
174 | | - self._check_single_column(X, f.__name__) |
175 | | - return f(self, X, y=y, **kwargs) |
176 | | - |
177 | | - return fit_transform |
178 | | - else: |
179 | | - assert f.__name__ == "transform", f.__name__ |
180 | | - |
181 | | - @functools.wraps(f) |
182 | | - def transform(self, X, **kwargs): |
183 | | - self._check_single_column(X, f.__name__) |
184 | | - return f(self, X, **kwargs) |
185 | | - |
186 | | - return transform |
187 | | - |
188 | | - |
189 | | -def _insert_after_first_paragraph(document, text_to_insert): |
190 | | - split_doc = document.splitlines(True) |
191 | | - indent = min( |
192 | | - ( |
193 | | - len(m.group(1)) |
194 | | - for line in split_doc[1:] |
195 | | - if (m := re.match(r"^( *)\S", line)) is not None |
196 | | - ), |
197 | | - default=0, |
198 | | - ) |
199 | | - doc_lines = iter(split_doc) |
200 | | - output_lines = [] |
201 | | - for line in doc_lines: |
202 | | - output_lines.append(line) |
203 | | - if line.strip(): |
204 | | - break |
205 | | - for line in doc_lines: |
206 | | - output_lines.append(line) |
207 | | - if not line.strip(): |
208 | | - break |
209 | | - else: |
210 | | - output_lines.append("\n") |
211 | | - for line in text_to_insert.splitlines(True): |
212 | | - output_lines.append(line if not line.strip() else " " * indent + line) |
213 | | - output_lines.append("\n") |
214 | | - output_lines.extend(doc_lines) |
215 | | - return "".join(output_lines) |
216 | | - |
217 | | - |
218 | | -class ApplyToCols(TransformerMixin, BaseEstimator): |
219 | | - """Map a transformer to columns in a dataframe. |
| 17 | + Map a transformer to columns in a dataframe. |
220 | 18 |
|
221 | 19 | A separate clone of the transformer is applied to each column separately. |
222 | 20 |
|
@@ -371,7 +169,7 @@ class ApplyToCols(TransformerMixin, BaseEstimator): |
371 | 169 | >>> ToDatetime().fit_transform(df["city"]) |
372 | 170 | Traceback (most recent call last): |
373 | 171 | ... |
374 | | - skrub._apply_to_cols.RejectColumn: Could not find a datetime format for column 'city'. |
| 172 | + skrub._single_column_transformer.RejectColumn: Could not find a datetime format for column 'city'. |
375 | 173 |
|
376 | 174 | How these rejections are handled depends on the ``allow_reject`` parameter. |
377 | 175 | By default, no special handling is performed and rejections are considered |
|
0 commit comments