-
Notifications
You must be signed in to change notification settings - Fork 200
FIX - CI OOM issue and some tests failing due to pandas 3.0 #1855
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
cff24a4
e9343e7
a12d817
7eab7ad
54b69b8
d5356b5
d10dfc1
7bf42d7
151f3d5
95a55ee
79c7cd2
c799f4f
5dea689
d9875fd
1bcff05
eac7141
f20be26
fca129d
6de904c
caa587d
f367920
23f1da3
c50e631
336e1a8
4a33982
e35b605
bc0d8a9
2cb8f7d
9a10bc5
ec00402
c699338
69d7c93
b4b2fcf
4c3d304
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,7 +64,6 @@ | |
| # builtin | ||
| "sphinx.ext.autodoc", | ||
| "sphinx.ext.autosummary", | ||
| "sphinx.ext.doctest", | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. doctests are already being run as part of the test matrix |
||
| "sphinx.ext.intersphinx", | ||
| "sphinx.ext.mathjax", | ||
| "sphinx.ext.githubpages", | ||
|
|
@@ -396,24 +395,15 @@ def notebook_modification_function(notebook_content, notebook_filename): | |
| "</div>", | ||
| ] | ||
| ) | ||
| # TODO: remove this when we remove KEN embeddings | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this should also have been removed in #1567 |
||
| if "06_ken_embeddings_example" in notebook_filename: | ||
| message_class = "danger" | ||
| message = ( | ||
| "This example requires PyArrow, which is currently unavailable in Pyodide" | ||
| " (see https://github.com/pyodide/pyodide/issues/2933). Thus, this example" | ||
| " cannot be run in JupyterLite." | ||
| ) | ||
| else: | ||
| message_class = "warning" | ||
| message = ( | ||
| "Running the skrub examples in JupyterLite is experimental and you may" | ||
| "encounter some unexpected behavior.\n\n" | ||
| "The main difference is that imports will take a lot longer than usual, " | ||
| "for example the first `import skrub` can take roughly 10-20s.\n\n" | ||
| "If you notice problems, feel free to open an " | ||
| "[issue](https://github.com/skrub-data/skrub/issues/new/choose) about it." | ||
| ) | ||
| message_class = "warning" | ||
| message = ( | ||
| "Running the skrub examples in JupyterLite is experimental and you may" | ||
| "encounter some unexpected behavior.\n\n" | ||
| "The main difference is that imports will take a lot longer than usual, " | ||
| "for example the first `import skrub` can take roughly 10-20s.\n\n" | ||
| "If you notice problems, feel free to open an " | ||
| "[issue](https://github.com/skrub-data/skrub/issues/new/choose) about it." | ||
| ) | ||
|
|
||
| markdown = warning_template.format(message_class=message_class, message=message) | ||
|
|
||
|
|
@@ -467,6 +457,13 @@ def reset_skrub_config(gallery_conf, fname): | |
| skrub.set_config(**default_global_config) | ||
|
|
||
|
|
||
| def call_garbage_collector(gallery_conf, fname): | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this should help reduce a bit the memory pressure |
||
| """Call the garbage collector to free memory after each example.""" | ||
| import gc | ||
|
|
||
| gc.collect() | ||
|
|
||
|
|
||
| sphinx_gallery_conf = { | ||
| "doc_module": "skrub", | ||
| "backreferences_dir": os.path.join("reference/generated"), | ||
|
|
@@ -490,7 +487,8 @@ def reset_skrub_config(gallery_conf, fname): | |
| "use_jupyter_lab": True, | ||
| }, | ||
| "default_thumb_file": "./_static/skrub.svg", | ||
| "reset_modules": (reset_skrub_config,), | ||
| "reset_modules": (reset_skrub_config, call_garbage_collector), | ||
| "show_memory": True, | ||
| } | ||
| if with_jupyterlite: | ||
| sphinx_gallery_conf["jupyterlite"] = { | ||
|
|
@@ -555,9 +553,6 @@ def reset_skrub_config(gallery_conf, fname): | |
| "pandas.melt": "pandas.melt", | ||
| "pandas.merge": "pandas.merge", | ||
| # Skrub | ||
| "fetch_ken_table_aliases": "skrub.datasets.fetch_ken_table_aliases", | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. again #1567 |
||
| "fetch_ken_types": "skrub.datasets.fetch_ken_types", | ||
| "fetch_ken_embeddings": "skrub.datasets.fetch_ken_embeddings", | ||
| "fuzzy_join": "skrub.fuzzy_join", | ||
| "Joiner": "skrub.Joiner", | ||
| "AggJoiner": "skrub.AggJoiner", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,9 +35,9 @@ Skrub provides helpers to parse datetime string columns automatically: | |
| >>> s = pd.Series(["2024-05-05T13:17:52", None, "2024-05-07T13:17:52"], name="when") | ||
| >>> s | ||
| 0 2024-05-05T13:17:52 | ||
| 1 None | ||
| 1 ... | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. pandas 3.0 has NaN here |
||
| 2 2024-05-07T13:17:52 | ||
| Name: when, dtype: object | ||
| Name: when, dtype: ... | ||
|
|
||
| >>> from skrub import ToDatetime | ||
|
|
||
|
|
@@ -246,15 +246,15 @@ to reduce redundancy: | |
| >>> encoder.fit_transform(login).columns | ||
| Index(['login_year', 'login_month', 'login_day', 'login_hour', | ||
| 'login_total_seconds'], | ||
| dtype='object') | ||
| dtype=...) | ||
| >>> from sklearn.pipeline import make_pipeline | ||
| >>> encoder = make_pipeline(ToDatetime(), DatetimeEncoder(periodic_encoding="circular")) | ||
| >>> encoder.fit_transform(login).columns | ||
| Index(['login_year', 'login_total_seconds', 'login_month_circular_0', | ||
| 'login_month_circular_1', 'login_day_circular_0', | ||
| 'login_day_circular_1', 'login_hour_circular_0', | ||
| 'login_hour_circular_1'], | ||
| dtype='object') | ||
| dtype=...) | ||
|
|
||
|
|
||
| The |DatetimeEncoder| uses hardcoded values for generating periodic features. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,29 +15,7 @@ to be scaled at the same time. | |
| While the heuristics used by the :class:`TableVectorizer` are usually good enough | ||
| to apply the proper transformers to different datatypes, using it may not be an | ||
| option in all cases. In scikit-learn pipelines, the column selection operation can | ||
| is done with the :class:`sklearn.compose.ColumnTransformer`: | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this code block can't run without failing, because pandas 3.0 raises an exception that the minimum-requirements pandas won't accept |
||
|
|
||
|
|
||
| >>> import pandas as pd | ||
| >>> from sklearn.compose import make_column_selector as selector | ||
| >>> from sklearn.compose import make_column_transformer | ||
| >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder | ||
| >>> | ||
| >>> df = pd.DataFrame({"text": ["foo", "bar", "baz"], "number": [1, 2, 3]}) | ||
| >>> | ||
| >>> categorical_columns = selector(dtype_include=object)(df) | ||
| >>> numerical_columns = selector(dtype_exclude=object)(df) | ||
| >>> | ||
| >>> ct = make_column_transformer( | ||
| ... (StandardScaler(), | ||
| ... numerical_columns), | ||
| ... (OneHotEncoder(handle_unknown="ignore"), | ||
| ... categorical_columns)) | ||
| >>> transformed = ct.fit_transform(df) | ||
| >>> transformed | ||
| array([[-1.22474487, 0. , 0. , 1. ], | ||
| [ 0. , 1. , 0. , 0. ], | ||
| [ 1.22474487, 0. , 1. , 0. ]]) | ||
| is done with the :class:`sklearn.compose.ColumnTransformer`. | ||
|
|
||
| Skrub provides alternative transformers that can achieve the same results: | ||
|
|
||
|
|
@@ -69,6 +47,9 @@ example above, which can be rewritten with |ApplyToCols| as follows: | |
| >>> import skrub.selectors as s | ||
| >>> from sklearn.pipeline import make_pipeline | ||
| >>> from skrub import ApplyToCols | ||
| >>> from sklearn.preprocessing import OneHotEncoder, StandardScaler | ||
| >>> import pandas as pd | ||
| >>> df = pd.DataFrame({"text": ["foo", "bar", "baz"], "number": [1, 2, 3]}) | ||
| >>> | ||
| >>> numeric = ApplyToCols(StandardScaler(), cols=s.numeric()) | ||
| >>> string = ApplyToCols(OneHotEncoder(sparse_output=False), cols=s.string()) | ||
|
|
@@ -89,9 +70,9 @@ column: | |
| birthday city | ||
| 0 29/01/2024 London | ||
| >>> df.dtypes | ||
| birthday object | ||
| city object | ||
| dtype: object | ||
| birthday ... | ||
| city ... | ||
| dtype: ... | ||
| >>> ToDatetime().fit_transform(df["birthday"]) | ||
| 0 2024-01-29 | ||
| Name: birthday, dtype: datetime64[...] | ||
|
|
@@ -129,8 +110,8 @@ datetime column. | |
|
|
||
| >>> transformed.dtypes | ||
| birthday datetime64[...] | ||
| city object | ||
| dtype: object | ||
| city ... | ||
| dtype: ... | ||
|
|
||
|
|
||
| |ApplyToFrame| is instead used in cases where multiple columns should be transformed | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these lines should have been removed in #1567