34 commits
cff24a4
testing [doc build]
rcap107 Jan 21, 2026
e9343e7
WIP - looking for source of issue
rcap107 Jan 21, 2026
a12d817
testing [doc build]
rcap107 Jan 21, 2026
7eab7ad
testing a different file [doc build]
rcap107 Jan 21, 2026
54b69b8
updating examples [doc build]
rcap107 Jan 23, 2026
d5356b5
updating lock file [doc build]
rcap107 Jan 23, 2026
d10dfc1
Addressing some failing tests
rcap107 Jan 23, 2026
7bf42d7
fixing a nonbroken test
rcap107 Jan 23, 2026
151f3d5
updating lock file [doc build]
rcap107 Jan 23, 2026
95a55ee
removing some unused changes
rcap107 Jan 23, 2026
79c7cd2
fixing failing doctests in user guide
rcap107 Jan 23, 2026
c799f4f
Fixing more broken tests
rcap107 Jan 23, 2026
5dea689
wip trying to find a fix for a doctest
rcap107 Jan 24, 2026
d9875fd
removing columntransformer example
rcap107 Jan 26, 2026
1bcff05
fixing example [doc build]
rcap107 Jan 26, 2026
eac7141
updating lock file [doc build]
rcap107 Jan 26, 2026
f20be26
splitting doctests [doc build]
rcap107 Jan 26, 2026
fca129d
removing problematic example [doc build]
rcap107 Jan 26, 2026
6de904c
moving back example
rcap107 Jan 26, 2026
caa587d
Updating lock, pinning numpy
rcap107 Jan 26, 2026
f367920
adding split build, updating config [doc build]
rcap107 Jan 26, 2026
23f1da3
adding resource monitoring, fixing env
rcap107 Jan 26, 2026
c50e631
unpinning numpy, removing redundant doctest [doc build]
rcap107 Jan 26, 2026
336e1a8
pinning numpy again [doc build]
rcap107 Jan 26, 2026
4a33982
unpinning np, testing joiner optim [doc build]
rcap107 Jan 27, 2026
e35b605
disabling one of the examples [doc build]
rcap107 Jan 27, 2026
bc0d8a9
updating build doc [doc build]
rcap107 Jan 27, 2026
2cb8f7d
WIP removing nightlies to check if gc collect fixes things
rcap107 Jan 28, 2026
9a10bc5
testing gc collect fix [doc build]
rcap107 Jan 28, 2026
ec00402
back to original build doc [doc build]
rcap107 Jan 28, 2026
c699338
increasing size of circleci container [doc build]
rcap107 Jan 28, 2026
69d7c93
updating circle config
rcap107 Jan 28, 2026
b4b2fcf
cleaning up failed fixes, removing nightlies
rcap107 Jan 28, 2026
4c3d304
more cleanup
rcap107 Jan 28, 2026
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -4,6 +4,7 @@ jobs:
python3:
docker:
- image: cimg/python:3.10
resource_class: "medium+" # 6gb memory so we don't get OOM killed #1855
environment:
- MINICONDA_PATH: ~/miniconda
- CONDA_ENV_NAME: testenv
6 changes: 2 additions & 4 deletions doc/Makefile
@@ -16,7 +16,6 @@ endif
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)\
$(EXAMPLES_PATTERN_OPTS) .


# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -29,16 +28,15 @@ html:
# the embedding of images more robust
rm -rf $(BUILDDIR)/html/_images
#rm -rf _build/doctrees/
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
SKB_TABLE_REPORT_VERBOSITY=0 $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

html-noplot:
$(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
SKB_TABLE_REPORT_VERBOSITY=0 $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."


linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
3 changes: 0 additions & 3 deletions doc/api_reference.py
@@ -306,9 +306,6 @@
"datasets.fetch_drug_directory",
"datasets.fetch_employee_salaries",
"datasets.fetch_flight_delays",
"datasets.fetch_ken_embeddings",
Member Author comment: these lines should have been removed in #1567
"datasets.fetch_ken_table_aliases",
"datasets.fetch_ken_types",
"datasets.fetch_medical_charge",
"datasets.fetch_midwest_survey",
"datasets.fetch_movielens",
41 changes: 18 additions & 23 deletions doc/conf.py
@@ -64,7 +64,6 @@
# builtin
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.doctest",
Member Author comment: doctests are already being run as part of the test matrix
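As a hedged aside (not something this PR adds): since the guide doctests now lean on "..." placeholders, a quick local spot-check of a single page can use the stdlib doctest module with the ELLIPSIS flag. The target path below is only an illustrative choice.

import doctest

# Run the doctests of one user-guide page; the path is an illustrative target.
results = doctest.testfile(
    "doc/modules/default_wrangling/cleaning_dataframes.rst",
    module_relative=False,
    optionflags=doctest.ELLIPSIS,  # the pages rely on ... placeholders for dtypes
)
print(results)  # TestResults namedtuple with failed/attempted counts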
"sphinx.ext.intersphinx",
"sphinx.ext.mathjax",
"sphinx.ext.githubpages",
@@ -396,24 +395,15 @@ def notebook_modification_function(notebook_content, notebook_filename):
"</div>",
]
)
# TODO: remove this when we remove KEN embeddings
Member Author comment: this should also have been removed in #1567
if "06_ken_embeddings_example" in notebook_filename:
message_class = "danger"
message = (
"This example requires PyArrow, which is currently unavailable in Pyodide"
" (see https://github.com/pyodide/pyodide/issues/2933). Thus, this example"
" cannot be run in JupyterLite."
)
else:
message_class = "warning"
message = (
"Running the skrub examples in JupyterLite is experimental and you may"
"encounter some unexpected behavior.\n\n"
"The main difference is that imports will take a lot longer than usual, "
"for example the first `import skrub` can take roughly 10-20s.\n\n"
"If you notice problems, feel free to open an "
"[issue](https://github.com/skrub-data/skrub/issues/new/choose) about it."
)
message_class = "warning"
message = (
"Running the skrub examples in JupyterLite is experimental and you may"
"encounter some unexpected behavior.\n\n"
"The main difference is that imports will take a lot longer than usual, "
"for example the first `import skrub` can take roughly 10-20s.\n\n"
"If you notice problems, feel free to open an "
"[issue](https://github.com/skrub-data/skrub/issues/new/choose) about it."
)

markdown = warning_template.format(message_class=message_class, message=message)

@@ -467,6 +457,13 @@ def reset_skrub_config(gallery_conf, fname):
skrub.set_config(**default_global_config)


def call_garbage_collector(gallery_conf, fname):
Member Author comment: this should help reduce the memory pressure a bit
"""Call the garbage collector to free memory after each example."""
import gc

gc.collect()


sphinx_gallery_conf = {
"doc_module": "skrub",
"backreferences_dir": os.path.join("reference/generated"),
@@ -490,7 +487,8 @@ def reset_skrub_config(gallery_conf, fname):
"use_jupyter_lab": True,
},
"default_thumb_file": "./_static/skrub.svg",
"reset_modules": (reset_skrub_config,),
"reset_modules": (reset_skrub_config, call_garbage_collector),
"show_memory": True,
}
if with_jupyterlite:
sphinx_gallery_conf["jupyterlite"] = {
@@ -555,9 +553,6 @@ def reset_skrub_config(gallery_conf, fname):
"pandas.melt": "pandas.melt",
"pandas.merge": "pandas.merge",
# Skrub
"fetch_ken_table_aliases": "skrub.datasets.fetch_ken_table_aliases",
Member Author comment: again #1567
"fetch_ken_types": "skrub.datasets.fetch_ken_types",
"fetch_ken_embeddings": "skrub.datasets.fetch_ken_embeddings",
"fuzzy_join": "skrub.fuzzy_join",
"Joiner": "skrub.Joiner",
"AggJoiner": "skrub.AggJoiner",
@@ -35,9 +35,9 @@ Skrub provides helpers to parse datetime string columns automatically:
>>> s = pd.Series(["2024-05-05T13:17:52", None, "2024-05-07T13:17:52"], name="when")
>>> s
0 2024-05-05T13:17:52
1 None
1 ...
Member Author comment: pandas 3.0 has NaN here
2 2024-05-07T13:17:52
Name: when, dtype: object
Name: when, dtype: ...

>>> from skrub import ToDatetime

@@ -246,15 +246,15 @@ to reduce redundancy:
>>> encoder.fit_transform(login).columns
Index(['login_year', 'login_month', 'login_day', 'login_hour',
'login_total_seconds'],
dtype='object')
dtype=...)
>>> from sklearn.pipeline import make_pipeline
>>> encoder = make_pipeline(ToDatetime(), DatetimeEncoder(periodic_encoding="circular"))
>>> encoder.fit_transform(login).columns
Index(['login_year', 'login_total_seconds', 'login_month_circular_0',
'login_month_circular_1', 'login_day_circular_0',
'login_day_circular_1', 'login_hour_circular_0',
'login_hour_circular_1'],
dtype='object')
dtype=...)


The |DatetimeEncoder| uses hardcoded values for generating periodic features.
@@ -39,7 +39,7 @@ white white
white white
white white
white white
dtype: object
dtype: ...

>>> deduplicated = list(deduplicate_correspondence)
>>> deduplicated # doctest: +SKIP
2 changes: 1 addition & 1 deletion doc/modules/data_ops/basics/control_flow.rst
@@ -42,7 +42,7 @@ see a result for that value:
<GetAttr 'columns'>
Result:
―――――――
Index(['item', 'price', 'qty'], dtype='object')
Index(['item', 'price', 'qty'], dtype=...)

The "result" we see is an *example* result that the computation produces for the
data we provided. But we want to fit our pipeline and apply it to different
4 changes: 2 additions & 2 deletions doc/modules/data_ops/basics/direct_access_methods.rst
@@ -39,7 +39,7 @@ We can access its attributes:
<GetAttr 'columns'>
Result:
―――――――
Index(['item', 'price', 'qty'], dtype='object')
Index(['item', 'price', 'qty'], dtype=...)

Accessing items, indexing, slicing:

@@ -50,7 +50,7 @@ Result:
1 cup
2 pen
3 fork
Name: item, dtype: object
Name: item, dtype: ...

We can apply operators:

8 changes: 4 additions & 4 deletions doc/modules/default_wrangling/cleaning_dataframes.rst
@@ -64,8 +64,8 @@ The |Cleaner| is a scikit-learn compatible transformer:
2 3 2024-05-07
>>> df_clean.dtypes
id int64
date datetime64[ns]
dtype: object
date datetime64[...]
dtype: ...

Note that the ``"all_missing"`` column has been dropped, and that the ``"date"``
column has been correctly parsed as a datetime column.
@@ -89,11 +89,11 @@ the ``numeric_dtype`` parameter:
... })
>>> df.dtypes
id int64
dtype: object
dtype: ...
>>> df_cleaned = cleaner.fit_transform(df)
>>> df_cleaned.dtypes
id float32
dtype: object
dtype: ...

Setting the dtype to ``float32`` reduces RAM footprint for most use cases and
ensures that all missing values have the same representation. This also ensures
37 changes: 9 additions & 28 deletions doc/modules/multi_column_operations/multi_column_operations.rst
@@ -15,29 +15,7 @@ to be scaled at the same time.
While the heuristics used by the :class:`TableVectorizer` are usually good enough
to apply the proper transformers to different datatypes, using it may not be an
option in all cases. In scikit-learn pipelines, the column selection operation can
is done with the :class:`sklearn.compose.ColumnTransformer`:
Member Author comment: this code block can't run without failing because pandas 3.0 raises an exception on the dtype_include=object lines, asking to use str instead, while the minimum-requirements pandas won't accept str as a parameter and just fails.
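A minimal sketch of the incompatibility described above (hypothetical illustration, not code from this PR; the exact exception raised by newer pandas is not pinned down here):

import pandas as pd
from sklearn.compose import make_column_selector as selector

df = pd.DataFrame({"text": ["foo", "bar", "baz"], "number": [1, 2, 3]})

# Older pandas only accepts dtype_include=object, while pandas 3.0 rejects it and
# asks for str instead, so no single spelling covers the supported version range.
try:
    categorical_columns = selector(dtype_include=object)(df)
except Exception:  # exact exception type on pandas >= 3.0 not verified here
    categorical_columns = selector(dtype_include=str)(df)
print(categorical_columns)  # ['text']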
>>> import pandas as pd
>>> from sklearn.compose import make_column_selector as selector
>>> from sklearn.compose import make_column_transformer
>>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
>>>
>>> df = pd.DataFrame({"text": ["foo", "bar", "baz"], "number": [1, 2, 3]})
>>>
>>> categorical_columns = selector(dtype_include=object)(df)
>>> numerical_columns = selector(dtype_exclude=object)(df)
>>>
>>> ct = make_column_transformer(
... (StandardScaler(),
... numerical_columns),
... (OneHotEncoder(handle_unknown="ignore"),
... categorical_columns))
>>> transformed = ct.fit_transform(df)
>>> transformed
array([[-1.22474487, 0. , 0. , 1. ],
[ 0. , 1. , 0. , 0. ],
[ 1.22474487, 0. , 1. , 0. ]])
is done with the :class:`sklearn.compose.ColumnTransformer`.

Skrub provides alternative transformers that can achieve the same results:

@@ -69,6 +47,9 @@ example above, which can be rewritten with |ApplyToCols| as follows:
>>> import skrub.selectors as s
>>> from sklearn.pipeline import make_pipeline
>>> from skrub import ApplyToCols
>>> from sklearn.preprocessing import OneHotEncoder, StandardScaler
>>> import pandas as pd
>>> df = pd.DataFrame({"text": ["foo", "bar", "baz"], "number": [1, 2, 3]})
>>>
>>> numeric = ApplyToCols(StandardScaler(), cols=s.numeric())
>>> string = ApplyToCols(OneHotEncoder(sparse_output=False), cols=s.string())
@@ -89,9 +70,9 @@ column:
birthday city
0 29/01/2024 London
>>> df.dtypes
birthday object
city object
dtype: object
birthday ...
city ...
dtype: ...
>>> ToDatetime().fit_transform(df["birthday"])
0 2024-01-29
Name: birthday, dtype: datetime64[...]
@@ -129,8 +110,8 @@ datetime column.

>>> transformed.dtypes
birthday datetime64[...]
city object
dtype: object
city ...
dtype: ...


|ApplyToFrame| is instead used in cases where multiple columns should be transformed
5 changes: 3 additions & 2 deletions examples/data_ops/1120_multiple_tables.py
@@ -84,11 +84,13 @@
# ------------------------------------
#
# We start by creating skrub variables, which are the inputs to our plan.
# Notice that we load only a fraction of the baskets table to speed up
# execution time and reduce memory usage for this example
# In our example, we create two skrub |var| objects: ``products`` and ``baskets``:

# %%
products = skrub.var("products", dataset.products)
baskets = skrub.var("baskets", dataset.baskets)
baskets = skrub.var("baskets", dataset.baskets.sample(frac=0.1, random_state=42))

basket_ids = baskets[["ID"]].skb.mark_as_X()
fraud_flags = baskets["fraud_flag"].skb.mark_as_y()
@@ -103,7 +105,6 @@
# For instance, we filter products to keep only those that match one of the
# baskets in the ``baskets`` table, and then add a column containing the total
# amount for each kind of product in a basket:

# %%
kept_products = products[products["basket_ID"].isin(basket_ids["ID"])]
products_with_total = kept_products.assign(