# REF: Reuse async loading logic for DataTree and open_groups (Fixes #11131) (#11149)
## Changes from all commits
**`doc/whats-new.rst`**

```diff
@@ -14,6 +14,10 @@ v2026.03.0 (unreleased)

 New Features
 ~~~~~~~~~~~~

+- Added ``max_concurrency`` parameter to :py:func:`open_datatree` to control
+  the maximum number of concurrent I/O operations when opening groups in parallel
+  with the Zarr backend (:pull:`10742`).
+  By `Alfonso Ladino <https://github.com/aladinor>`_.
+
 Breaking Changes
 ~~~~~~~~~~~~~~~~

@@ -239,6 +243,9 @@ Documentation

 Performance
 ~~~~~~~~~~~

+- Improve performance of :py:func:`open_datatree` for zarr stores by using async/concurrent
+  loading of groups and indexes (:pull:`10742`).
+  By `Alfonso Ladino <https://github.com/aladinor>`_.
 - Add a fastpath to the backend plugin system for standard engines (:issue:`10178`, :pull:`10937`).
   By `Sam Levang <https://github.com/slevang>`_.
 - Optimize :py:class:`~xarray.coding.variables.CFMaskCoder` decoder (:pull:`11105`).
```

*(A review comment targets lines +246 to +248; see the end of this page.)*
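For context, this is how the new keyword is meant to be used; a minimal sketch, assuming a hypothetical nested Zarr store:

```python
import xarray as xr

# Open a nested Zarr store as a DataTree while capping how many groups
# are loaded concurrently (the store path here is hypothetical).
tree = xr.open_datatree(
    "s3://example-bucket/nested.zarr",
    engine="zarr",
    max_concurrency=4,
)
```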
**`xarray/backends/api.py`**

```diff
@@ -65,6 +65,7 @@
     NestedSequence,
     T_Chunks,
 )
+from xarray.core.variable import Variable

 T_NetcdfEngine = Literal["netcdf4", "scipy", "h5netcdf"]
 T_Engine = Union[
```
```diff
@@ -349,7 +350,37 @@ def _datatree_from_backend_datatree(
     _protect_datatree_variables_inplace(backend_tree, cache)
     if create_default_indexes:
-        tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
+        _use_zarr_async = False
+        if engine == "zarr":
+            from xarray.backends.zarr import _zarr_v3
+
+            _use_zarr_async = _zarr_v3()
+
+        if _use_zarr_async:
+            from zarr.core.sync import sync as zarr_sync
+
+            async def create_indexes_async() -> dict[str, Dataset]:
+                import asyncio
+
+                results: dict[str, Dataset] = {}
+                tasks = [
+                    _create_index_for_node(path, node.dataset)
+                    for path, [node] in group_subtrees(backend_tree)
+                ]
+                for fut in asyncio.as_completed(tasks):
+                    path, ds = await fut
+                    results[path] = ds
+                return results
+
+            async def _create_index_for_node(
+                path: str, ds: Dataset
+            ) -> tuple[str, Dataset]:
+                return path, await _maybe_create_default_indexes_async(ds)
+
+            results = zarr_sync(create_indexes_async())
+            tree = DataTree.from_dict(results, name=backend_tree.name)
+        else:
+            tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
     else:
         tree = backend_tree
     if chunks is not None:
```

*(A review comment targets lines +362 to +373.)*
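For readers unfamiliar with the fan-out pattern used in `create_indexes_async`, here is a standalone sketch of the same idea: schedule one coroutine per group and collect results as they finish. The `fetch_group` coroutine and its paths are illustrative stand-ins, not xarray APIs:

```python
import asyncio


async def fetch_group(path: str, delay: float) -> tuple[str, str]:
    # Stand-in for asynchronously loading one group's coordinate data.
    await asyncio.sleep(delay)
    return path, f"dataset for {path}"


async def load_all_groups() -> dict[str, str]:
    tasks = [fetch_group("/a", 0.2), fetch_group("/b", 0.1), fetch_group("/a/c", 0.1)]
    results: dict[str, str] = {}
    # as_completed yields awaitables in completion order, not submission
    # order, so results are keyed by path and reassembled afterwards
    # (the PR does the same with DataTree.from_dict).
    for fut in asyncio.as_completed(tasks):
        path, ds = await fut
        results[path] = ds
    return results


print(asyncio.run(load_all_groups()))
```

The PR drives its coroutine with `zarr.core.sync.sync` rather than `asyncio.run`, presumably so the index loading reuses Zarr's own event-loop management and behaves consistently with the rest of Zarr v3's async I/O.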
```diff
@@ -386,6 +417,36 @@ def _datatree_from_backend_datatree(
     return tree


+async def _maybe_create_default_indexes_async(ds: Dataset) -> Dataset:
+    """Create default indexes for dimension coordinates asynchronously.
+
+    This function parallelizes both data loading and index creation,
+    which can significantly speed up opening datasets with many coordinates.
+    """
+    import asyncio
+
+    to_index_names = [
+        name
+        for name, coord in ds.coords.items()
+        if coord.dims == (name,) and name not in ds.xindexes
+    ]
+
+    if not to_index_names:
+        return ds
+
+    async def load_var(var: Variable) -> Variable:
+        try:
+            return await var.load_async()
+        except NotImplementedError:
+            return await asyncio.to_thread(var.load)
+
+    await asyncio.gather(*[load_var(ds.variables[name]) for name in to_index_names])
+
+    variables = {name: ds.variables[name] for name in to_index_names}
+    new_coords = Coordinates(variables)
+    return ds.assign_coords(new_coords)
+
+
 def open_dataset(
     filename_or_obj: T_PathFileOrDataStore,
     *,
```
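The `load_var` helper prefers native async loading and degrades gracefully for backends that do not support it. The same defensive pattern in isolation (the `LegacyVar` class is a made-up stand-in, not an xarray type):

```python
import asyncio


class LegacyVar:
    """Stand-in for a variable whose backend lacks async loading."""

    def load(self) -> str:
        return "loaded in a worker thread"

    async def load_async(self) -> str:
        raise NotImplementedError


async def load_any(var: LegacyVar) -> str:
    try:
        # Prefer native async I/O when the backend implements it.
        return await var.load_async()
    except NotImplementedError:
        # Otherwise run the blocking load() in a worker thread so the
        # event loop stays free to service other variables.
        return await asyncio.to_thread(var.load)


print(asyncio.run(load_any(LegacyVar())))
```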
```diff
@@ -882,6 +943,7 @@ def open_datatree(
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
     backend_kwargs: dict[str, Any] | None = None,
+    max_concurrency: int | None = None,
     **kwargs,
 ) -> DataTree:
     """
```
```diff
@@ -1014,6 +1076,13 @@ def open_datatree(
         chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
         For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed
         to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    max_concurrency : int, optional
+        Maximum number of concurrent I/O operations when opening groups in
+        parallel. This limits the number of groups that are loaded simultaneously.
+        Useful for controlling resource usage with large datatrees or stores
+        that may have limitations on concurrent access (e.g., icechunk).
+        Only used by backends that support parallel loading (currently Zarr v3).
+        If None (default), the backend uses its default value (typically 10).
     backend_kwargs: dict
         Additional keyword arguments passed on to the engine open function,
         equivalent to `**kwargs`.
```
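The docstring does not pin down how the backend enforces the cap; a semaphore is the usual mechanism, sketched below under that assumption (the actual Zarr backend implementation may differ):

```python
import asyncio
from collections.abc import Awaitable, Iterable
from typing import TypeVar

T = TypeVar("T")


async def bounded_gather(
    coros: Iterable[Awaitable[T]], max_concurrency: int = 10
) -> list[T]:
    # At most `max_concurrency` coroutines hold the semaphore at once;
    # the rest wait their turn before starting their awaited work.
    sem = asyncio.Semaphore(max_concurrency)

    async def run(coro: Awaitable[T]) -> T:
        async with sem:
            return await coro

    return await asyncio.gather(*(run(c) for c in coros))
```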
```diff
@@ -1074,6 +1143,9 @@ def open_datatree(
     )
     overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)

+    if max_concurrency is not None:
+        kwargs["max_concurrency"] = max_concurrency
+
     backend_tree = backend.open_datatree(
         filename_or_obj,
         drop_variables=drop_variables,
```
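A note on the design: forwarding `max_concurrency` through `kwargs` only when the caller actually sets it means engines that do not accept the keyword never receive it, so the new parameter stays backward compatible with third-party backends.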
**Review comment** on `doc/whats-new.rst`, lines +246 to +248:

> This whats-new entry attributes the new `open_datatree(max_concurrency=...)` parameter to :pull:`10742`, but this PR appears to be the one introducing that public API. Please update the referenced pull number to the current PR so the changelog accurately reflects where the feature was added.