Skip to content

Commit a00bc91

Browse files
authored
GroupBy(chunked-array) (#9522)
* GroupBy(chunked-array) Closes #757 Closes #2852 * Optimizations * Optimize multi-index construction * Add tests * Add whats-new * Raise errors * Add docstring * preserve attrs * Add test for #757 * Typing fixes * Handle multiple groupers * Backcompat * better backcompat * fix * Handle edge case * comment * type: ignore
1 parent 29654fc commit a00bc91

File tree

10 files changed

+441
-68
lines changed

10 files changed

+441
-68
lines changed

doc/user-guide/groupby.rst

+9
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,15 @@ is identical to
294294
ds.resample(time=TimeResampler("ME"))
295295
296296
297+
The :py:class:`groupers.UniqueGrouper` accepts an optional ``labels`` kwarg that is not present
298+
in :py:meth:`DataArray.groupby` or :py:meth:`Dataset.groupby`.
299+
Specifying ``labels`` is required when grouping by a lazy array type (e.g. dask or cubed).
300+
The ``labels`` are used to construct the output coordinate (say for a reduction), and aggregations
301+
will only be run over the specified labels.
302+
You may also use ``labels`` to specify the ordering of groups to be used during iteration.
303+
The order will be preserved in the output.
304+
305+
297306
.. _groupby.multiple:
298307

299308
Grouping by multiple variables

doc/whats-new.rst

+8-9
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,21 @@ New Features
2323
~~~~~~~~~~~~
2424
- Added :py:meth:`DataTree.persist` method (:issue:`9675`, :pull:`9682`).
2525
By `Sam Levang <https://github.com/slevang>`_.
26+
- Support lazy grouping by dask arrays, and allow specifying ordered groups with ``UniqueGrouper(labels=["a", "b", "c"])``
27+
(:issue:`2852`, :issue:`757`).
28+
By `Deepak Cherian <https://github.com/dcherian>`_.
2629

2730
Breaking changes
2831
~~~~~~~~~~~~~~~~
2932

3033

3134
Deprecations
3235
~~~~~~~~~~~~
33-
36+
- Grouping by a chunked array (e.g. dask or cubed) currently eagerly loads that variable into
37+
memory. This behaviour is deprecated. If eager loading was intended, please load such arrays
38+
manually using ``.load()`` or ``.compute()``. Else pass ``eagerly_compute_group=False``, and
39+
provide expected group labels using the ``labels`` kwarg to a grouper object such as
40+
:py:class:`groupers.UniqueGrouper` or :py:class:`groupers.BinGrouper`.
3441

3542
Bug fixes
3643
~~~~~~~~~
@@ -94,14 +101,6 @@ New Features
94101
(:issue:`9427`, :pull:`9428`).
95102
By `Alfonso Ladino <https://github.com/aladinor>`_.
96103

97-
Breaking changes
98-
~~~~~~~~~~~~~~~~
99-
100-
101-
Deprecations
102-
~~~~~~~~~~~~
103-
104-
105104
Bug fixes
106105
~~~~~~~~~
107106

xarray/core/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1094,7 +1094,7 @@ def _resample(
10941094
f"Received {type(freq)} instead."
10951095
)
10961096

1097-
rgrouper = ResolvedGrouper(grouper, group, self)
1097+
rgrouper = ResolvedGrouper(grouper, group, self, eagerly_compute_group=False)
10981098

10991099
return resample_cls(
11001100
self,

xarray/core/dataarray.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -6748,6 +6748,7 @@ def groupby(
67486748
*,
67496749
squeeze: Literal[False] = False,
67506750
restore_coord_dims: bool = False,
6751+
eagerly_compute_group: bool = True,
67516752
**groupers: Grouper,
67526753
) -> DataArrayGroupBy:
67536754
"""Returns a DataArrayGroupBy object for performing grouped operations.
@@ -6763,6 +6764,11 @@ def groupby(
67636764
restore_coord_dims : bool, default: False
67646765
If True, also restore the dimension order of multi-dimensional
67656766
coordinates.
6767+
eagerly_compute_group : bool, default: True
6768+
Whether to eagerly compute ``group`` when it is a chunked array.
6769+
This option is to maintain backwards compatibility. Set to False
6770+
to opt-in to future behaviour, where ``group`` is not automatically loaded
6771+
into memory.
67666772
**groupers : Mapping of str to Grouper or Resampler
67676773
Mapping of variable name to group by to :py:class:`Grouper` or :py:class:`Resampler` object.
67686774
One of ``group`` or ``groupers`` must be provided.
@@ -6877,7 +6883,9 @@ def groupby(
68776883
)
68786884

68796885
_validate_groupby_squeeze(squeeze)
6880-
rgroupers = _parse_group_and_groupers(self, group, groupers)
6886+
rgroupers = _parse_group_and_groupers(
6887+
self, group, groupers, eagerly_compute_group=eagerly_compute_group
6888+
)
68816889
return DataArrayGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims)
68826890

68836891
@_deprecate_positional_args("v2024.07.0")
@@ -6892,6 +6900,7 @@ def groupby_bins(
68926900
squeeze: Literal[False] = False,
68936901
restore_coord_dims: bool = False,
68946902
duplicates: Literal["raise", "drop"] = "raise",
6903+
eagerly_compute_group: bool = True,
68956904
) -> DataArrayGroupBy:
68966905
"""Returns a DataArrayGroupBy object for performing grouped operations.
68976906
@@ -6928,6 +6937,11 @@ def groupby_bins(
69286937
coordinates.
69296938
duplicates : {"raise", "drop"}, default: "raise"
69306939
If bin edges are not unique, raise ValueError or drop non-uniques.
6940+
eagerly_compute_group : bool, default: True
6941+
Whether to eagerly compute ``group`` when it is a chunked array.
6942+
This option is to maintain backwards compatibility. Set to False
6943+
to opt-in to future behaviour, where ``group`` is not automatically loaded
6944+
into memory.
69316945
69326946
Returns
69336947
-------
@@ -6965,7 +6979,9 @@ def groupby_bins(
69656979
precision=precision,
69666980
include_lowest=include_lowest,
69676981
)
6968-
rgrouper = ResolvedGrouper(grouper, group, self)
6982+
rgrouper = ResolvedGrouper(
6983+
grouper, group, self, eagerly_compute_group=eagerly_compute_group
6984+
)
69696985

69706986
return DataArrayGroupBy(
69716987
self,

xarray/core/dataset.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -10379,6 +10379,7 @@ def groupby(
1037910379
*,
1038010380
squeeze: Literal[False] = False,
1038110381
restore_coord_dims: bool = False,
10382+
eagerly_compute_group: bool = True,
1038210383
**groupers: Grouper,
1038310384
) -> DatasetGroupBy:
1038410385
"""Returns a DatasetGroupBy object for performing grouped operations.
@@ -10394,6 +10395,11 @@ def groupby(
1039410395
restore_coord_dims : bool, default: False
1039510396
If True, also restore the dimension order of multi-dimensional
1039610397
coordinates.
10398+
eagerly_compute_group : bool, default: True
10399+
Whether to eagerly compute ``group`` when it is a chunked array.
10400+
This option is to maintain backwards compatibility. Set to False
10401+
to opt-in to future behaviour, where ``group`` is not automatically loaded
10402+
into memory.
1039710403
**groupers : Mapping of str to Grouper or Resampler
1039810404
Mapping of variable name to group by to :py:class:`Grouper` or :py:class:`Resampler` object.
1039910405
One of ``group`` or ``groupers`` must be provided.
@@ -10476,7 +10482,9 @@ def groupby(
1047610482
)
1047710483

1047810484
_validate_groupby_squeeze(squeeze)
10479-
rgroupers = _parse_group_and_groupers(self, group, groupers)
10485+
rgroupers = _parse_group_and_groupers(
10486+
self, group, groupers, eagerly_compute_group=eagerly_compute_group
10487+
)
1048010488

1048110489
return DatasetGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims)
1048210490

@@ -10492,6 +10500,7 @@ def groupby_bins(
1049210500
squeeze: Literal[False] = False,
1049310501
restore_coord_dims: bool = False,
1049410502
duplicates: Literal["raise", "drop"] = "raise",
10503+
eagerly_compute_group: bool = True,
1049510504
) -> DatasetGroupBy:
1049610505
"""Returns a DatasetGroupBy object for performing grouped operations.
1049710506
@@ -10528,6 +10537,11 @@ def groupby_bins(
1052810537
coordinates.
1052910538
duplicates : {"raise", "drop"}, default: "raise"
1053010539
If bin edges are not unique, raise ValueError or drop non-uniques.
10540+
eagerly_compute_group : bool, default: True
10541+
Whether to eagerly compute ``group`` when it is a chunked array.
10542+
This option is to maintain backwards compatibility. Set to False
10543+
to opt-in to future behaviour, where ``group`` is not automatically loaded
10544+
into memory.
1053110545
1053210546
Returns
1053310547
-------
@@ -10565,7 +10579,9 @@ def groupby_bins(
1056510579
precision=precision,
1056610580
include_lowest=include_lowest,
1056710581
)
10568-
rgrouper = ResolvedGrouper(grouper, group, self)
10582+
rgrouper = ResolvedGrouper(
10583+
grouper, group, self, eagerly_compute_group=eagerly_compute_group
10584+
)
1056910585

1057010586
return DatasetGroupBy(
1057110587
self,

0 commit comments

Comments
 (0)