Skip to content

Commit

Permalink
Implemented GroupBy.median() (#1957)
Browse files Browse the repository at this point in the history
This PR proposes `GroupBy.median()`.

Note: the result can differ slightly from pandas. Koalas returns an approximated median based on approximate percentile computation, because computing an exact median across a large dataset is extremely expensive.

```python
>>> kdf = ks.DataFrame({'a': [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],
...                     'b': [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],
...                     'c': [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},
...                    columns=['a', 'b', 'c'],
...                    index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])
>>> kdf
      a     b    c
7   1.0   2.0  3.0
2   1.0   3.0  5.0
4   1.0   1.0  2.0
1   1.0   4.0  5.0
3   2.0   6.0  1.0
4   2.0   9.0  2.0
9   2.0   8.0  6.0
10  3.0  10.0  4.0
5   3.0   7.0  3.0
6   3.0   5.0  6.0

>>> kdf.groupby('a').median().sort_index()  # doctest: +NORMALIZE_WHITESPACE
       b    c
a
1.0  2.0  3.0
2.0  8.0  2.0
3.0  7.0  4.0

>>> kdf.groupby('a')['b'].median().sort_index()
a
1.0    2.0
2.0    8.0
3.0    7.0
Name: b, dtype: float64
```

ref #1929
  • Loading branch information
itholic authored Dec 11, 2020
1 parent bb31489 commit 78b1004
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 2 deletions.
68 changes: 68 additions & 0 deletions databricks/koalas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
from databricks.koalas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
from databricks.koalas.window import RollingGroupby, ExpandingGroupby
from databricks.koalas.exceptions import DataError
from databricks.koalas.spark import functions as SF

# to keep it the same as pandas
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
Expand Down Expand Up @@ -2343,6 +2344,73 @@ def get_group(self, name) -> Union[DataFrame, Series]:

return DataFrame(internal)

def median(self, numeric_only=True, accuracy=10000) -> Union[DataFrame, Series]:
    """
    Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.

    .. note:: Unlike pandas', the median in Koalas is an approximated median based upon
        approximate percentile computation because computing median across a large dataset
        is extremely expensive.

    Parameters
    ----------
    numeric_only : bool, default True
        Include only float, int, boolean columns. False is not supported. This parameter
        is mainly for pandas compatibility.
    accuracy : int, default 10000
        Accuracy of the approximate percentile computation used to derive the median.
        It is forwarded unchanged to ``percentile_approx``; per the Spark documentation
        of that function, a larger value yields a better approximation (the relative
        error is ``1.0 / accuracy``) at the cost of memory.

    Returns
    -------
    Series or DataFrame
        Median of values within each group.

    Raises
    ------
    ValueError
        If ``accuracy`` is not an integer. ``bool`` values are rejected as well.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],
    ...                     'b': [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],
    ...                     'c': [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},
    ...                    columns=['a', 'b', 'c'],
    ...                    index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])
    >>> kdf
          a     b    c
    7   1.0   2.0  3.0
    2   1.0   3.0  5.0
    4   1.0   1.0  2.0
    1   1.0   4.0  5.0
    3   2.0   6.0  1.0
    4   2.0   9.0  2.0
    9   2.0   8.0  6.0
    10  3.0  10.0  4.0
    5   3.0   7.0  3.0
    6   3.0   5.0  6.0

    DataFrameGroupBy

    >>> kdf.groupby('a').median().sort_index()  # doctest: +NORMALIZE_WHITESPACE
           b    c
    a
    1.0  2.0  3.0
    2.0  8.0  2.0
    3.0  7.0  4.0

    SeriesGroupBy

    >>> kdf.groupby('a')['b'].median().sort_index()
    a
    1.0    2.0
    2.0    8.0
    3.0    7.0
    Name: b, dtype: float64
    """
    # bool is a subclass of int, so a plain isinstance(accuracy, int) check would let
    # True/False through; exclude it explicitly so the "must be an integer" contract holds.
    if isinstance(accuracy, bool) or not isinstance(accuracy, int):
        raise ValueError(
            "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
        )

    def stat_function(col):
        # The median is the 0.5 percentile.
        return SF.percentile_approx(col, 0.5, accuracy)

    return self._reduce_for_stat_function(stat_function, only_numeric=numeric_only)

def _reduce_for_stat_function(self, sfun, only_numeric):
agg_columns = self._agg_columns
agg_columns_scols = self._agg_columns_scols
Expand Down
2 changes: 0 additions & 2 deletions databricks/koalas/missing/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ class MissingPandasLikeDataFrameGroupBy(object):

# Functions
boxplot = _unsupported_function("boxplot")
median = _unsupported_function("median")
ngroup = _unsupported_function("ngroup")
nth = _unsupported_function("nth")
ohlc = _unsupported_function("ohlc")
Expand Down Expand Up @@ -93,7 +92,6 @@ class MissingPandasLikeSeriesGroupBy(object):
agg = _unsupported_function("agg")
aggregate = _unsupported_function("aggregate")
describe = _unsupported_function("describe")
median = _unsupported_function("median")
ngroup = _unsupported_function("ngroup")
nth = _unsupported_function("nth")
ohlc = _unsupported_function("ohlc")
Expand Down
24 changes: 24 additions & 0 deletions databricks/koalas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2609,6 +2609,30 @@ def test_get_group(self):
ValueError, lambda: kdf.groupby([("B", "class"), ("A", "name")]).get_group("mammal")
)

def test_median(self):
    """Verify GroupBy.median() on DataFrame and Series groupbys and its accuracy check."""
    data = {
        "a": [1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0],
        "b": [2.0, 3.0, 1.0, 4.0, 6.0, 9.0, 8.0, 10.0, 7.0, 5.0],
        "c": [3.0, 5.0, 2.0, 5.0, 1.0, 2.0, 6.0, 4.0, 3.0, 6.0],
    }
    kdf = ks.DataFrame(data, columns=["a", "b", "c"], index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])

    # Grouping by "a" and taking the median of every remaining column.
    expected_frame = ks.DataFrame(
        {"b": [2.0, 8.0, 7.0], "c": [3.0, 2.0, 4.0]},
        index=pd.Index([1.0, 2.0, 3.0], name="a"),
    )
    actual_frame = kdf.groupby("a").median().sort_index()
    self.assert_eq(expected_frame, actual_frame)

    # Same grouping, restricted to the single column "b".
    expected_series = ks.Series(
        [2.0, 8.0, 7.0],
        name="b",
        index=pd.Index([1.0, 2.0, 3.0], name="a"),
    )
    actual_series = kdf.groupby("a")["b"].median().sort_index()
    self.assert_eq(expected_series, actual_series)

    # A non-integer accuracy has to be rejected with a descriptive message.
    self.assertRaisesRegex(
        ValueError,
        "accuracy must be an integer; however",
        kdf.groupby("a").median,
        accuracy="a",
    )

def test_tail(self):
pdf = pd.DataFrame(
{
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Computations / Descriptive Stats
GroupBy.last
GroupBy.max
GroupBy.mean
GroupBy.median
GroupBy.min
GroupBy.rank
GroupBy.std
Expand Down

0 comments on commit 78b1004

Please sign in to comment.