Skip to content

Commit

Permalink
Implement DataFrame.first and Series.first functionality (#2128)
Browse files Browse the repository at this point in the history
This change implements `DataFrame.first` and `Series.first`, mirroring the functionality available in pandas. The requirement was raised in issue #1929.

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
>>> ks_series
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
dtype: int64

>>> ks_series.first('3D')
2018-04-09  1
2018-04-11  2
dtype: int64
```
  • Loading branch information
awdavidson authored Mar 31, 2021
1 parent 07c4e36 commit 0565e14
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 18 deletions.
67 changes: 58 additions & 9 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3077,14 +3077,12 @@ def between_time(
2018-04-09 00:00:00 1
2018-04-12 01:00:00 4
"""
from databricks.koalas.indexes import DatetimeIndex

axis = validate_axis(axis)

if axis != 0:
raise NotImplementedError("between_time currently only works for axis=0")

if not isinstance(self.index, DatetimeIndex):
if not isinstance(self.index, ks.DatetimeIndex):
raise TypeError("Index must be DatetimeIndex")

kdf = self.copy()
Expand Down Expand Up @@ -3150,8 +3148,6 @@ def at_time(
2018-04-09 12:00:00 2
2018-04-10 12:00:00 4
"""
from databricks.koalas.indexes import DatetimeIndex

if asof:
raise NotImplementedError("'asof' argument is not supported")

Expand All @@ -3160,7 +3156,7 @@ def at_time(
if axis != 0:
raise NotImplementedError("at_time currently only works for axis=0")

if not isinstance(self.index, DatetimeIndex):
if not isinstance(self.index, ks.DatetimeIndex):
raise TypeError("Index must be DatetimeIndex")

kdf = self.copy()
Expand Down Expand Up @@ -5801,16 +5797,69 @@ def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
not returned.
"""
# Check index type should be format DateTime
from databricks.koalas.indexes import DatetimeIndex

if not isinstance(self.index, DatetimeIndex):
if not isinstance(self.index, ks.DatetimeIndex):
raise TypeError("'last' only supports a DatetimeIndex")

offset = to_offset(offset)
from_date = self.index.max() - offset

return cast(DataFrame, self.loc[from_date:])

def first(self, offset: Union[str, DateOffset]) -> "DataFrame":
    """
    Select first periods of time series data based on a date offset.

    When having a DataFrame with dates as index, this function can
    select the first few rows based on a date offset.

    Parameters
    ----------
    offset : str or DateOffset
        The offset length of the data that will be selected. For instance,
        '3D' will display all the rows having their index within the first 3 days.

    Returns
    -------
    DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    Examples
    --------

    >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
    >>> kdf = ks.DataFrame({'A': [1, 2, 3, 4]}, index=index)
    >>> kdf
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4

    Get the rows for the first 3 days:

    >>> kdf.first('3D')
                A
    2018-04-09  1
    2018-04-11  2

    Notice the data for the first 3 calendar days was returned, not the first
    3 observed days in the dataset, and therefore data for 2018-04-13 was
    not returned.
    """
    # A date offset can only be applied to timestamps, so any other index
    # type is rejected up front.
    if not isinstance(self.index, ks.DatetimeIndex):
        raise TypeError("'first' only supports a DatetimeIndex")

    # Normalize the argument (e.g. the string '3D') so it can be added to the
    # earliest timestamp in the index.
    offset = to_offset(offset)
    to_date = self.index.min() + offset

    # NOTE(review): `.loc[:to_date]` looks end-inclusive, so a row stamped
    # exactly at `min + offset` would be kept; pandas' `first` drops that
    # boundary row for fixed-frequency offsets such as '2D' -- confirm whether
    # parity is intended.
    return cast(DataFrame, self.loc[:to_date])

def pivot_table(
self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None
) -> "DataFrame":
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ class _MissingPandasLikeDataFrame(object):
corrwith = _unsupported_function("corrwith")
cov = _unsupported_function("cov")
ewm = _unsupported_function("ewm")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
lookup = _unsupported_function("lookup")
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class MissingPandasLikeSeries(object):
convert_dtypes = _unsupported_function("convert_dtypes")
cov = _unsupported_function("cov")
ewm = _unsupported_function("ewm")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
reorder_levels = _unsupported_function("reorder_levels")
Expand Down
53 changes: 50 additions & 3 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2246,8 +2246,8 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
Examples
--------
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
>>> ks_series
>>> kser = ks.Series([1, 2, 3, 4], index=index)
>>> kser
2018-04-09 1
2018-04-11 2
2018-04-13 3
Expand All @@ -2256,7 +2256,7 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
Get the rows for the last 3 days:
>>> ks_series.last('3D')
>>> kser.last('3D')
2018-04-13 3
2018-04-15 4
dtype: int64
Expand All @@ -2267,6 +2267,53 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
"""
return first_series(self.to_frame().last(offset)).rename(self.name)

def first(self, offset: Union[str, DateOffset]) -> "Series":
    """
    Select first periods of time series data based on a date offset.

    For a Series indexed by dates, return only the leading elements whose
    index falls inside the window described by the date offset.

    Parameters
    ----------
    offset : str or DateOffset
        The offset length of the data that will be selected. For instance,
        '3D' will display all the rows having their index within the first 3 days.

    Returns
    -------
    Series
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    Examples
    --------
    >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
    >>> kser = ks.Series([1, 2, 3, 4], index=index)
    >>> kser
    2018-04-09    1
    2018-04-11    2
    2018-04-13    3
    2018-04-15    4
    dtype: int64

    Get the rows for the first 3 days:

    >>> kser.first('3D')
    2018-04-09    1
    2018-04-11    2
    dtype: int64

    Notice the data for 3 first calendar days were returned, not the first
    3 observed days in the dataset, and therefore data for 2018-04-13 was
    not returned.
    """
    # Delegate the selection to the frame-level `first`, then turn the
    # single-column result back into a Series carrying the original name.
    as_frame = self.to_frame()
    leading_rows = as_frame.first(offset)
    return first_series(leading_rows).rename(self.name)

# TODO: Categorical type isn't supported (due to PySpark's limitation) and
# some doctests related with timestamps were not added.
def unique(self) -> "Series":
Expand Down
9 changes: 9 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5212,6 +5212,15 @@ def test_last(self):
with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
ks.DataFrame([1, 2, 3, 4]).last("1D")

def test_first(self):
    # Koalas must agree with pandas for both the string and the DateOffset
    # spelling of the same one-day window.
    dates = pd.date_range("2018-04-09", periods=4, freq="2D")
    pandas_frame = pd.DataFrame([1, 2, 3, 4], index=dates)
    koalas_frame = ks.from_pandas(pandas_frame)

    for window in ("1D", DateOffset(days=1)):
        self.assert_eq(pandas_frame.first(window), koalas_frame.first(window))

    # A frame built without an explicit index has no DatetimeIndex and must be rejected.
    with self.assertRaisesRegex(TypeError, "'first' only supports a DatetimeIndex"):
        ks.DataFrame([1, 2, 3, 4]).first("1D")

def test_first_valid_index(self):
pdf = pd.DataFrame(
{"a": [None, 2, 3, 2], "b": [None, 2.0, 3.0, 1.0], "c": [None, 200, 400, 200]},
Expand Down
18 changes: 14 additions & 4 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,22 @@ def test_head(self):
self.assert_eq(kser.head(-10), pser.head(-10))

def test_last(self):
# Checks that Series.last on a date-indexed Koalas Series matches pandas, and that
# calling it on self.kser raises TypeError (presumably because that fixture has no
# DatetimeIndex -- confirm against the test class).
# NOTE(review): this span looks like a rendered diff with removed and added lines
# interleaved (the pd_input/ks_input lines vs. the pser/kser lines) -- confirm
# against the real test file before relying on it.
index = pd.date_range("2018-04-09", periods=4, freq="2D")
pd_input = pd.Series([1, 2, 3, 4], index=index)
ks_input = ks.Series([1, 2, 3, 4], index=index)
with self.assertRaises(TypeError):
self.kser.last("1D")
self.assert_eq(ks_input.last("1D"), pd_input.last("1D"))

# Same parity check, with the Koalas Series derived from the pandas one via from_pandas.
index = pd.date_range("2018-04-09", periods=4, freq="2D")
pser = pd.Series([1, 2, 3, 4], index=index)
kser = ks.from_pandas(pser)
self.assert_eq(kser.last("1D"), pser.last("1D"))

def test_first(self):
    # The shared fixture is expected to be rejected with a TypeError.
    with self.assertRaises(TypeError):
        self.kser.first("1D")

    # On a date-indexed Series, Koalas must return what pandas returns.
    dates = pd.date_range("2018-04-09", periods=4, freq="2D")
    pandas_series = pd.Series([1, 2, 3, 4], index=dates)
    koalas_series = ks.from_pandas(pandas_series)

    actual = koalas_series.first("1D")
    expected = pandas_series.first("1D")
    self.assert_eq(actual, expected)

def test_rename(self):
pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
DataFrame.duplicated
DataFrame.equals
DataFrame.filter
DataFrame.first
DataFrame.head
DataFrame.last
DataFrame.rename
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ Reindexing / Selection / Label manipulation
Series.equals
Series.add_prefix
Series.add_suffix
Series.first
Series.head
Series.idxmax
Series.idxmin
Expand Down

0 comments on commit 0565e14

Please sign in to comment.