Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DataFrame.first and Series.first functionality #2128

Merged
merged 5 commits into from
Mar 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 58 additions & 9 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3077,14 +3077,12 @@ def between_time(
2018-04-09 00:00:00 1
2018-04-12 01:00:00 4
"""
from databricks.koalas.indexes import DatetimeIndex

axis = validate_axis(axis)

if axis != 0:
raise NotImplementedError("between_time currently only works for axis=0")

if not isinstance(self.index, DatetimeIndex):
if not isinstance(self.index, ks.DatetimeIndex):
ueshin marked this conversation as resolved.
Show resolved Hide resolved
raise TypeError("Index must be DatetimeIndex")

kdf = self.copy()
Expand Down Expand Up @@ -3150,8 +3148,6 @@ def at_time(
2018-04-09 12:00:00 2
2018-04-10 12:00:00 4
"""
from databricks.koalas.indexes import DatetimeIndex

if asof:
raise NotImplementedError("'asof' argument is not supported")

Expand All @@ -3160,7 +3156,7 @@ def at_time(
if axis != 0:
raise NotImplementedError("at_time currently only works for axis=0")

if not isinstance(self.index, DatetimeIndex):
if not isinstance(self.index, ks.DatetimeIndex):
raise TypeError("Index must be DatetimeIndex")

kdf = self.copy()
Expand Down Expand Up @@ -5801,16 +5797,69 @@ def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
not returned.
"""
# Check index type should be format DateTime
from databricks.koalas.indexes import DatetimeIndex

if not isinstance(self.index, DatetimeIndex):
if not isinstance(self.index, ks.DatetimeIndex):
raise TypeError("'last' only supports a DatetimeIndex")

offset = to_offset(offset)
from_date = self.index.max() - offset

return cast(DataFrame, self.loc[from_date:])

def first(self, offset: Union[str, DateOffset]) -> "DataFrame":
    """
    Select first periods of time series data based on a date offset.

    When having a DataFrame with dates as index, this function can
    select the first few rows based on a date offset.

    Parameters
    ----------
    offset : str or DateOffset
        The offset length of the data that will be selected. For instance,
        '3D' will display all the rows having their index within the first 3 days.

    Returns
    -------
    DataFrame
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    Examples
    --------

    >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
    >>> kdf = ks.DataFrame({'A': [1, 2, 3, 4]}, index=index)
    >>> kdf
                A
    2018-04-09  1
    2018-04-11  2
    2018-04-13  3
    2018-04-15  4

    Get the rows for the first 3 days:

    >>> kdf.first('3D')
                A
    2018-04-09  1
    2018-04-11  2

    Notice that the data for the first 3 calendar days were returned, not the
    first 3 observed days in the dataset, and therefore data for 2018-04-13 was
    not returned.
    """
    # A date offset can only be applied to a datetime-like index.
    if not isinstance(self.index, ks.DatetimeIndex):
        raise TypeError("'first' only supports a DatetimeIndex")

    # The selection window ends `offset` after the earliest timestamp in the index.
    offset = to_offset(offset)
    to_date = self.index.min() + offset

    # NOTE(review): pandas excludes the end bound for Tick offsets when it falls
    # exactly on an index label; label slicing via `.loc` looks inclusive here.
    # Confirm parity for e.g. first('2D') on a 2-day-frequency index.
    return cast(DataFrame, self.loc[:to_date])

def pivot_table(
self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None
) -> "DataFrame":
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ class _MissingPandasLikeDataFrame(object):
corrwith = _unsupported_function("corrwith")
cov = _unsupported_function("cov")
ewm = _unsupported_function("ewm")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
lookup = _unsupported_function("lookup")
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ class MissingPandasLikeSeries(object):
convert_dtypes = _unsupported_function("convert_dtypes")
cov = _unsupported_function("cov")
ewm = _unsupported_function("ewm")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
reorder_levels = _unsupported_function("reorder_levels")
Expand Down
53 changes: 50 additions & 3 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2245,8 +2245,8 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
Examples
--------
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
>>> ks_series
>>> kser = ks.Series([1, 2, 3, 4], index=index)
>>> kser
2018-04-09 1
2018-04-11 2
2018-04-13 3
Expand All @@ -2255,7 +2255,7 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":

Get the rows for the last 3 days:

>>> ks_series.last('3D')
>>> kser.last('3D')
2018-04-13 3
2018-04-15 4
dtype: int64
Expand All @@ -2266,6 +2266,53 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
"""
return first_series(self.to_frame().last(offset)).rename(self.name)

def first(self, offset: Union[str, DateOffset]) -> "Series":
    """
    Select first periods of time series data based on a date offset.

    When having a Series with dates as index, this function can
    select the first few elements based on a date offset.

    Parameters
    ----------
    offset : str or DateOffset
        The offset length of the data that will be selected. For instance,
        '3D' will display all the rows having their index within the first 3 days.

    Returns
    -------
    Series
        A subset of the caller.

    Raises
    ------
    TypeError
        If the index is not a :class:`DatetimeIndex`

    Examples
    --------
    >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
    >>> kser = ks.Series([1, 2, 3, 4], index=index)
    >>> kser
    2018-04-09    1
    2018-04-11    2
    2018-04-13    3
    2018-04-15    4
    dtype: int64

    Get the rows for the first 3 days:

    >>> kser.first('3D')
    2018-04-09    1
    2018-04-11    2
    dtype: int64

    Notice the data for 3 first calendar days were returned, not the first
    3 observed days in the dataset, and therefore data for 2018-04-13 was
    not returned.
    """
    # Reuse the frame-level implementation on the frame built from this Series.
    selected_frame = self.to_frame().first(offset)

    # Convert the result back to a Series and restore the caller's name.
    selected = first_series(selected_frame)
    return selected.rename(self.name)

# TODO: Categorical type isn't supported (due to PySpark's limitation) and
# some doctests related with timestamps were not added.
def unique(self) -> "Series":
Expand Down
9 changes: 9 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5212,6 +5212,15 @@ def test_last(self):
with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
ks.DataFrame([1, 2, 3, 4]).last("1D")

def test_first(self):
    """Check DataFrame.first against pandas and its non-datetime-index error."""
    dates = pd.date_range("2018-04-09", periods=4, freq="2D")
    pdf = pd.DataFrame([1, 2, 3, 4], index=dates)
    kdf = ks.from_pandas(pdf)

    # The same one-day window, given as an offset alias and as a DateOffset.
    for one_day in ("1D", DateOffset(days=1)):
        self.assert_eq(pdf.first(one_day), kdf.first(one_day))

    # A frame with the default (non-datetime) index must be rejected.
    with self.assertRaisesRegex(TypeError, "'first' only supports a DatetimeIndex"):
        ks.DataFrame([1, 2, 3, 4]).first("1D")

def test_first_valid_index(self):
pdf = pd.DataFrame(
{"a": [None, 2, 3, 2], "b": [None, 2.0, 3.0, 1.0], "c": [None, 200, 400, 200]},
Expand Down
18 changes: 14 additions & 4 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,22 @@ def test_head(self):
self.assert_eq(kser.head(-10), pser.head(-10))

def test_last(self):
    """Check Series.last against pandas and its non-datetime-index error."""
    # NOTE(review): assumes the shared `self.kser` fixture has no DatetimeIndex.
    with self.assertRaises(TypeError):
        self.kser.last("1D")

    # One fixture, built the same way as in the sibling `test_first`
    # (pandas Series -> ks.from_pandas), instead of two duplicated setups.
    index = pd.date_range("2018-04-09", periods=4, freq="2D")
    pser = pd.Series([1, 2, 3, 4], index=index)
    kser = ks.from_pandas(pser)
    self.assert_eq(kser.last("1D"), pser.last("1D"))

def test_first(self):
    """Check Series.first against pandas and its non-datetime-index error."""
    # NOTE(review): assumes the shared `self.kser` fixture has no DatetimeIndex.
    with self.assertRaises(TypeError):
        self.kser.first("1D")

    pser = pd.Series(
        [1, 2, 3, 4],
        index=pd.date_range("2018-04-09", periods=4, freq="2D"),
    )
    kser = ks.from_pandas(pser)

    # Koalas result first, then pandas, matching the original evaluation order.
    actual = kser.first("1D")
    expected = pser.first("1D")
    self.assert_eq(actual, expected)

def test_rename(self):
pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
DataFrame.duplicated
DataFrame.equals
DataFrame.filter
DataFrame.first
DataFrame.head
DataFrame.last
DataFrame.rename
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ Reindexing / Selection / Label manipulation
Series.equals
Series.add_prefix
Series.add_suffix
Series.first
Series.head
Series.idxmax
Series.idxmin
Expand Down