Skip to content

Commit

Permalink
Fix loc with slice when the index is monotonically decreasing. (#1179)
Browse files: browse the repository at this point in the history
  • Loading branch information
ueshin authored and HyukjinKwon committed Jan 9, 2020
1 parent 68c602a commit 91f4a6b
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 36 deletions.
52 changes: 22 additions & 30 deletions databricks/koalas/indexing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -437,14 +437,13 @@ def _select_rows(self, rows_sel):
index_data_type = index_column.spark_type
start = rows_sel.start
stop = rows_sel.stop
start_order_column = sdf[NATURAL_ORDER_COLUMN_NAME]
stop_order_column = sdf[NATURAL_ORDER_COLUMN_NAME]

# get natural order from '__natural_order__' from start to stop
# to keep natural order.
start_and_stop = (
sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME)
.where((index_column._scol == start) | (index_column._scol == stop))
.where((index_column._scol == F.lit(start).cast(index_data_type))
| (index_column._scol == F.lit(stop).cast(index_data_type)))
.collect())

start = [row[1] for row in start_and_stop if row[0] == start]
Expand All @@ -453,44 +452,37 @@ def _select_rows(self, rows_sel):
stop = [row[1] for row in start_and_stop if row[0] == stop]
stop = stop[-1] if len(stop) > 0 else None

# Assume we use the natural order by default.
start_order_column_type = LongType()
stop_order_column_type = LongType()
cond = []
if start is not None:
cond.append(F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(LongType()))
if stop is not None:
cond.append(F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(LongType()))

# if index order is not monotonic increasing or decreasing
# and specified values don't exist in index, raise KeyError
if ((start is None and rows_sel.start is not None)
or (stop is None and rows_sel.stop is not None)):
is_monotonic = sdf.select(
inc, dec = sdf.select(
index_column._is_monotonic()._scol.alias('__increasing__'),
index_column._is_monotonic_decreasing()._scol.alias('__decreasing__')) \
.select(F.min(F.coalesce('__increasing__', F.lit(True)))
| F.min(F.coalesce('__decreasing__', F.lit(True)))).first()[0]
.select(F.min(F.coalesce('__increasing__', F.lit(True))),
F.min(F.coalesce('__decreasing__', F.lit(True)))).first()
if start is None and rows_sel.start is not None:
if is_monotonic is False:
raise KeyError(rows_sel.start)
start = rows_sel.start
if inc is not False:
cond.append(index_column._scol >= F.lit(start).cast(index_data_type))
elif dec is not False:
cond.append(index_column._scol <= F.lit(start).cast(index_data_type))
else:
start = rows_sel.start
start_order_column = index_column._scol
start_order_column_type = index_data_type
raise KeyError(rows_sel.start)
if stop is None and rows_sel.stop is not None:
if is_monotonic is False:
raise KeyError(rows_sel.stop)
stop = rows_sel.stop
if inc is not False:
cond.append(index_column._scol <= F.lit(stop).cast(index_data_type))
elif dec is not False:
cond.append(index_column._scol >= F.lit(stop).cast(index_data_type))
else:
stop = rows_sel.stop
stop_order_column = index_column._scol
stop_order_column_type = index_data_type

# if start and stop are same, just get all start(or stop) values
if start == stop:
return (index_column._scol == F.lit(rows_sel.start).cast(index_data_type),
None, None)

cond = []
if start is not None:
cond.append(start_order_column >= F.lit(start).cast(start_order_column_type))
if stop is not None:
cond.append(stop_order_column <= F.lit(stop).cast(stop_order_column_type))
raise KeyError(rows_sel.stop)

if len(cond) > 0:
return reduce(lambda x, y: x & y, cond), None, None
Expand Down
32 changes: 26 additions & 6 deletions databricks/koalas/tests/test_indexing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -231,19 +231,39 @@ def test_loc(self):
self.assertRaises(KeyError, lambda: kdf.loc[10])
self.assertRaises(KeyError, lambda: kdf.a.loc[10])

# duplicated index test
# monotonically increasing index test
pdf = pd.DataFrame(
[1, 2, 3, 4, 5, 6, 7, 8, 9],
index=[0, 1, 1, 2, 2, 2, 3, 4, 5])
{'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
index=[0, 1, 1, 2, 2, 2, 4, 5, 6])
kdf = ks.from_pandas(pdf)

self.assert_eq(repr(kdf.loc[:2]), repr(pdf.loc[:2]))
self.assert_eq(kdf.loc[:2], pdf.loc[:2])
self.assert_eq(kdf.loc[:3], pdf.loc[:3])
self.assert_eq(kdf.loc[3:], pdf.loc[3:])
self.assert_eq(kdf.loc[4:], pdf.loc[4:])
self.assert_eq(kdf.loc[3:2], pdf.loc[3:2])
self.assert_eq(kdf.loc[-1:2], pdf.loc[-1:2])
self.assert_eq(kdf.loc[3:10], pdf.loc[3:10])

# monotonically decreasing index test
pdf = pd.DataFrame(
{'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
index=[6, 5, 5, 4, 4, 4, 2, 1, 0])
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.loc[:4], pdf.loc[:4])
self.assert_eq(kdf.loc[:3], pdf.loc[:3])
self.assert_eq(kdf.loc[3:], pdf.loc[3:])
self.assert_eq(kdf.loc[2:], pdf.loc[2:])
self.assert_eq(kdf.loc[2:3], pdf.loc[2:3])
self.assert_eq(kdf.loc[2:-1], pdf.loc[2:-1])
self.assert_eq(kdf.loc[10:3], pdf.loc[10:3])

# test when type of key is string and given value is not included in key
pdf = pd.DataFrame([1, 2, 3], index=['a', 'b', 'd']).loc['a':'z']
pdf = pd.DataFrame({'a': [1, 2, 3]}, index=['a', 'b', 'd'])
kdf = ks.from_pandas(pdf)

self.assert_eq(repr(kdf.loc['a':'z']), repr(pdf.loc['a':'z']))
self.assert_eq(kdf.loc['a':'z'], pdf.loc['a':'z'])

# KeyError when index is not monotonic increasing or decreasing
# and specified values don't exist in index
Expand Down

0 comments on commit 91f4a6b

Please sign in to comment.