Fix loc with slice when the index is monotonically decreasing. (#1179)

databricks · Jan 9, 2020 · 91f4a6b
1 parent 68c602a
commit 91f4a6b
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 36 deletions.
diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py
@@ -437,14 +437,13 @@ def _select_rows(self, rows_sel):
                 index_data_type = index_column.spark_type
                 start = rows_sel.start
                 stop = rows_sel.stop
-                start_order_column = sdf[NATURAL_ORDER_COLUMN_NAME]
-                stop_order_column = sdf[NATURAL_ORDER_COLUMN_NAME]
 
                 # get natural order from '__natural_order__' from start to stop
                 # to keep natural order.
                 start_and_stop = (
                     sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME)
-                       .where((index_column._scol == start) | (index_column._scol == stop))
+                       .where((index_column._scol == F.lit(start).cast(index_data_type))
+                              | (index_column._scol == F.lit(stop).cast(index_data_type)))
                        .collect())
 
                 start = [row[1] for row in start_and_stop if row[0] == start]
@@ -453,44 +452,37 @@ def _select_rows(self, rows_sel):
                 stop = [row[1] for row in start_and_stop if row[0] == stop]
                 stop = stop[-1] if len(stop) > 0 else None
 
-                # Assume we use the natural order by default.
-                start_order_column_type = LongType()
-                stop_order_column_type = LongType()
+                cond = []
+                if start is not None:
+                    cond.append(F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(LongType()))
+                if stop is not None:
+                    cond.append(F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(LongType()))
 
                 # if index order is not monotonic increasing or decreasing
                 # and specified values don't exist in index, raise KeyError
                 if ((start is None and rows_sel.start is not None)
                         or (stop is None and rows_sel.stop is not None)):
-                    is_monotonic = sdf.select(
+                    inc, dec = sdf.select(
                         index_column._is_monotonic()._scol.alias('__increasing__'),
                         index_column._is_monotonic_decreasing()._scol.alias('__decreasing__')) \
-                        .select(F.min(F.coalesce('__increasing__', F.lit(True)))
-                                | F.min(F.coalesce('__decreasing__', F.lit(True)))).first()[0]
+                        .select(F.min(F.coalesce('__increasing__', F.lit(True))),
+                                F.min(F.coalesce('__decreasing__', F.lit(True)))).first()
                     if start is None and rows_sel.start is not None:
-                        if is_monotonic is False:
-                            raise KeyError(rows_sel.start)
+                        start = rows_sel.start
+                        if inc is not False:
+                            cond.append(index_column._scol >= F.lit(start).cast(index_data_type))
+                        elif dec is not False:
+                            cond.append(index_column._scol <= F.lit(start).cast(index_data_type))
                         else:
-                            start = rows_sel.start
-                            start_order_column = index_column._scol
-                            start_order_column_type = index_data_type
+                            raise KeyError(rows_sel.start)
                     if stop is None and rows_sel.stop is not None:
-                        if is_monotonic is False:
-                            raise KeyError(rows_sel.stop)
+                        stop = rows_sel.stop
+                        if inc is not False:
+                            cond.append(index_column._scol <= F.lit(stop).cast(index_data_type))
+                        elif dec is not False:
+                            cond.append(index_column._scol >= F.lit(stop).cast(index_data_type))
                         else:
-                            stop = rows_sel.stop
-                            stop_order_column = index_column._scol
-                            stop_order_column_type = index_data_type
-
-                # if start and stop are same, just get all start(or stop) values
-                if start == stop:
-                    return (index_column._scol == F.lit(rows_sel.start).cast(index_data_type),
-                            None, None)
-
-                cond = []
-                if start is not None:
-                    cond.append(start_order_column >= F.lit(start).cast(start_order_column_type))
-                if stop is not None:
-                    cond.append(stop_order_column <= F.lit(stop).cast(stop_order_column_type))
+                            raise KeyError(rows_sel.stop)
 
                 if len(cond) > 0:
                     return reduce(lambda x, y: x & y, cond), None, None

diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py
@@ -231,19 +231,39 @@ def test_loc(self):
         self.assertRaises(KeyError, lambda: kdf.loc[10])
         self.assertRaises(KeyError, lambda: kdf.a.loc[10])
 
-        # duplicated index test
+        # monotonically increasing index test
         pdf = pd.DataFrame(
-            [1, 2, 3, 4, 5, 6, 7, 8, 9],
-            index=[0, 1, 1, 2, 2, 2, 3, 4, 5])
+            {'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
+            index=[0, 1, 1, 2, 2, 2, 4, 5, 6])
         kdf = ks.from_pandas(pdf)
 
-        self.assert_eq(repr(kdf.loc[:2]), repr(pdf.loc[:2]))
+        self.assert_eq(kdf.loc[:2], pdf.loc[:2])
+        self.assert_eq(kdf.loc[:3], pdf.loc[:3])
+        self.assert_eq(kdf.loc[3:], pdf.loc[3:])
+        self.assert_eq(kdf.loc[4:], pdf.loc[4:])
+        self.assert_eq(kdf.loc[3:2], pdf.loc[3:2])
+        self.assert_eq(kdf.loc[-1:2], pdf.loc[-1:2])
+        self.assert_eq(kdf.loc[3:10], pdf.loc[3:10])
+
+        # monotonically decreasing index test
+        pdf = pd.DataFrame(
+            {'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
+            index=[6, 5, 5, 4, 4, 4, 2, 1, 0])
+        kdf = ks.from_pandas(pdf)
+
+        self.assert_eq(kdf.loc[:4], pdf.loc[:4])
+        self.assert_eq(kdf.loc[:3], pdf.loc[:3])
+        self.assert_eq(kdf.loc[3:], pdf.loc[3:])
+        self.assert_eq(kdf.loc[2:], pdf.loc[2:])
+        self.assert_eq(kdf.loc[2:3], pdf.loc[2:3])
+        self.assert_eq(kdf.loc[2:-1], pdf.loc[2:-1])
+        self.assert_eq(kdf.loc[10:3], pdf.loc[10:3])
 
         # test when type of key is string and given value is not included in key
-        pdf = pd.DataFrame([1, 2, 3], index=['a', 'b', 'd']).loc['a':'z']
+        pdf = pd.DataFrame({'a': [1, 2, 3]}, index=['a', 'b', 'd'])
         kdf = ks.from_pandas(pdf)
 
-        self.assert_eq(repr(kdf.loc['a':'z']), repr(pdf.loc['a':'z']))
+        self.assert_eq(kdf.loc['a':'z'], pdf.loc['a':'z'])
 
         # KeyError when index is not monotonic increasing or decreasing
         # and specified values don't exist in index