Data Explorer: Implement simplified search_schema API that can also sort column names in ascending / descending order (#8810)

wesm · web-flow · commit 3c3ebf889415 · 2025-08-04T12:47:15.000-05:00
Addresses #8804. Updates the search_schema data explorer API to return only column indices, removes pagination (a nice idea, but until we have a demonstrated performance problem it just adds complexity), and adds the ability to sort results by column name.
diff --git a/extensions/positron-python/python_files/posit/positron/connections.py b/extensions/positron-python/python_files/posit/positron/connections.py
@@ -151,7 +151,7 @@ def get_metadata(self) -> MetadataSchema:
         - code: The code used to recreate the connection.
         """
         return MetadataSchema(
-            name=self.display_name,
+            name=self.display_name or "Unnamed Connection",
             language_id="python",
             host=self.host,
             type=self.type,
diff --git a/extensions/positron-python/python_files/posit/positron/data_explorer.py b/extensions/positron-python/python_files/posit/positron/data_explorer.py
@@ -79,6 +79,7 @@
     SearchSchemaFeatures,
     SearchSchemaParams,
     SearchSchemaResult,
+    SearchSchemaSortOrder,
     SetColumnFiltersFeatures,
     SetColumnFiltersParams,
     SetRowFiltersFeatures,
@@ -294,8 +295,7 @@ def convert_to_code(self, request: ConvertToCodeRequest):
 
     def search_schema(self, params: SearchSchemaParams):
         filters = params.filters
-        start_index = params.start_index
-        max_results = params.max_results
+        sort_order = params.sort_order
         if self._search_schema_last_result is not None:
             last_filters, matches = self._search_schema_last_result
             if last_filters != filters:
@@ -305,11 +305,16 @@ def search_schema(self, params: SearchSchemaParams):
             matches = self._column_filter_get_matches(filters)
             self._search_schema_last_result = (filters, matches)
 
-        matches_slice = matches[start_index : start_index + max_results]
-        return SearchSchemaResult(
-            matches=TableSchema(columns=[self._get_single_column_schema(i) for i in matches_slice]),
-            total_num_matches=len(matches),
-        )
+        # Apply sorting based on sort_order
+        if sort_order == SearchSchemaSortOrder.Ascending:
+            # Sort by column name ascending
+            matches = sorted(matches, key=lambda idx: self._get_column_name(idx))
+        elif sort_order == SearchSchemaSortOrder.Descending:
+            # Sort by column name descending
+            matches = sorted(matches, key=lambda idx: self._get_column_name(idx), reverse=True)
+        # For SearchSchemaSortOrder.Original, keep original order (no sorting needed)
+
+        return SearchSchemaResult(matches=matches)
 
     def _column_filter_get_matches(self, filters: list[ColumnFilter]):
         matchers = self._get_column_filter_functions(filters)
@@ -341,8 +346,8 @@ def matcher(index):
 
         def _match_display_types(params: FilterMatchDataTypes):
             def matcher(index):
-                schema = self._get_single_column_schema(index)
-                return schema.type_display in params.display_types
+                type_display = self._get_column_type_display(index)
+                return type_display in params.display_types
 
             return matcher
 
@@ -362,6 +367,9 @@ def matcher(index):
     def _get_column_name(self, column_index: int) -> str:
         raise NotImplementedError
 
+    def _get_column_type_display(self, column_index: int) -> ColumnDisplayType:
+        raise NotImplementedError
+
     def get_data_values(self, params: GetDataValuesParams):
         self._recompute_if_needed()
         return self._get_data_values(
@@ -1421,7 +1429,10 @@ def _get_type(cls, column, column_index, state: DataExplorerState):
                 categories_type_name = cls._get_inferred_dtype(
                     dtype.categories, column_index, state
                 )
-                type_display = cls.TYPE_NAME_MAPPING.get(categories_type_name, categories_type_name)
+                categories_type_name = cls.TYPE_NAME_MAPPING.get(
+                    categories_type_name, categories_type_name
+                )
+                type_display = cls._get_type_display(categories_type_name)
             else:
                 categories_type_name = str(dtype.categories.dtype)
                 type_display = cls._get_type_display(categories_type_name)
@@ -1519,6 +1530,11 @@ def _get_single_column_schema(self, column_index: int):
     def _get_column_name(self, index: int):
         return str(self.table.columns[index])
 
+    def _get_column_type_display(self, column_index: int) -> ColumnDisplayType:
+        column = self.table.iloc[:, column_index]
+        _, type_display = self._get_type(column, column_index, self.state)
+        return type_display
+
     def _get_data_values(
         self,
         selections: list[ColumnSelection],
@@ -2346,6 +2362,10 @@ def _get_single_column_schema(self, column_index: int):
     def _get_column_name(self, column_index: int) -> str:
         return self.table[:, column_index].name
 
+    def _get_column_type_display(self, column_index: int) -> ColumnDisplayType:
+        column = self.table[:, column_index]
+        return self._get_type_display(column.dtype)
+
     @classmethod
     def _construct_schema(
         cls,
@@ -2395,7 +2415,7 @@ def _construct_schema(
             "Object": "object",
             "List": "array",
             "Struct": "struct",
-            "Categorical": "categorical",
+            "Categorical": "string",
             "Enum": "unknown",
             "Null": "unknown",  # Not yet implemented
             "Unknown": "unknown",
@@ -2405,7 +2425,8 @@ def _construct_schema(
     @classmethod
     def _get_type_display(cls, dtype: pl.DataType):
         key = str(dtype.base_type())
-        return cls.TYPE_DISPLAY_MAPPING.get(key, "unknown")
+        type_display = cls.TYPE_DISPLAY_MAPPING.get(key, "unknown")
+        return ColumnDisplayType(type_display)
 
     def _search_schema(
         self, filters: list[ColumnFilter], start_index: int, max_results: int
diff --git a/extensions/positron-python/python_files/posit/positron/data_explorer_comm.py b/extensions/positron-python/python_files/posit/positron/data_explorer_comm.py
@@ -18,6 +18,19 @@
 from ._vendor.pydantic import BaseModel, Field, StrictBool, StrictFloat, StrictInt, StrictStr
 
 
+@enum.unique
+class SearchSchemaSortOrder(str, enum.Enum):
+    """
+    Possible values for SortOrder in SearchSchema
+    """
+
+    Original = "original"
+
+    Ascending = "ascending"
+
+    Descending = "descending"
+
+
 @enum.unique
 class ColumnDisplayType(str, enum.Enum):
     """
@@ -229,12 +242,8 @@ class SearchSchemaResult(BaseModel):
     Result in Methods
     """
 
-    matches: TableSchema = Field(
-        description="A schema containing matching columns up to the max_results limit",
-    )
-
-    total_num_matches: StrictInt = Field(
-        description="The total number of columns matching the filter",
+    matches: List[StrictInt] = Field(
+        description="The column indices of the matching column indices in the indicated sort order",
     )
 
 
@@ -1235,7 +1244,7 @@ class DataExplorerBackendRequest(str, enum.Enum):
     # Request schema
     GetSchema = "get_schema"
 
-    # Search full, unfiltered table schema with column filters
+    # Search table schema with column filters, optionally sort results
     SearchSchema = "search_schema"
 
     # Request formatted values from table columns
@@ -1329,27 +1338,21 @@ class GetSchemaRequest(BaseModel):
 
 class SearchSchemaParams(BaseModel):
     """
-    Search full, unfiltered table schema for column names matching one or
-    more column filters
+    Search table schema with column filters, optionally sort results
     """
 
     filters: List[ColumnFilter] = Field(
-        description="Column filters to apply when searching",
-    )
-
-    start_index: StrictInt = Field(
-        description="Index (starting from zero) of first result to fetch (for paging)",
+        description="Column filters to apply when searching, can be empty",
     )
 
-    max_results: StrictInt = Field(
-        description="Maximum number of resulting column schemas to fetch from the start index",
+    sort_order: SearchSchemaSortOrder = Field(
+        description="How to sort results: original in-schema order, alphabetical ascending or descending",
     )
 
 
 class SearchSchemaRequest(BaseModel):
     """
-    Search full, unfiltered table schema for column names matching one or
-    more column filters
+    Search table schema with column filters, optionally sort results
     """
 
     params: SearchSchemaParams = Field(
diff --git a/extensions/positron-python/python_files/posit/positron/tests/test_data_explorer.py b/extensions/positron-python/python_files/posit/positron/tests/test_data_explorer.py
@@ -394,13 +394,12 @@ def get_schema(self, table_name, column_indices=None):
             column_indices=column_indices,
         )["columns"]
 
-    def search_schema(self, table_name, filters, start_index, max_results):
+    def search_schema(self, table_name, filters, sort_order="original"):
         return self.do_json_rpc(
             table_name,
             "search_schema",
             filters=filters,
-            start_index=start_index,
-            max_results=max_results,
+            sort_order=sort_order,
         )
 
     def get_state(self, table_name):
@@ -907,11 +906,11 @@ def _match_types_filter(data_types):
 def test_search_schema(dxf: DataExplorerFixture):
     # Test search_schema RPC for pandas and polars
 
-    # Make a few thousand column names we can search for
+    # Make a smaller set of column names for easier testing
     column_names = [
         f"{prefix}_{i}"
-        for prefix in ["aaa", "bbb", "ccc", "ddd"]
-        for i in range({"aaa": 1000, "bbb": 100, "ccc": 50, "ddd": 10}[prefix])
+        for prefix in ["apple", "banana", "cherry", "date"]
+        for i in range({"apple": 10, "banana": 5, "cherry": 3, "date": 2}[prefix])
     ]
 
     data_examples = {
@@ -939,54 +938,94 @@ def test_search_schema(dxf: DataExplorerFixture):
     dxf.register_table("test_df", test_df)
     dxf.register_table("dfp", dfp)
 
-    aaa_filter = _text_search_filter("aaa")
-    bbb_filter = _text_search_filter("bbb")
-    ccc_filter = _text_search_filter("ccc")
-    ddd_filter = _text_search_filter("ddd")
+    apple_filter = _text_search_filter("apple")
+    banana_filter = _text_search_filter("banana")
 
     for name in ["test_df", "dfp"]:
-        full_schema = dxf.get_schema(name, list(range(len(column_names))))
+        # Test filtering by text
+        result = dxf.search_schema(name, [apple_filter])
+        expected_apple_indices = [i for i, col in enumerate(column_names) if "apple" in col]
+        assert result["matches"] == expected_apple_indices
+
+        result = dxf.search_schema(name, [banana_filter])
+        expected_banana_indices = [i for i, col in enumerate(column_names) if "banana" in col]
+        assert result["matches"] == expected_banana_indices
+
+        # Test filtering by data type
+        string_filter = _match_types_filter([ColumnDisplayType.String])
+        result = dxf.search_schema(name, [string_filter])
+        # String columns should be at indices 1, 6, 11, 16 (every 5th starting from 1)
+        expected_string_indices = [i for i in range(len(column_names)) if i % 5 == 1]
+        assert result["matches"] == expected_string_indices
+
+        # Test combining filters
+        result = dxf.search_schema(name, [apple_filter, string_filter])
+        # Apple columns that are also strings
+        expected_combined = [i for i in expected_apple_indices if i % 5 == 1]
+        assert result["matches"] == expected_combined
+
+        # Test sorting
+        result = dxf.search_schema(name, [], "original")
+        expected_all_indices = list(range(len(column_names)))
+        assert result["matches"] == expected_all_indices
+
+        result = dxf.search_schema(name, [], "ascending")
+        # Should be sorted by column name alphabetically
+        expected_sorted = sorted(range(len(column_names)), key=lambda i: column_names[i])
+        assert result["matches"] == expected_sorted
+
+        result = dxf.search_schema(name, [], "descending")
+        # Should be sorted by column name reverse alphabetically
+        expected_reverse_sorted = sorted(
+            range(len(column_names)), key=lambda i: column_names[i], reverse=True
+        )
+        assert result["matches"] == expected_reverse_sorted
 
-        # (search_term, start_index, max_results, ex_total, ex_matches)
-        cases = [
-            ([aaa_filter], 0, 100, 1000, full_schema[:100]),
-            (
-                [aaa_filter, _match_types_filter([ColumnDisplayType.String])],
-                0,
-                100,
-                200,
-                full_schema[:500][1::5],
-            ),
-            (
-                [
-                    aaa_filter,
-                    _match_types_filter([ColumnDisplayType.Boolean, ColumnDisplayType.Number]),
-                ],
-                0,
-                120,
-                600,
-                [x for i, x in enumerate(full_schema[:200]) if i % 5 in (0, 2, 3)],
-            ),
-            ([aaa_filter], 100, 100, 1000, full_schema[100:200]),
-            ([aaa_filter], 950, 100, 1000, full_schema[950:1000]),
-            ([aaa_filter], 1000, 100, 1000, []),
-            ([bbb_filter], 0, 10, 100, full_schema[1000:1010]),
-            ([ccc_filter], 0, 10, 50, full_schema[1100:1110]),
-            ([ddd_filter], 0, 10, 10, full_schema[1150:1160]),
-        ]
 
-        for (
-            filters,
-            start_index,
-            max_results,
-            ex_total,
-            ex_matches,
-        ) in cases:
-            result = dxf.search_schema(name, filters, start_index, max_results)
-
-            assert result["total_num_matches"] == ex_total
-            matches = result["matches"]["columns"]
-            assert matches == ex_matches
+def test_search_schema_sort_by_name(dxf: DataExplorerFixture):
+    # Test comprehensive sort-by-name functionality
+
+    # Create a dataframe with deliberately mixed-case and varied column names
+    column_names = ["Zebra", "apple", "BANANA", "Cherry", "date", "Elephant", "fig"]
+    data = {name: [1, 2, 3, 4, 5] for name in column_names}
+
+    test_df = pd.DataFrame(data)
+    dfp = pl.DataFrame(data)
+
+    dxf.register_table("sort_test_df", test_df)
+    dxf.register_table("sort_test_dfp", dfp)
+
+    for name in ["sort_test_df", "sort_test_dfp"]:
+        # Test original order (should be same as column order)
+        result = dxf.search_schema(name, [], "original")
+        expected_original = list(range(len(column_names)))
+        assert result["matches"] == expected_original
+
+        # Test ascending sort (case-sensitive: uppercase names sort before lowercase)
+        result = dxf.search_schema(name, [], "ascending")
+        expected_ascending = sorted(range(len(column_names)), key=lambda i: column_names[i])
+        assert result["matches"] == expected_ascending
+
+        # Test descending sort
+        result = dxf.search_schema(name, [], "descending")
+        expected_descending = sorted(
+            range(len(column_names)), key=lambda i: column_names[i], reverse=True
+        )
+        assert result["matches"] == expected_descending
+
+        # Test that sorting works with filters too
+        filter_with_a = _text_search_filter("a")  # Matches all but "Cherry" and "fig"
+
+        result = dxf.search_schema(name, [filter_with_a], "ascending")
+        filtered_indices = [i for i, col in enumerate(column_names) if "a" in col.lower()]
+        expected_filtered_ascending = sorted(filtered_indices, key=lambda i: column_names[i])
+        assert result["matches"] == expected_filtered_ascending
+
+        result = dxf.search_schema(name, [filter_with_a], "descending")
+        expected_filtered_descending = sorted(
+            filtered_indices, key=lambda i: column_names[i], reverse=True
+        )
+        assert result["matches"] == expected_filtered_descending
 
 
 def test_pandas_get_data_values(dxf: DataExplorerFixture):
diff --git a/positron/comms/data_explorer-backend-openrpc.json b/positron/comms/data_explorer-backend-openrpc.json
diff --git a/src/vs/workbench/services/languageRuntime/common/positronDataExplorerComm.ts b/src/vs/workbench/services/languageRuntime/common/positronDataExplorerComm.ts