Skip to content

Commit 3c3ebf8

Browse files
authored
Data Explorer: Implement simplified search_schema API that can also sort column names in ascending / descending order (#8810)
Addresses #8804, updating the search_schema data explorer API to only return column indices, removing pagination (which was a nice idea, but until we actually have a demonstrated performance problem just adds complexity), and adding sorting capability.
1 parent 94c64d2 commit 3c3ebf8

File tree

6 files changed

+184
-130
lines changed

6 files changed

+184
-130
lines changed

extensions/positron-python/python_files/posit/positron/connections.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def get_metadata(self) -> MetadataSchema:
151151
- code: The code used to recreate the connection.
152152
"""
153153
return MetadataSchema(
154-
name=self.display_name,
154+
name=self.display_name or "Unnamed Connection",
155155
language_id="python",
156156
host=self.host,
157157
type=self.type,

extensions/positron-python/python_files/posit/positron/data_explorer.py

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
SearchSchemaFeatures,
8080
SearchSchemaParams,
8181
SearchSchemaResult,
82+
SearchSchemaSortOrder,
8283
SetColumnFiltersFeatures,
8384
SetColumnFiltersParams,
8485
SetRowFiltersFeatures,
@@ -294,8 +295,7 @@ def convert_to_code(self, request: ConvertToCodeRequest):
294295

295296
def search_schema(self, params: SearchSchemaParams):
296297
filters = params.filters
297-
start_index = params.start_index
298-
max_results = params.max_results
298+
sort_order = params.sort_order
299299
if self._search_schema_last_result is not None:
300300
last_filters, matches = self._search_schema_last_result
301301
if last_filters != filters:
@@ -305,11 +305,16 @@ def search_schema(self, params: SearchSchemaParams):
305305
matches = self._column_filter_get_matches(filters)
306306
self._search_schema_last_result = (filters, matches)
307307

308-
matches_slice = matches[start_index : start_index + max_results]
309-
return SearchSchemaResult(
310-
matches=TableSchema(columns=[self._get_single_column_schema(i) for i in matches_slice]),
311-
total_num_matches=len(matches),
312-
)
308+
# Apply sorting based on sort_order
309+
if sort_order == SearchSchemaSortOrder.Ascending:
310+
# Sort by column name ascending
311+
matches = sorted(matches, key=lambda idx: self._get_column_name(idx))
312+
elif sort_order == SearchSchemaSortOrder.Descending:
313+
# Sort by column name descending
314+
matches = sorted(matches, key=lambda idx: self._get_column_name(idx), reverse=True)
315+
# For SearchSchemaSortOrder.Original, keep original order (no sorting needed)
316+
317+
return SearchSchemaResult(matches=matches)
313318

314319
def _column_filter_get_matches(self, filters: list[ColumnFilter]):
315320
matchers = self._get_column_filter_functions(filters)
@@ -341,8 +346,8 @@ def matcher(index):
341346

342347
def _match_display_types(params: FilterMatchDataTypes):
343348
def matcher(index):
344-
schema = self._get_single_column_schema(index)
345-
return schema.type_display in params.display_types
349+
type_display = self._get_column_type_display(index)
350+
return type_display in params.display_types
346351

347352
return matcher
348353

@@ -362,6 +367,9 @@ def matcher(index):
362367
def _get_column_name(self, column_index: int) -> str:
363368
raise NotImplementedError
364369

370+
def _get_column_type_display(self, column_index: int) -> ColumnDisplayType:
371+
raise NotImplementedError
372+
365373
def get_data_values(self, params: GetDataValuesParams):
366374
self._recompute_if_needed()
367375
return self._get_data_values(
@@ -1421,7 +1429,10 @@ def _get_type(cls, column, column_index, state: DataExplorerState):
14211429
categories_type_name = cls._get_inferred_dtype(
14221430
dtype.categories, column_index, state
14231431
)
1424-
type_display = cls.TYPE_NAME_MAPPING.get(categories_type_name, categories_type_name)
1432+
categories_type_name = cls.TYPE_NAME_MAPPING.get(
1433+
categories_type_name, categories_type_name
1434+
)
1435+
type_display = cls._get_type_display(categories_type_name)
14251436
else:
14261437
categories_type_name = str(dtype.categories.dtype)
14271438
type_display = cls._get_type_display(categories_type_name)
@@ -1519,6 +1530,11 @@ def _get_single_column_schema(self, column_index: int):
15191530
def _get_column_name(self, index: int):
15201531
return str(self.table.columns[index])
15211532

1533+
def _get_column_type_display(self, column_index: int) -> ColumnDisplayType:
1534+
column = self.table.iloc[:, column_index]
1535+
_, type_display = self._get_type(column, column_index, self.state)
1536+
return type_display
1537+
15221538
def _get_data_values(
15231539
self,
15241540
selections: list[ColumnSelection],
@@ -2346,6 +2362,10 @@ def _get_single_column_schema(self, column_index: int):
23462362
def _get_column_name(self, column_index: int) -> str:
23472363
return self.table[:, column_index].name
23482364

2365+
def _get_column_type_display(self, column_index: int) -> ColumnDisplayType:
2366+
column = self.table[:, column_index]
2367+
return self._get_type_display(column.dtype)
2368+
23492369
@classmethod
23502370
def _construct_schema(
23512371
cls,
@@ -2395,7 +2415,7 @@ def _construct_schema(
23952415
"Object": "object",
23962416
"List": "array",
23972417
"Struct": "struct",
2398-
"Categorical": "categorical",
2418+
"Categorical": "string",
23992419
"Enum": "unknown",
24002420
"Null": "unknown", # Not yet implemented
24012421
"Unknown": "unknown",
@@ -2405,7 +2425,8 @@ def _construct_schema(
24052425
@classmethod
24062426
def _get_type_display(cls, dtype: pl.DataType):
24072427
key = str(dtype.base_type())
2408-
return cls.TYPE_DISPLAY_MAPPING.get(key, "unknown")
2428+
type_display = cls.TYPE_DISPLAY_MAPPING.get(key, "unknown")
2429+
return ColumnDisplayType(type_display)
24092430

24102431
def _search_schema(
24112432
self, filters: list[ColumnFilter], start_index: int, max_results: int

extensions/positron-python/python_files/posit/positron/data_explorer_comm.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,19 @@
1818
from ._vendor.pydantic import BaseModel, Field, StrictBool, StrictFloat, StrictInt, StrictStr
1919

2020

21+
@enum.unique
22+
class SearchSchemaSortOrder(str, enum.Enum):
23+
"""
24+
Possible values for SortOrder in SearchSchema
25+
"""
26+
27+
Original = "original"
28+
29+
Ascending = "ascending"
30+
31+
Descending = "descending"
32+
33+
2134
@enum.unique
2235
class ColumnDisplayType(str, enum.Enum):
2336
"""
@@ -229,12 +242,8 @@ class SearchSchemaResult(BaseModel):
229242
Result in Methods
230243
"""
231244

232-
matches: TableSchema = Field(
233-
description="A schema containing matching columns up to the max_results limit",
234-
)
235-
236-
total_num_matches: StrictInt = Field(
237-
description="The total number of columns matching the filter",
245+
matches: List[StrictInt] = Field(
246+
description="The column indices of the matching column indices in the indicated sort order",
238247
)
239248

240249

@@ -1235,7 +1244,7 @@ class DataExplorerBackendRequest(str, enum.Enum):
12351244
# Request schema
12361245
GetSchema = "get_schema"
12371246

1238-
# Search full, unfiltered table schema with column filters
1247+
# Search table schema with column filters, optionally sort results
12391248
SearchSchema = "search_schema"
12401249

12411250
# Request formatted values from table columns
@@ -1329,27 +1338,21 @@ class GetSchemaRequest(BaseModel):
13291338

13301339
class SearchSchemaParams(BaseModel):
13311340
"""
1332-
Search full, unfiltered table schema for column names matching one or
1333-
more column filters
1341+
Search table schema with column filters, optionally sort results
13341342
"""
13351343

13361344
filters: List[ColumnFilter] = Field(
1337-
description="Column filters to apply when searching",
1338-
)
1339-
1340-
start_index: StrictInt = Field(
1341-
description="Index (starting from zero) of first result to fetch (for paging)",
1345+
description="Column filters to apply when searching, can be empty",
13421346
)
13431347

1344-
max_results: StrictInt = Field(
1345-
description="Maximum number of resulting column schemas to fetch from the start index",
1348+
sort_order: SearchSchemaSortOrder = Field(
1349+
description="How to sort results: original in-schema order, alphabetical ascending or descending",
13461350
)
13471351

13481352

13491353
class SearchSchemaRequest(BaseModel):
13501354
"""
1351-
Search full, unfiltered table schema for column names matching one or
1352-
more column filters
1355+
Search table schema with column filters, optionally sort results
13531356
"""
13541357

13551358
params: SearchSchemaParams = Field(

extensions/positron-python/python_files/posit/positron/tests/test_data_explorer.py

Lines changed: 89 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -394,13 +394,12 @@ def get_schema(self, table_name, column_indices=None):
394394
column_indices=column_indices,
395395
)["columns"]
396396

397-
def search_schema(self, table_name, filters, start_index, max_results):
397+
def search_schema(self, table_name, filters, sort_order="original"):
398398
return self.do_json_rpc(
399399
table_name,
400400
"search_schema",
401401
filters=filters,
402-
start_index=start_index,
403-
max_results=max_results,
402+
sort_order=sort_order,
404403
)
405404

406405
def get_state(self, table_name):
@@ -907,11 +906,11 @@ def _match_types_filter(data_types):
907906
def test_search_schema(dxf: DataExplorerFixture):
908907
# Test search_schema RPC for pandas and polars
909908

910-
# Make a few thousand column names we can search for
909+
# Make a smaller set of column names for easier testing
911910
column_names = [
912911
f"{prefix}_{i}"
913-
for prefix in ["aaa", "bbb", "ccc", "ddd"]
914-
for i in range({"aaa": 1000, "bbb": 100, "ccc": 50, "ddd": 10}[prefix])
912+
for prefix in ["apple", "banana", "cherry", "date"]
913+
for i in range({"apple": 10, "banana": 5, "cherry": 3, "date": 2}[prefix])
915914
]
916915

917916
data_examples = {
@@ -939,54 +938,94 @@ def test_search_schema(dxf: DataExplorerFixture):
939938
dxf.register_table("test_df", test_df)
940939
dxf.register_table("dfp", dfp)
941940

942-
aaa_filter = _text_search_filter("aaa")
943-
bbb_filter = _text_search_filter("bbb")
944-
ccc_filter = _text_search_filter("ccc")
945-
ddd_filter = _text_search_filter("ddd")
941+
apple_filter = _text_search_filter("apple")
942+
banana_filter = _text_search_filter("banana")
946943

947944
for name in ["test_df", "dfp"]:
948-
full_schema = dxf.get_schema(name, list(range(len(column_names))))
945+
# Test filtering by text
946+
result = dxf.search_schema(name, [apple_filter])
947+
expected_apple_indices = [i for i, col in enumerate(column_names) if "apple" in col]
948+
assert result["matches"] == expected_apple_indices
949+
950+
result = dxf.search_schema(name, [banana_filter])
951+
expected_banana_indices = [i for i, col in enumerate(column_names) if "banana" in col]
952+
assert result["matches"] == expected_banana_indices
953+
954+
# Test filtering by data type
955+
string_filter = _match_types_filter([ColumnDisplayType.String])
956+
result = dxf.search_schema(name, [string_filter])
957+
# String columns should be at indices 1, 6, 11, 16 (every 5th starting from 1)
958+
expected_string_indices = [i for i in range(len(column_names)) if i % 5 == 1]
959+
assert result["matches"] == expected_string_indices
960+
961+
# Test combining filters
962+
result = dxf.search_schema(name, [apple_filter, string_filter])
963+
# Apple columns that are also strings
964+
expected_combined = [i for i in expected_apple_indices if i % 5 == 1]
965+
assert result["matches"] == expected_combined
966+
967+
# Test sorting
968+
result = dxf.search_schema(name, [], "original")
969+
expected_all_indices = list(range(len(column_names)))
970+
assert result["matches"] == expected_all_indices
971+
972+
result = dxf.search_schema(name, [], "ascending")
973+
# Should be sorted by column name alphabetically
974+
expected_sorted = sorted(range(len(column_names)), key=lambda i: column_names[i])
975+
assert result["matches"] == expected_sorted
976+
977+
result = dxf.search_schema(name, [], "descending")
978+
# Should be sorted by column name reverse alphabetically
979+
expected_reverse_sorted = sorted(
980+
range(len(column_names)), key=lambda i: column_names[i], reverse=True
981+
)
982+
assert result["matches"] == expected_reverse_sorted
949983

950-
# (search_term, start_index, max_results, ex_total, ex_matches)
951-
cases = [
952-
([aaa_filter], 0, 100, 1000, full_schema[:100]),
953-
(
954-
[aaa_filter, _match_types_filter([ColumnDisplayType.String])],
955-
0,
956-
100,
957-
200,
958-
full_schema[:500][1::5],
959-
),
960-
(
961-
[
962-
aaa_filter,
963-
_match_types_filter([ColumnDisplayType.Boolean, ColumnDisplayType.Number]),
964-
],
965-
0,
966-
120,
967-
600,
968-
[x for i, x in enumerate(full_schema[:200]) if i % 5 in (0, 2, 3)],
969-
),
970-
([aaa_filter], 100, 100, 1000, full_schema[100:200]),
971-
([aaa_filter], 950, 100, 1000, full_schema[950:1000]),
972-
([aaa_filter], 1000, 100, 1000, []),
973-
([bbb_filter], 0, 10, 100, full_schema[1000:1010]),
974-
([ccc_filter], 0, 10, 50, full_schema[1100:1110]),
975-
([ddd_filter], 0, 10, 10, full_schema[1150:1160]),
976-
]
977984

978-
for (
979-
filters,
980-
start_index,
981-
max_results,
982-
ex_total,
983-
ex_matches,
984-
) in cases:
985-
result = dxf.search_schema(name, filters, start_index, max_results)
986-
987-
assert result["total_num_matches"] == ex_total
988-
matches = result["matches"]["columns"]
989-
assert matches == ex_matches
985+
def test_search_schema_sort_by_name(dxf: DataExplorerFixture):
986+
# Test comprehensive sort-by-name functionality
987+
988+
# Create a dataframe with deliberately mixed-case and varied column names
989+
column_names = ["Zebra", "apple", "BANANA", "Cherry", "date", "Elephant", "fig"]
990+
data = {name: [1, 2, 3, 4, 5] for name in column_names}
991+
992+
test_df = pd.DataFrame(data)
993+
dfp = pl.DataFrame(data)
994+
995+
dxf.register_table("sort_test_df", test_df)
996+
dxf.register_table("sort_test_dfp", dfp)
997+
998+
for name in ["sort_test_df", "sort_test_dfp"]:
999+
# Test original order (should be same as column order)
1000+
result = dxf.search_schema(name, [], "original")
1001+
expected_original = list(range(len(column_names)))
1002+
assert result["matches"] == expected_original
1003+
1004+
# Test ascending sort (case-sensitive alphabetical)
1005+
result = dxf.search_schema(name, [], "ascending")
1006+
expected_ascending = sorted(range(len(column_names)), key=lambda i: column_names[i])
1007+
assert result["matches"] == expected_ascending
1008+
1009+
# Test descending sort
1010+
result = dxf.search_schema(name, [], "descending")
1011+
expected_descending = sorted(
1012+
range(len(column_names)), key=lambda i: column_names[i], reverse=True
1013+
)
1014+
assert result["matches"] == expected_descending
1015+
1016+
# Test that sorting works with filters too
1017+
filter_with_a = _text_search_filter("a") # Case-insensitive: should match "Zebra", "apple", "BANANA", "date", "Elephant"
1018+
1019+
result = dxf.search_schema(name, [filter_with_a], "ascending")
1020+
filtered_indices = [i for i, col in enumerate(column_names) if "a" in col.lower()]
1021+
expected_filtered_ascending = sorted(filtered_indices, key=lambda i: column_names[i])
1022+
assert result["matches"] == expected_filtered_ascending
1023+
1024+
result = dxf.search_schema(name, [filter_with_a], "descending")
1025+
expected_filtered_descending = sorted(
1026+
filtered_indices, key=lambda i: column_names[i], reverse=True
1027+
)
1028+
assert result["matches"] == expected_filtered_descending
9901029

9911030

9921031
def test_pandas_get_data_values(dxf: DataExplorerFixture):

0 commit comments

Comments
 (0)