Skip to content

Commit

Permalink
[c++] Add SOMAArray::config_options_from_schema getter (#3146)
Browse files Browse the repository at this point in the history
  • Loading branch information
nguyenv authored Oct 8, 2024
1 parent 50d63c1 commit 04d40bf
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 22 deletions.
11 changes: 11 additions & 0 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,17 @@ class SOMAArray : public SOMAObject {
ctx_->tiledb_ctx(), arr_);
}

/**
 * @brief Summarize this array's schema settings as a PlatformConfig.
 *
 * The schema obtained from `mq_` is passed to
 * ArrowAdapter::platform_config_from_tiledb_schema, which reports capacity,
 * allows_duplicates, tile_order, cell_order, offsets_filters,
 * validity_filters, attr filters, and dim filters.
 *
 * @return PlatformConfig filled in from the schema
 */
PlatformConfig config_options_from_schema() const {
    // Keep the handle returned by schema() in a local so it stays alive
    // while the schema it refers to is being read.
    const auto schema_handle = mq_->schema();
    return ArrowAdapter::platform_config_from_tiledb_schema(*schema_handle);
}

/**
* @brief Get the mapping of attributes to Enumerations.
*
Expand Down
146 changes: 146 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,152 @@ void ArrowAdapter::release_array(struct ArrowArray* array) {
LOG_TRACE(fmt::format("[ArrowAdapter] release_array done"));
}

/**
 * Build a PlatformConfig from the settings stored in a TileDB array schema.
 *
 * Copies the capacity and allows-duplicates flag, converts the tile and cell
 * orders to their string names, and stores the offsets, validity,
 * per-attribute and per-dimension filter descriptions as serialized JSON
 * strings.
 *
 * @param tiledb_schema Schema to read the settings from
 * @return PlatformConfig populated from the schema
 */
PlatformConfig ArrowAdapter::platform_config_from_tiledb_schema(
    ArraySchema tiledb_schema) {
    // Translate a layout enum into its string name. A layout that is not
    // listed below produces an empty string.
    const auto layout_name = [](tiledb_layout_t layout) -> std::string {
        switch (layout) {
            case TILEDB_ROW_MAJOR:
                return "row-major";
            case TILEDB_COL_MAJOR:
                return "column-major";
            case TILEDB_HILBERT:
                return "hilbert";
            case TILEDB_UNORDERED:
                return "unordered";
            default:
                return "";
        }
    };

    PlatformConfig config;

    // Scalar schema members.
    config.capacity = tiledb_schema.capacity();
    config.allows_duplicates = tiledb_schema.allows_dups();

    // Layouts are stored by name.
    config.tile_order = layout_name(tiledb_schema.tile_order());
    config.cell_order = layout_name(tiledb_schema.cell_order());

    // Schema-wide filter lists, serialized as JSON text.
    const json offsets_json = _get_filter_list_json(
        tiledb_schema.offsets_filter_list());
    config.offsets_filters = offsets_json.dump();

    const json validity_json = _get_filter_list_json(
        tiledb_schema.validity_filter_list());
    config.validity_filters = validity_json.dump();

    // Per-attribute and per-dimension filter lists, keyed by name.
    const json attrs_json = _get_attrs_filter_list_json(tiledb_schema);
    config.attrs = attrs_json.dump();

    const json dims_json = _get_dims_filter_list_json(tiledb_schema);
    config.dims = dims_json.dump();

    return config;
}

/**
 * Describe the filter list of every attribute in a schema as JSON.
 *
 * @param tiledb_schema Schema whose attributes are inspected
 * @return JSON value with one entry per attribute: the key is the attribute
 * name and the value is what _get_filter_list_json returns for that
 * attribute's filter list
 */
json ArrowAdapter::_get_attrs_filter_list_json(ArraySchema tiledb_schema) {
    json attrs_filter_list_as_json;
    // Bind each name/attribute entry by const reference so iterating does
    // not copy the name string and the attribute object.
    for (const auto& [attr_name, attr] : tiledb_schema.attributes()) {
        attrs_filter_list_as_json.emplace(
            attr_name,
            ArrowAdapter::_get_filter_list_json(attr.filter_list()));
    }
    return attrs_filter_list_as_json;
}

/**
 * Describe the filter list of every dimension in a schema as JSON.
 *
 * @param tiledb_schema Schema whose domain dimensions are inspected
 * @return JSON value with one entry per dimension: the key is the dimension
 * name and the value is what _get_filter_list_json returns for that
 * dimension's filter list
 */
json ArrowAdapter::_get_dims_filter_list_json(ArraySchema tiledb_schema) {
    json dims_filter_list_as_json;
    // Bind each dimension by const reference so iterating does not copy it.
    for (const auto& dim : tiledb_schema.domain().dimensions()) {
        dims_filter_list_as_json.emplace(
            dim.name(), ArrowAdapter::_get_filter_list_json(dim.filter_list()));
    }
    return dims_filter_list_as_json;
}

/**
 * Describe a TileDB filter list as JSON.
 *
 * One JSON object is appended per filter, in filter-list order. Each object
 * has a "name" entry holding Filter::to_str of the filter type, followed by
 * one entry per option that is read for that filter type (for example
 * "COMPRESSION_LEVEL" for the compression filters). Filter types without
 * options contribute only their "name".
 *
 * @param filter_list Filter list to describe
 * @return JSON value holding the per-filter objects; nothing is appended
 * when the list contains no filters
 */
json ArrowAdapter::_get_filter_list_json(FilterList filter_list) {
    json filter_list_as_json = {};

    const uint32_t filter_count = filter_list.nfilters();
    for (uint32_t i = 0; i < filter_count; ++i) {
        json filter_as_json = {};

        auto filter = filter_list.filter(i);
        // Read the type once; it selects both the name and the options.
        const auto filter_type = filter.filter_type();
        filter_as_json.emplace("name", Filter::to_str(filter_type));

        switch (filter_type) {
            // Filters that report only a compression level.
            case TILEDB_FILTER_GZIP:
            case TILEDB_FILTER_ZSTD:
            case TILEDB_FILTER_LZ4:
            case TILEDB_FILTER_BZIP2:
            case TILEDB_FILTER_RLE:
            case TILEDB_FILTER_DICTIONARY:
                filter_as_json.emplace(
                    "COMPRESSION_LEVEL",
                    filter.get_option<int32_t>(TILEDB_COMPRESSION_LEVEL));
                break;

            // Delta filters additionally report the reinterpret datatype.
            case TILEDB_FILTER_DELTA:
            case TILEDB_FILTER_DOUBLE_DELTA:
                filter_as_json.emplace(
                    "COMPRESSION_LEVEL",
                    filter.get_option<int32_t>(TILEDB_COMPRESSION_LEVEL));
                filter_as_json.emplace(
                    "COMPRESSION_REINTERPRET_DATATYPE",
                    filter.get_option<uint8_t>(
                        TILEDB_COMPRESSION_REINTERPRET_DATATYPE));
                break;

            case TILEDB_FILTER_BIT_WIDTH_REDUCTION:
                filter_as_json.emplace(
                    "BIT_WIDTH_MAX_WINDOW",
                    filter.get_option<uint32_t>(TILEDB_BIT_WIDTH_MAX_WINDOW));
                break;

            case TILEDB_FILTER_POSITIVE_DELTA:
                filter_as_json.emplace(
                    "POSITIVE_DELTA_MAX_WINDOW",
                    filter.get_option<uint32_t>(
                        TILEDB_POSITIVE_DELTA_MAX_WINDOW));
                break;

            case TILEDB_FILTER_SCALE_FLOAT:
                filter_as_json.emplace(
                    "SCALE_FLOAT_FACTOR",
                    filter.get_option<double>(TILEDB_SCALE_FLOAT_FACTOR));
                filter_as_json.emplace(
                    "SCALE_FLOAT_OFFSET",
                    filter.get_option<double>(TILEDB_SCALE_FLOAT_OFFSET));
                filter_as_json.emplace(
                    "SCALE_FLOAT_BYTEWIDTH",
                    filter.get_option<uint64_t>(TILEDB_SCALE_FLOAT_BYTEWIDTH));
                break;

            case TILEDB_FILTER_WEBP:
                filter_as_json.emplace(
                    "WEBP_INPUT_FORMAT",
                    filter.get_option<uint8_t>(TILEDB_WEBP_INPUT_FORMAT));
                filter_as_json.emplace(
                    "WEBP_QUALITY",
                    filter.get_option<float>(TILEDB_WEBP_QUALITY));
                filter_as_json.emplace(
                    "WEBP_LOSSLESS",
                    filter.get_option<uint8_t>(TILEDB_WEBP_LOSSLESS));
                break;

            case TILEDB_FILTER_CHECKSUM_MD5:
            case TILEDB_FILTER_CHECKSUM_SHA256:
            case TILEDB_FILTER_XOR:
            case TILEDB_FILTER_BITSHUFFLE:
            case TILEDB_FILTER_BYTESHUFFLE:
            case TILEDB_FILTER_DEPRECATED:
            case TILEDB_FILTER_NONE:
                // These filters have no options and are left empty
                // intentionally
                break;
        }
        filter_list_as_json.emplace_back(filter_as_json);
    }
    return filter_list_as_json;
}

std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array) {
auto tiledb_schema = tiledb_array->schema();
Expand Down
16 changes: 14 additions & 2 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ class ArrowAdapter {
static std::unique_ptr<ArrowSchema> arrow_schema_from_tiledb_array(
std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array);

/**
 * @brief Get members of the TileDB Schema in the form of a PlatformConfig
 *
 * The returned PlatformConfig carries the schema's capacity,
 * allows_duplicates, tile_order, cell_order, offsets_filters,
 * validity_filters, and the attribute and dimension filters.
 *
 * @param tiledb_schema The TileDB array schema to read the settings from
 * @return PlatformConfig
 */
static PlatformConfig platform_config_from_tiledb_schema(
ArraySchema tiledb_schema);

/**
* @brief Create a TileDB ArraySchema from ArrowSchema
*
Expand Down Expand Up @@ -712,6 +720,12 @@ class ArrowAdapter {

static tiledb_layout_t _get_order(std::string order);

// Returns JSON with one entry per schema attribute, mapping the attribute
// name to the JSON description of that attribute's filter list.
static json _get_attrs_filter_list_json(ArraySchema tiledb_schema);

// Returns JSON with one entry per schema dimension, mapping the dimension
// name to the JSON description of that dimension's filter list.
static json _get_dims_filter_list_json(ArraySchema tiledb_schema);

// Returns JSON describing each filter in the list: its name plus the
// options read for that filter type.
static json _get_filter_list_json(FilterList filter_list);

// Throws if the array and the schema don't have the same
// recursive child-counts.
static void _check_shapes(
Expand All @@ -725,8 +739,6 @@ class ArrowAdapter {
const ArrowTable& arrow_table,
int64_t column_index,
int64_t expected_n_buffers);

}; // class ArrowAdapter

}; // namespace tiledbsoma
#endif
68 changes: 48 additions & 20 deletions libtiledbsoma/test/unit_soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ TEST_CASE_METHOD(
"mem://unit-test-dataframe-platform-config");

PlatformConfig platform_config;
platform_config.cell_order = "hilbert";
platform_config.dataframe_dim_zstd_level = 6;
platform_config.offsets_filters = R"([)" + filter.first + R"(])";
platform_config.validity_filters = R"([)" + filter.first + R"(])";
Expand Down Expand Up @@ -366,6 +367,28 @@ TEST_CASE_METHOD(
.filter(0)
.filter_type() == filter.second);
}

auto config_options = sdf->config_options_from_schema();
REQUIRE(config_options.capacity == 100000);
REQUIRE(config_options.allows_duplicates == false);
REQUIRE(config_options.tile_order == "row-major");
REQUIRE(config_options.cell_order == "hilbert");

REQUIRE(
json::parse(config_options.offsets_filters)[0].at("name") ==
Filter::to_str(filter.second));
REQUIRE(
json::parse(config_options.validity_filters)[0].at("name") ==
Filter::to_str(filter.second));
if (filter.second != TILEDB_FILTER_WEBP) {
REQUIRE(
json::parse(config_options.attrs)["a0"][0].at("name") ==
Filter::to_str(filter.second));
}
REQUIRE(
json::parse(config_options.dims)["soma_joinid"][0].at("name") ==
Filter::to_str(TILEDB_FILTER_ZSTD));

sdf->close();
}
}
Expand All @@ -376,9 +399,9 @@ TEST_CASE_METHOD(
"SOMADataFrame: metadata",
"[SOMADataFrame]") {
auto use_current_domain = GENERATE(false, true);
// TODO this could be formatted with fmt::format which is part of internal
// header spd/log/fmt/fmt.h and should not be used. In C++20, this can be
// replaced with std::format.
// TODO this could be formatted with fmt::format which is part of
// internal header spd/log/fmt/fmt.h and should not be used. In
// C++20, this can be replaced with std::format.
std::ostringstream section;
section << "- use_current_domain=" << use_current_domain;
SECTION(section.str()) {
Expand Down Expand Up @@ -430,8 +453,8 @@ TEST_CASE_METHOD(
REQUIRE(
*((const int32_t*)std::get<MetadataInfo::value>(*mdval)) == 100);

// Delete and have it reflected when reading metadata while in write
// mode
// Delete and have it reflected when reading metadata while in
// write mode
sdf->delete_metadata("md");
mdval = sdf->get_metadata("md");
REQUIRE(!mdval.has_value());
Expand Down Expand Up @@ -600,8 +623,8 @@ TEST_CASE_METHOD(
auto new_shape = int64_t{SOMA_JOINID_RESIZE_DIM_MAX + 1};

if (!use_current_domain) {
// Domain is already set. The domain (not current domain but domain)
// is immutable. All we can do is check for:
// Domain is already set. The domain (not current domain but
// domain) is immutable. All we can do is check for:
// * throw on write beyond domain
// * throw on an attempt to resize.
REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX));
Expand Down Expand Up @@ -703,7 +726,8 @@ TEST_CASE_METHOD(
REQUIRE(check.first == false);
REQUIRE(
check.second ==
"testing: new soma_joinid shape 1 < existing shape 200");
"testing: new soma_joinid shape 1 < existing shape "
"200");
check = sdf->can_resize_soma_joinid_shape(
SOMA_JOINID_RESIZE_DIM_MAX + 1, "testing");
REQUIRE(check.first == true);
Expand Down Expand Up @@ -831,8 +855,8 @@ TEST_CASE_METHOD(
auto new_shape = int64_t{SOMA_JOINID_RESIZE_DIM_MAX + 1};

if (!use_current_domain) {
// Domain is already set. The domain (not current domain but domain)
// is immutable. All we can do is check for:
// Domain is already set. The domain (not current domain but
// domain) is immutable. All we can do is check for:
// * throw on write beyond domain
// * throw on an attempt to resize.
REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX));
Expand Down Expand Up @@ -934,7 +958,8 @@ TEST_CASE_METHOD(
REQUIRE(check.first == false);
REQUIRE(
check.second ==
"testing: new soma_joinid shape 1 < existing shape 200");
"testing: new soma_joinid shape 1 < existing shape "
"200");
check = sdf->can_resize_soma_joinid_shape(
SOMA_JOINID_RESIZE_DIM_MAX + 1, "testing");
REQUIRE(check.first == true);
Expand Down Expand Up @@ -1080,8 +1105,8 @@ TEST_CASE_METHOD(
auto new_shape = int64_t{SOMA_JOINID_RESIZE_DIM_MAX + 1};

if (!use_current_domain) {
// Domain is already set. The domain (not current domain but domain)
// is immutable. All we can do is check for:
// Domain is already set. The domain (not current domain but
// domain) is immutable. All we can do is check for:
// * throw on write beyond domain
// * throw on an attempt to resize.
REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX));
Expand Down Expand Up @@ -1181,7 +1206,8 @@ TEST_CASE_METHOD(
REQUIRE(check.first == false);
REQUIRE(
check.second ==
"testing: new soma_joinid shape 1 < existing shape 100");
"testing: new soma_joinid shape 1 < existing shape "
"100");
check = sdf->can_resize_soma_joinid_shape(
SOMA_JOINID_RESIZE_DIM_MAX + 1, "testing");
REQUIRE(check.first == true);
Expand Down Expand Up @@ -1296,8 +1322,8 @@ TEST_CASE_METHOD(

REQUIRE(sdf->nnz() == 2);
write_sjid_u32_str_data_from(8);
// soma_joinid is not a dim here and so the second write is an overwrite
// of the first here
// soma_joinid is not a dim here and so the second write is an
// overwrite of the first here
REQUIRE(sdf->nnz() == 2);

// Check shape after write
Expand All @@ -1310,9 +1336,10 @@ TEST_CASE_METHOD(
auto new_shape = int64_t{SOMA_JOINID_RESIZE_DIM_MAX + 1};

if (!use_current_domain) {
// Domain is already set. The domain (not current domain but domain)
// is immutable. All we can do is check for:
// * throw on write beyond domain -- except here, soma_joinid is not
// Domain is already set. The domain (not current domain but
// domain) is immutable. All we can do is check for:
// * throw on write beyond domain -- except here,
// soma_joinid is not
// a dim, so no throw
// * throw on an attempt to resize.

Expand All @@ -1336,7 +1363,8 @@ TEST_CASE_METHOD(
sdf->resize_soma_joinid_shape(new_shape, "testing");
sdf->close();

// Check shape after resize -- noting soma_joinid is not a dim here
// Check shape after resize -- noting soma_joinid is not a
// dim here
sdf = open(OpenMode::read);
actual = sdf->maybe_soma_joinid_shape();
REQUIRE(!actual.has_value());
Expand Down

0 comments on commit 04d40bf

Please sign in to comment.