Use true ASCII attributes in dataframes (#359)
* remove util_arrow.concat_tables

* read/write Table

* read/write Table

* read/write Table

* Use true ASCII attributes in dataframes

* iterate on string vs large_string

* Fix ascii/binary issues in TileDB->Arrow and TileDB->Pandas->Arrow test cases

* more

* fix unit tests

Co-authored-by: John Kerl <[email protected]>
johnkerl authored Oct 11, 2022
1 parent 36731f2 commit b589716
Showing 11 changed files with 130 additions and 122 deletions.
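
In short: string-valued dataframe columns are now stored as true TileDB ASCII attributes (so they remain filterable via TileDB-Py's QueryCondition), and reads hand Arrow tables back unmodified instead of routing them through a decode-on-read helper. A minimal sketch of the write-side idea, assuming only that tiledb and pandas are installed (the URI and data here are hypothetical):

    import pandas as pd
    import tiledb

    df = pd.DataFrame({"foo": [1, 2, 3], "baz": ["apple", "ball", "cat"]})

    # Force the str column to TileDB's ASCII type; UTF-8 attributes are
    # not yet queryable via QueryCondition, plain ASCII ones are.
    tiledb.from_pandas(
        uri="/tmp/example_sdf",  # hypothetical path
        dataframe=df,
        column_types={"baz": "ascii"},
    )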
49 changes: 8 additions & 41 deletions apis/python/src/tiledbsoma/soma_dataframe.py
@@ -5,7 +5,7 @@
import pyarrow as pa
import tiledb

-from . import util, util_arrow, util_pandas, util_tiledb
+from . import util, util_arrow, util_tiledb
from .logging import log_io
from .soma_collection import SOMACollectionBase
from .tiledb_array import TileDBArray
@@ -205,11 +205,7 @@ def read(
iterator = query.df[ids]

for table in iterator:
-# XXX COMMENT MORE
-# This is the 'decode on read' part of our logic; in dim_select we have the
-# 'encode on write' part.
-# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+yield table

def read_all(
self,
@@ -275,7 +271,7 @@ def write(self, values: pa.Table) -> None:
if name != ROWID:
attr_cols_map[name] = np.asarray(
values.column(name).to_pandas(
-types_mapper=util_arrow.tiledb_type_from_arrow_type,
+types_mapper=util_arrow.tiledb_type_from_arrow_type_for_write,
)
)

@@ -343,11 +339,6 @@ def read_as_pandas(

for df in iterator:

-# This is the 'decode on read' part of our logic; in dim_select we have the 'encode on
-# write' part.
-# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-df = util_pandas.ascii_to_unicode_pandas_readback(df)
-
if id_column_name is not None:
df.reset_index(inplace=True)
df.set_index(id_column_name, inplace=True)
@@ -428,39 +419,15 @@ def write_from_pandas(

dataframe.set_index(ROWID, inplace=True)

-# ISSUE:
-#
-# TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
-# QueryCondition API. While this needs to be addressed -- global collaborators will want to
-# write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
-# to query, we need to store these as ASCII.
-#
-# This is (besides collation) a storage-level issue, not a presentation-level issue: at write
-# time, "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time, since SOMA is an API,
-# we utf8-decode those strings when a query is done and give the user back "α,β,γ".
-#
-# CONTEXT:
-# https://github.com/single-cell-data/TileDB-SOMA/issues/99
-# https://github.com/single-cell-data/TileDB-SOMA/pull/101
-# https://github.com/single-cell-data/TileDB-SOMA/issues/106
-# https://github.com/single-cell-data/TileDB-SOMA/pull/117
-#
-# IMPLEMENTATION:
-# Python types -- float, string, what have you -- appear as dtype('O'), which is not useful.
-# Also, ``tiledb.from_pandas`` has ``column_types``, but that _forces_ things to a particular
-# type even when they shouldn't be.
-#
-# Instead, we use ``dataframe.convert_dtypes`` to get a little jump on what ``tiledb.from_pandas``
-# is going to be doing anyway, namely, type-inferring to see what is going to be a string.
-#
-# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
+# Force ASCII storage if string, in order to make obs/var columns queryable.
+# TODO: when UTF-8 attributes are fully supported we can remove this.
column_types = {}
for column_name in dataframe.keys():
dfc = dataframe[column_name]
if len(dfc) > 0 and type(dfc[0]) == str:
-# Force ASCII storage if string, in order to make obs/var columns queryable.
-column_types[column_name] = np.dtype("S")
+column_types[column_name] = "ascii"
+if len(dfc) > 0 and type(dfc[0]) == bytes:
+column_types[column_name] = "bytes"

tiledb.from_pandas(
uri=self.uri,
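
Distilling the write_from_pandas hunk above: the long rationale comment is gone, and the loop now maps Python str columns to "ascii" and bytes columns to "bytes" before handing off to tiledb.from_pandas. A standalone sketch of that inference (the helper name is hypothetical; note that on disk the ASCII-typed attribute still holds UTF-8 bytes, e.g. "α,β,γ".encode() == b"\xce\xb1,\xce\xb2,\xce\xb3"):

    import pandas as pd

    def infer_tiledb_column_types(dataframe: pd.DataFrame) -> dict:
        # Mirror the loop above: sniff the first element of each column.
        column_types = {}
        for column_name in dataframe.keys():
            dfc = dataframe[column_name]
            if len(dfc) > 0 and type(dfc.iloc[0]) == str:
                # ASCII storage keeps obs/var columns queryable.
                column_types[column_name] = "ascii"
            if len(dfc) > 0 and type(dfc.iloc[0]) == bytes:
                column_types[column_name] = "bytes"
        return column_types

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"], "c": [b"p", b"q"]})
    assert infer_tiledb_column_types(df) == {"b": "ascii", "c": "bytes"}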
6 changes: 1 addition & 5 deletions apis/python/src/tiledbsoma/soma_indexed_dataframe.py
@@ -250,11 +250,7 @@ def read(
iterator = query.df[ids]

for table in iterator:
-# XXX COMMENT MORE
-# This is the 'decode on read' part of our logic; in dim_select we have the
-# 'encode on write' part.
-# Context: https://github.com/single-cell-data/TileDB-SOMA/issues/99.
-yield util_arrow.ascii_to_unicode_pyarrow_readback(table)
+yield table

def read_all(
self,
68 changes: 34 additions & 34 deletions apis/python/src/tiledbsoma/util_arrow.py
@@ -9,24 +9,28 @@
of representing full type semantics, and correctly performing a
round trip conversion (eg, T == to_arrow(to_tiledb(T)))
-Most primitive types are simple - eg, uint8. Of particular challenge
+Most primitive types are simple -- e.g., uint8. Of particular challenge
are datetime/timestamps as TileDB has no distinction between a "datetime" and
a "timedelta". The best Arrow match is TimestampType, as long as that
TimestampType instance does NOT have a timezone set.
Because of our round-trip requirement, all other Arrow temporal types
are unsupported (even though they are just int64 under the covers).
+We auto-promote Arrow's string and binary to large_string and large_binary,
+respectively, as this is what TileDB stores -- a sequence of bytes preceded
+by a 64-bit (not 32-bit) length int.
"""
ARROW_TO_TDB = {
# Dict of types unsupported by to_pandas_dtype, which require overrides.
# If the value is an instance of Exception, it will be raised.
#
# IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
#
-pa.string(): np.dtype("S"),  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
-pa.binary(): np.dtype("S"),
+pa.string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+pa.large_string(): "ascii",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+pa.binary(): "bytes",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
+pa.large_binary(): "bytes",  # XXX TODO: temporary work-around until UTF8 support is native. GH #338.
pa.timestamp("s"): "datetime64[s]",
pa.timestamp("ms"): "datetime64[ms]",
pa.timestamp("us"): "datetime64[us]",
@@ -39,7 +43,21 @@
}
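
A quick spot-check of the table's intent, as a sketch assuming ARROW_TO_TDB can be imported from this module: both 32-bit- and 64-bit-offset Arrow string types now map to the TileDB-Py type string "ascii", both binary types to "bytes", and timestamps still map to numpy datetime64 units.

    import numpy as np
    import pyarrow as pa

    from tiledbsoma.util_arrow import ARROW_TO_TDB  # assumed import path

    assert ARROW_TO_TDB[pa.string()] == "ascii"
    assert ARROW_TO_TDB[pa.large_string()] == "ascii"
    assert ARROW_TO_TDB[pa.binary()] == "bytes"
    assert np.dtype(ARROW_TO_TDB[pa.timestamp("ms")]) == np.dtype("datetime64[ms]")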


-def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
+def tiledb_type_from_arrow_type_for_write(t: pa.DataType) -> Union[type, np.dtype, str]:
+"""
+Same as ``tiledb_type_from_arrow_type`` except that this is used for writing to a TileDB array.
+The syntax of TileDB-Py is such that when we want to create a schema with an ASCII column,
+we use the string ``"ascii"`` in place of a dtype. But when we want to write data, we need to
+use a dtype of ``np.str``, which is now deprecated in favor of simply ``str``.
+"""
+retval = tiledb_type_from_arrow_type(t)
+if retval == "ascii":
+return str
+else:
+return retval


+def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype, str]:
"""
Given an Arrow type, return the corresponding TileDB type as a Numpy dtype.
Building block for Arrow-to-TileDB schema translation.
@@ -61,6 +79,10 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
arrow_type = ARROW_TO_TDB[t]
if isinstance(arrow_type, Exception):
raise arrow_type
+if arrow_type == "ascii":
+return arrow_type
+if arrow_type == "bytes":
+return arrow_type  # np.int8()
return np.dtype(arrow_type)
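
The split between the two functions is the subtle part: schema creation wants the TileDB-Py type string "ascii", while writing wants something numpy can treat as a dtype, hence plain str. A sketch of the intended behavior, assuming both functions are imported from this module:

    import numpy as np
    import pyarrow as pa

    from tiledbsoma.util_arrow import (  # assumed import path
        tiledb_type_from_arrow_type,
        tiledb_type_from_arrow_type_for_write,
    )

    assert tiledb_type_from_arrow_type(pa.large_string()) == "ascii"
    assert tiledb_type_from_arrow_type_for_write(pa.large_string()) is str
    # Non-string types are unaffected by the write-side special case:
    assert tiledb_type_from_arrow_type(pa.int32()) == np.dtype("int32")
    assert tiledb_type_from_arrow_type_for_write(pa.int32()) == np.dtype("int32")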

if not pa.types.is_primitive(t):
Expand All @@ -83,15 +105,16 @@ def tiledb_type_from_arrow_type(t: pa.DataType) -> Union[type, np.dtype]:
raise TypeError("Unsupported Arrow type") from exc


-def get_arrow_type_from_tiledb_dtype(tiledb_dtype: np.dtype) -> pa.DataType:
+def get_arrow_type_from_tiledb_dtype(tiledb_dtype: Union[str, np.dtype]) -> pa.DataType:
"""
TODO: COMMENT
"""
-if tiledb_dtype.name == "bytes":
+if tiledb_dtype == "bytes":
+return pa.large_binary()
+if isinstance(tiledb_dtype, str) and tiledb_dtype == "ascii":
# XXX TODO: temporary work-around until UTF8 support is native. GH #338.
-return pa.string()
-else:
-return pa.from_numpy_dtype(tiledb_dtype)
+return pa.large_string()
+return pa.from_numpy_dtype(tiledb_dtype)
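
Reading the two directions together: pa.string() goes in and pa.large_string() comes back out, a deliberate promotion documented in the module docstring above. A round-trip sketch, again assuming the converters are imported from this module:

    import pyarrow as pa

    from tiledbsoma.util_arrow import (  # assumed import path
        get_arrow_type_from_tiledb_dtype,
        tiledb_type_from_arrow_type,
    )

    # string -> "ascii" -> large_string: promoted, not identical
    assert get_arrow_type_from_tiledb_dtype(tiledb_type_from_arrow_type(pa.string())) == pa.large_string()
    # Numeric dtypes round-trip exactly
    assert get_arrow_type_from_tiledb_dtype(tiledb_type_from_arrow_type(pa.float64())) == pa.float64()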


def get_arrow_schema_from_tiledb_uri(
@@ -119,26 +142,3 @@ def get_arrow_schema_from_tiledb_uri(
arrow_schema_dict[name] = get_arrow_type_from_tiledb_dtype(attr.dtype)

return pa.schema(arrow_schema_dict)
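
For context on the return statement above: pa.schema accepts a mapping of field names to Arrow types, so the per-attribute translation reduces to building a dict. A tiny sketch with hypothetical attribute names:

    import pyarrow as pa

    arrow_schema_dict = {
        "soma_rowid": pa.uint64(),
        "baz": pa.large_string(),  # what a TileDB "ascii" attribute maps to
    }
    schema = pa.schema(arrow_schema_dict)
    assert schema.field("baz").type == pa.large_string()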


-def ascii_to_unicode_pyarrow_readback(table: pa.Table) -> pa.Table:
-"""
-Implements the 'decode on read' part of our ASCII/Unicode logic
-"""
-# TODO: COMMENT/LINK HEAVILY
-names = [ofield.name for ofield in table.schema]
-new_fields = []
-for name in names:
-old_field = table[name]
-# Preferred syntax:
-# if len(old_field) > 0 and pa.types.is_large_binary(old_field[0]):
-# but:
-# AttributeError: 'pyarrow.lib.UInt64Scalar' object has no attribute 'id'
-if len(old_field) > 0 and isinstance(old_field[0], pa.LargeBinaryScalar):
-nfield = pa.array(
-[element.as_py().decode("utf-8") for element in old_field]
-)
-new_fields.append(nfield)
-else:
-new_fields.append(old_field)
-return pa.Table.from_arrays(new_fields, names=names)
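
A side note on the helper deleted above: its comments record a pyarrow quirk, namely that pa.types.is_large_binary inspects a DataType, not a scalar, which is why the "preferred syntax" raised AttributeError. Had the helper survived, the type-based check would look like this sketch:

    import pyarrow as pa

    table = pa.table({"c": pa.array([b"ab", b"cd"], type=pa.large_binary())})

    # Check the column's type rather than an element scalar:
    assert pa.types.is_large_binary(table.schema.field("c").type)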
13 changes: 0 additions & 13 deletions apis/python/src/tiledbsoma/util_pandas.py

This file was deleted.

4 changes: 2 additions & 2 deletions apis/python/tests/test_soma_collection.py
@@ -15,7 +15,7 @@ def create_and_populate_dataframe(dataframe: soma.SOMADataFrame) -> None:
[
("foo", pa.int32()),
("bar", pa.float64()),
("baz", pa.string()),
("baz", pa.large_string()),
]
)

@@ -108,7 +108,7 @@ def soma_object(request, tmp_path):

elif class_name == "SOMADataFrame":
so = soma.SOMADataFrame(uri=uri)
-so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
+so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))

elif class_name == "SOMAIndexedDataFrame":
so = soma.SOMAIndexedDataFrame(uri=uri)
38 changes: 29 additions & 9 deletions apis/python/tests/test_soma_dataframe.py
@@ -13,7 +13,7 @@ def test_soma_dataframe_non_indexed(tmp_path):
[
("foo", pa.int32()),
("bar", pa.float64()),
("baz", pa.string()),
("baz", pa.large_string()),
]
)
sdf.create(schema=asch)
@@ -120,7 +120,7 @@ def simple_soma_data_frame(tmp_path):
("soma_rowid", pa.uint64()),
("A", pa.int64()),
("B", pa.float64()),
("C", pa.string()),
("C", pa.large_string()),
]
)
sdf = t.SOMADataFrame(uri=tmp_path.as_posix())
@@ -174,37 +174,57 @@ def test_SOMADataFrame_read_column_names(simple_soma_data_frame, ids, col_names)
schema, sdf, n_data = simple_soma_data_frame
assert sdf.exists()

-def _check_tbl(tbl, col_names, ids):
+def _check_tbl(tbl, col_names, ids, *, demote):
assert tbl.num_columns == (
len(schema.names) if col_names is None else len(col_names)
)
assert tbl.num_rows == (n_data if ids is None else len(ids))
-assert tbl.schema == pa.schema(
-[
-schema.field(f)
-for f in (col_names if col_names is not None else schema.names)
-]
-)
+
+if demote:
+assert tbl.schema == pa.schema(
+[
+pa.field(schema.field(f).name, pa.string())
+if schema.field(f).type == pa.large_string()
+else schema.field(f)
+for f in (col_names if col_names is not None else schema.names)
+]
+)
+else:
+assert tbl.schema == pa.schema(
+[
+schema.field(f)
+for f in (col_names if col_names is not None else schema.names)
+]
+)

+# TileDB ASCII -> Arrow large_string
_check_tbl(
sdf.read_all(ids=ids, column_names=col_names),
col_names,
ids,
+demote=False,
)

_check_tbl(
sdf.read_all(column_names=col_names),
col_names,
None,
+demote=False,
)

+# TileDB ASCII -> Pandas string -> Arrow string (not large_string)
_check_tbl(
pa.Table.from_pandas(
pd.concat(sdf.read_as_pandas(ids=ids, column_names=col_names))
),
col_names,
ids,
+demote=True,
)

_check_tbl(
pa.Table.from_pandas(sdf.read_as_pandas_all(column_names=col_names)),
col_names,
None,
+demote=True,
)
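
The demote flag in the test above encodes a real asymmetry: reading straight to Arrow preserves large_string, but the pandas hop loses the 64-bit-offset flavor because pa.Table.from_pandas infers plain string for object-dtype string columns. A standalone sketch of that inference:

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"C": ["xyzzy", "plugh"]})
    tbl = pa.Table.from_pandas(df)

    # pandas object-dtype strings infer as 32-bit-offset Arrow strings,
    # hence demote=True when comparing against a large_string schema.
    assert tbl.schema.field("C").type == pa.string()
    assert tbl.schema.field("C").type != pa.large_string()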
4 changes: 2 additions & 2 deletions apis/python/tests/test_soma_experiment_basic.py
@@ -14,7 +14,7 @@ def create_and_populate_obs(obs: soma.SOMADataFrame) -> soma.SOMADataFrame:
[
("foo", pa.int32()),
("bar", pa.float64()),
("baz", pa.string()),
("baz", pa.large_string()),
]
)

@@ -37,7 +37,7 @@ def create_and_populate_var(var: soma.SOMADataFrame) -> soma.SOMADataFrame:

var_arrow_schema = pa.schema(
[
("quux", pa.string()),
("quux", pa.large_string()),
("xyzzy", pa.float64()),
]
)
10 changes: 9 additions & 1 deletion apis/python/tests/test_soma_indexed_dataframe.py
@@ -21,6 +21,14 @@ def _schema():
def test_soma_indexed_dataframe(tmp_path, arrow_schema):
sdf = t.SOMAIndexedDataFrame(uri=tmp_path.as_posix())

+asch = pa.schema(
+[
+("foo", pa.int32()),
+("bar", pa.float64()),
+("baz", pa.large_string()),
+]
+)
+
# Create
asch = arrow_schema()
sdf.create(schema=asch, index_column_names=["foo"])
@@ -72,7 +80,7 @@ def simple_soma_indexed_data_frame(tmp_path):
("index", pa.uint64()),
("A", pa.int64()),
("B", pa.float64()),
("C", pa.string()),
("C", pa.large_string()),
]
)
index_column_names = ["index"]
2 changes: 1 addition & 1 deletion apis/python/tests/test_soma_metadata.py
@@ -33,7 +33,7 @@ def soma_object(request, tmp_path):

elif class_name == "SOMADataFrame":
so = soma.SOMADataFrame(uri=uri)
-so.create(pa.schema([("A", pa.int32()), ("B", pa.string())]))
+so.create(pa.schema([("A", pa.int32()), ("B", pa.large_string())]))

elif class_name == "SOMAIndexedDataFrame":
so = soma.SOMAIndexedDataFrame(uri=uri)