Skip to content

Commit

Permalink
fix failing unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 16, 2022
1 parent fa0d862 commit a75f33e
Show file tree
Hide file tree
Showing 9 changed files with 39 additions and 377 deletions.
58 changes: 6 additions & 52 deletions apis/python/src/tiledbsoma/annotation_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ def __init__(
"""
assert name in ["obs", "var"]
super().__init__(uri=uri, name=name, parent=parent)
s0 = self.timing_start("__init__", "total")
self.dim_name = name + "_id"
self.timing_end(s0)

# ----------------------------------------------------------------
def shape(self) -> Tuple[int, int]:
Expand All @@ -42,10 +40,7 @@ def shape(self) -> Tuple[int, int]:
The row-count is the number of obs_ids (for ``obs``) or the number of var_ids (for ``var``).
The column-count is the number of columns/attributes in the dataframe.
"""
s0 = self.timing_start("shape", "total")
s1 = self.timing_start("shape", "open")
with self._open("r") as A:
self.timing_end(s1)
self.dim_name = A.domain.dim(0).name
# These TileDB arrays are string-dimensioned sparse arrays so there is no '.shape'.
# Instead we compute it ourselves. See also:
Expand All @@ -71,32 +66,22 @@ def shape(self) -> Tuple[int, int]:
].tolist()
)
num_cols = A.schema.nattr
self.timing_end(s0)
return (num_rows, num_cols)

# ----------------------------------------------------------------
def ids(self) -> Sequence[str]:
"""
Returns the ``obs_ids`` in the matrix (for ``obs``) or the ``var_ids`` (for ``var``).
"""
s0 = self.timing_start("ids", "total")
s1 = self.timing_start("ids", "open")
with self._open("r") as A:
self.timing_end(s1)
self.dim_name = A.domain.dim(0).name

s2 = self.timing_start("ids", "tiledb_query")
# TileDB string dims are ASCII, not UTF-8. Decode them so they read back as
# `"AKR1C3"` rather than `b"AKR1C3"`. Update: as of
# https://github.com/TileDB-Inc/TileDB-Py/pull/1304 these dims read back OK.
retval = A.query(attrs=[], dims=[self.dim_name])[:][self.dim_name].tolist()
self.timing_end(s2)

s3 = self.timing_start("ids", "decode")
retval = [e.decode() for e in retval]
self.timing_end(s3)

self.timing_end(s0)

if len(retval) > 0 and isinstance(retval[0], bytes):
return [e.decode() for e in retval]
Expand Down Expand Up @@ -124,10 +109,7 @@ def keys(self) -> Sequence[str]:
Returns the column names for the ``obs`` or ``var`` dataframe. For obs and var, ``.keys()`` is a
keystroke-saver for the more general array-schema accessor ``attr_names``.
"""
s0 = self.timing_start("keys", "total")
retval = self.attr_names()
self.timing_end(s0)
return retval
return self.attr_names()

# ----------------------------------------------------------------
def keyset(self) -> Set[str]:
Expand All @@ -149,18 +131,13 @@ def dim_select(
``var``). If ``ids`` is ``None``, the entire dataframe is returned. Similarly, if ``attrs`` are
provided, they're used for the query; else, all attributes are returned.
"""
s0 = self.timing_start("dim_select", "total")
s1 = self.timing_start("dim_select", "open")
with self._open("r") as A:
self.timing_end(s1)
self.dim_name = A.domain.dim(0).name
s2 = self.timing_start("dim_select", "tiledb_query")
query = A.query(return_arrow=return_arrow, attrs=attrs)
if ids is None:
df = query.df[:]
else:
df = query.df[ids]
self.timing_end(s2)

# We do not need this:
# df.set_index(self.dim_name, inplace=True)
Expand All @@ -171,24 +148,17 @@ def dim_select(
# so the set_index is already done for us.
#
# However, if the data was written some other way (e.g. by tiledbsoma-r), then we do.
s3 = self.timing_start("dim_select", "set_index")
if not return_arrow:
if isinstance(df.index, pd.RangeIndex) and self.dim_name in df.columns:
df.set_index(self.dim_name, inplace=True)
self.timing_end(s3)

# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
# This is the 'decode on read' part of our logic; in from_dataframe we have the 'encode on write' part.
# Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99.
s4 = self.timing_start("dim_select", "ascii_to_unicode")
if return_arrow:
retval = self._ascii_to_unicode_arrow_readback(df)
return self._ascii_to_unicode_arrow_readback(df)
else:
retval = self._ascii_to_unicode_pandas_readback(df)
self.timing_end(s4)

self.timing_end(s0)
return retval
return self._ascii_to_unicode_pandas_readback(df)

# ----------------------------------------------------------------
def df(
Expand Down Expand Up @@ -223,12 +193,9 @@ def query(
if query_string is None:
return self.dim_select(ids, attrs=attrs, return_arrow=return_arrow)

s0 = self.timing_start("query", "total")
retval = self._query_aux(
return self._query_aux(
query_string=query_string, ids=ids, attrs=attrs, return_arrow=return_arrow
)
self.timing_end(s0)
return retval

def _query_aux(
self,
Expand All @@ -243,11 +210,8 @@ def _query_aux(
elapsed-time stats in a call to this helper.
"""

s1 = self.timing_start("query", "open")
with self._open() as A:
self.timing_end(s1)
self.dim_name = A.domain.dim(0).name
s2 = self.timing_start("query", "tiledb_query")
qc = tiledb.QueryCondition(query_string)
if attrs is None:
slice_query = A.query(attr_cond=qc, return_arrow=return_arrow)
Expand All @@ -263,7 +227,6 @@ def _query_aux(
df = slice_query.df[:]
else:
df = slice_query.df[ids]
self.timing_end(s2)

# We do not need this:
# df.set_index(self.dim_name, inplace=True)
Expand All @@ -274,22 +237,16 @@ def _query_aux(
# so the set_index is already done for us.
#
# However, if the data was written some other way (e.g. by tiledbsoma-r), then we do.
s3 = self.timing_start("query", "set_index")
if not return_arrow:
if isinstance(df.index, pd.RangeIndex) and self.dim_name in df.columns:
df.set_index(self.dim_name, inplace=True)
# This is the 'decode on read' part of our logic; in from_dataframe we have the 'encode on write' part.
# Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99.
self.timing_end(s3)

s4 = self.timing_start("query", "ascii_to_unicode")
if return_arrow:
retval = self._ascii_to_unicode_arrow_readback(df)
return self._ascii_to_unicode_arrow_readback(df)
else:
retval = self._ascii_to_unicode_pandas_readback(df)
self.timing_end(s4)

return retval
return self._ascii_to_unicode_pandas_readback(df)

# ----------------------------------------------------------------
def _ascii_to_unicode_pandas_series_readback(
Expand Down Expand Up @@ -378,7 +335,6 @@ def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None:
:param dataframe: ``anndata.obs``, ``anndata.var``, ``anndata.raw.var``.
:param extent: TileDB ``extent`` parameter for the array schema.
"""
s0 = self.timing_start("from_dataframe", "total")

offsets_filters = tiledb.FilterList(
[tiledb.PositiveDeltaFilter(), tiledb.ZstdFilter(level=-1)]
Expand Down Expand Up @@ -474,5 +430,3 @@ def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None:
f"Wrote {self.nested_name}",
util.format_elapsed(s, f"{self._indent}FINISH WRITING {self.uri}"),
)

self.timing_end(s0)
14 changes: 0 additions & 14 deletions apis/python/src/tiledbsoma/annotation_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def shape(self) -> Tuple[int, int]:
Note: currently implemented via data scan --- will be optimized in an upcoming TileDB Core release.
"""
s0 = self.timing_start("shape", "total")
with self._open() as A:
# These TileDB arrays are string-dimensioned sparse arrays so there is no '.shape'.
# Instead we compute it ourselves. See also:
Expand All @@ -65,7 +64,6 @@ def shape(self) -> Tuple[int, int]:
].tolist()
)
num_cols = A.schema.nattr
self.timing_end(s0)
return (num_rows, num_cols)

# ----------------------------------------------------------------
Expand All @@ -79,7 +77,6 @@ def dim_select(
Selects a slice out of the array with specified ``obs_ids`` (for ``obsm`` elements) or
``var_ids`` (for ``varm`` elements). If ``ids`` is ``None``, the entire array is returned.
"""
s0 = self.timing_start("dim_select", "total")
if ids is None:
with self._open() as A:
query = A.query(return_arrow=return_arrow)
Expand All @@ -90,7 +87,6 @@ def dim_select(
df = query.df[ids]
if not return_arrow:
df.set_index(self.dim_name, inplace=True)
self.timing_end(s0)
return df

# ----------------------------------------------------------------
Expand All @@ -116,8 +112,6 @@ def from_matrix_and_dim_values(
:param matrix: ``anndata.obsm['foo']``, ``anndata.varm['foo']``, or ``anndata.raw.varm['foo']``.
:param dim_values: ``anndata.obs_names``, ``anndata.var_names``, or ``anndata.raw.var_names``.
"""
s0 = self.timing_start("from_matrix_and_dim_values", "total")

s = util.get_start_stamp()
log_io(None, f"{self._indent}START WRITING {self.uri}")

Expand All @@ -132,13 +126,11 @@ def from_matrix_and_dim_values(
f"Wrote {self.nested_name}",
util.format_elapsed(s, f"{self._indent}FINISH WRITING {self.uri}"),
)
self.timing_end(s0)

# ----------------------------------------------------------------
def _numpy_ndarray_or_scipy_sparse_csr_matrix(
self, matrix: Matrix, dim_values: Labels
) -> None:
s0 = self.timing_start("_numpy_ndarray_or_scipy_sparse_csr_matrix", "total")
# We do not have column names for anndata-provenance annotation matrices.
# So, if say we're looking at anndata.obsm['X_pca'], we create column names
# 'X_pca_1', 'X_pca_2', etc.
Expand All @@ -154,11 +146,9 @@ def _numpy_ndarray_or_scipy_sparse_csr_matrix(
df = pd.DataFrame(matrix, columns=attr_names)
with tiledb.open(self.uri, mode="w", ctx=self._ctx) as A:
A[dim_values] = df.to_dict(orient="list")
self.timing_end(s0)

# ----------------------------------------------------------------
def _from_pandas_dataframe(self, df: pd.DataFrame, dim_values: Labels) -> None:
s0 = self.timing_start("_from_pandas_dataframe", "total")
attr_names = df.columns.values.tolist()

# Ingest annotation matrices as 1D/multi-attribute sparse arrays
Expand All @@ -169,7 +159,6 @@ def _from_pandas_dataframe(self, df: pd.DataFrame, dim_values: Labels) -> None:

with tiledb.open(self.uri, mode="w", ctx=self._ctx) as A:
A[dim_values] = df.to_dict(orient="list")
self.timing_end(s0)

# ----------------------------------------------------------------
def _create_empty_array(
Expand All @@ -182,7 +171,6 @@ def _create_empty_array(
repeated once per column. For pandas.DataFrame, there is a dtype per column.
:param attr_names: column names for the dataframe
"""
s0 = self.timing_start("_create_empty_array", "total")

# Nominally 'obs_id' or 'var_id'
level = self._soma_options.string_dim_zstd_level
Expand Down Expand Up @@ -225,5 +213,3 @@ def _create_empty_array(
)

tiledb.Array.create(self.uri, sch, ctx=self._ctx)

self.timing_end(s0)
Loading

0 comments on commit a75f33e

Please sign in to comment.