Skip to content

Commit 9f4a1d5

Browse files
committed
Use the new me_compile_nd/me_eval_nd APIs to broaden the scenarios supported by miniexpr
1 parent ac75725 commit 9f4a1d5

File tree

5 files changed

+78
-56
lines changed

5 files changed

+78
-56
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ set(MINIEXPR_BUILD_BENCH OFF CACHE BOOL "Build miniexpr benchmarks" FORCE)
5858

5959
FetchContent_Declare(miniexpr
6060
GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
61-
GIT_TAG sleef # latest SIMD additions
61+
GIT_TAG ndim # latest me_compile_nd()/me_eval_nd() APIs
6262
# In case you want to use a local copy of miniexpr for development, uncomment the line below
6363
# SOURCE_DIR "/Users/faltet/blosc/miniexpr"
6464
)

README_DEVELOPERS.md

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,9 @@ brew install sccache ninja
5959
Then run:
6060

6161
```bash
62-
CMAKE_GENERATOR=Ninja \
63-
CMAKE_C_COMPILER=clang \
64-
CMAKE_CXX_COMPILER=clang++ \
6562
CMAKE_C_COMPILER_LAUNCHER=sccache \
66-
CMAKE_CXX_COMPILER_LAUNCHER=sccache \
67-
CMAKE_BUILD_PARALLEL_LEVEL=8 \
68-
SKBUILD_PARALLEL_LEVEL=8 \
6963
SKBUILD_BUILD_DIR=build \
70-
pip install -e .
64+
pip install -e . --no-build-isolation
7165
```
7266

7367
Using `SKBUILD_BUILD_DIR` keeps a stable build directory between runs, which

src/blosc2/blosc2_ext.pyx

Lines changed: 63 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,11 @@ cdef extern from "miniexpr.h":
573573
int me_compile(const char *expression, const me_variable *variables,
574574
int var_count, me_dtype dtype, int *error, me_expr **out)
575575

576+
int me_compile_nd(const char *expression, const me_variable *variables,
577+
int var_count, me_dtype dtype, int ndims,
578+
const int64_t *shape, const int32_t *chunkshape,
579+
const int32_t *blockshape, int *error, me_expr **out)
580+
576581
cdef enum me_compile_status:
577582
ME_COMPILE_SUCCESS
578583
ME_COMPILE_ERR_OOM
@@ -583,9 +588,26 @@ cdef extern from "miniexpr.h":
583588
ME_COMPILE_ERR_VAR_MIXED
584589
ME_COMPILE_ERR_VAR_UNSPECIFIED
585590
ME_COMPILE_ERR_INVALID_ARG_TYPE
591+
ME_COMPILE_ERR_MIXED_TYPE_NESTED
592+
593+
cdef enum me_simd_ulp_mode:
594+
ME_SIMD_ULP_DEFAULT
595+
ME_SIMD_ULP_1
596+
ME_SIMD_ULP_3_5
597+
598+
ctypedef struct me_eval_params:
599+
c_bool disable_simd
600+
me_simd_ulp_mode simd_ulp_mode
601+
602+
int me_eval(const me_expr *expr, const void **vars_block,
603+
int n_vars, void *output_block, int chunk_nitems,
604+
const me_eval_params *params) nogil
586605

587-
int me_eval(const me_expr *expr, const void ** vars_chunk,
588-
int n_vars, void *output_chunk, int chunk_nitems) nogil
606+
int me_eval_nd(const me_expr *expr, const void **vars_block,
607+
int n_vars, void *output_block, int block_nitems,
608+
int64_t nchunk, int64_t nblock, const me_eval_params *params) nogil
609+
610+
int me_nd_valid_nitems(const me_expr *expr, int64_t nchunk, int64_t nblock, int64_t *valid_nitems) nogil
589611

590612
void me_print(const me_expr *n) nogil
591613
void me_free(me_expr *n) nogil
@@ -1860,10 +1882,8 @@ cdef int general_filler(blosc2_prefilter_params *params):
18601882
return 0
18611883

18621884

1863-
# Auxiliary function for just miniexpr as a prefilter
1864-
# Only meant for (input and output) arrays that:
1865-
# 1) Are blosc2.NDArray objects
1866-
# 2) Do not have padding
1885+
# Auxiliary function for miniexpr as a prefilter
1886+
# Only meant for (input and output) arrays that are blosc2.NDArray objects.
18671887
cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
18681888
c_bool is_postfilter, uint8_t *params_output, int32_t typesize) nogil:
18691889
# Declare all C variables at the beginning
@@ -1880,9 +1900,29 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
18801900
cdef void* src
18811901
cdef int32_t chunk_nbytes, chunk_cbytes, block_nbytes
18821902
cdef int start, blocknitems, expected_blocknitems
1903+
cdef int64_t valid_nitems
18831904
cdef int32_t input_typesize
18841905
cdef blosc2_context* dctx
18851906
expected_blocknitems = -1
1907+
valid_nitems = 0
1908+
1909+
cdef me_expr* miniexpr_handle = udata.miniexpr_handle
1910+
cdef void* aux_reduc_ptr
1911+
1912+
if miniexpr_handle == NULL:
1913+
raise ValueError("miniexpr: handle not assigned")
1914+
1915+
# Query valid (unpadded) items for this block
1916+
rc = me_nd_valid_nitems(miniexpr_handle, nchunk, nblock, &valid_nitems)
1917+
if rc != 0:
1918+
raise RuntimeError(f"miniexpr: invalid block; error code: {rc}")
1919+
if valid_nitems <= 0:
1920+
# Nothing to compute for this block.
1921+
# For reductions, keep aux_reduc neutral values untouched.
1922+
if udata.aux_reduc_ptr == NULL:
1923+
memset(params_output, 0, udata.array.blocknitems * typesize)
1924+
free(input_buffers)
1925+
return 0
18861926
for i in range(udata.ninputs):
18871927
ndarr = udata.inputs[i]
18881928
input_buffers[i] = malloc(ndarr.sc.blocksize)
@@ -1912,48 +1952,35 @@ cdef int aux_miniexpr(me_udata *udata, int64_t nchunk, int32_t nblock,
19121952
# In the future, perhaps one can create a specific (serial) context just for
19131953
# blosc2_getitem_ctx, but this is probably never going to be necessary.
19141954
dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
1915-
if nchunk * ndarr.chunknitems + start + blocknitems > ndarr.nitems:
1916-
blocknitems = ndarr.nitems - (nchunk * ndarr.chunknitems + start)
1917-
if blocknitems <= 0:
1918-
# Should never happen, but anyway
1919-
continue
1955+
if valid_nitems > blocknitems:
1956+
raise ValueError("miniexpr: valid items exceed padded block size")
19201957
rc = blosc2_getitem_ctx(dctx, src, chunk_cbytes, start, blocknitems,
19211958
input_buffers[i], block_nbytes)
19221959
blosc2_free_ctx(dctx)
19231960
if rc < 0:
19241961
raise ValueError("miniexpr: error decompressing the chunk")
1925-
1926-
cdef me_expr* miniexpr_handle = udata.miniexpr_handle
1927-
cdef void* aux_reduc_ptr
19281962
# For reduction operations, we need to track which block we're processing
19291963
# The linear_block_index should be based on the INPUT array structure, not the output array
19301964
# Get the first input array's chunk and block structure
19311965
cdef b2nd_array_t* first_input = udata.inputs[0]
1932-
cdef int nblocks_per_chunk = (first_input.chunknitems + first_input.blocknitems - 1) // first_input.blocknitems
1966+
cdef int nblocks_per_chunk = 1
1967+
for i in range(first_input.ndim):
1968+
nblocks_per_chunk *= <int>udata.blocks_in_chunk[i]
19331969
# Calculate the global linear block index: nchunk * blocks_per_chunk + nblock
19341970
# This works because blocks never span chunks (chunks are padded to block boundaries)
19351971
cdef int64_t linear_block_index = nchunk * nblocks_per_chunk + nblock
19361972
cdef uintptr_t offset_bytes = typesize * linear_block_index
19371973

1938-
if miniexpr_handle == NULL:
1939-
raise ValueError("miniexpr: handle not assigned")
1940-
1941-
# Skip evaluation if blocknitems is invalid (can happen for padding blocks beyond data)
1942-
if blocknitems <= 0:
1943-
# Free resources
1944-
for i in range(udata.ninputs):
1945-
free(input_buffers[i])
1946-
free(input_buffers)
1947-
return 0
1948-
19491974
# Call thread-safe miniexpr C API
19501975
if udata.aux_reduc_ptr == NULL:
1951-
rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
1952-
<void*>params_output, blocknitems)
1976+
rc = me_eval_nd(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
1977+
<void*>params_output, blocknitems, nchunk, nblock, NULL)
19531978
else:
1954-
# Reduction operation
1979+
# Reduction operation: evaluate only valid items into a single output element.
1980+
# NOTE: miniexpr handles scalar outputs in me_eval_nd without touching tail bytes.
19551981
aux_reduc_ptr = <void *> (<uintptr_t> udata.aux_reduc_ptr + offset_bytes)
1956-
rc = me_eval(miniexpr_handle, <const void**>input_buffers, udata.ninputs, aux_reduc_ptr, blocknitems)
1982+
rc = me_eval_nd(miniexpr_handle, <const void**>input_buffers, udata.ninputs,
1983+
aux_reduc_ptr, blocknitems, nchunk, nblock, NULL)
19571984
if rc != 0:
19581985
raise RuntimeError(f"miniexpr: issues during evaluation; error code: {rc}")
19591986

@@ -2904,7 +2931,12 @@ cdef class NDArray:
29042931
expression = expression.encode("utf-8") if isinstance(expression, str) else expression
29052932
cdef me_dtype = me_dtype_from_numpy(self.dtype.num)
29062933
cdef me_expr *out_expr
2907-
cdef int rc = me_compile(expression, variables, n, me_dtype, &error, &out_expr)
2934+
cdef int ndims = self.array.ndim
2935+
cdef int64_t* shape = &self.array.shape[0]
2936+
cdef int32_t* chunkshape = &self.array.chunkshape[0]
2937+
cdef int32_t* blockshape = &self.array.blockshape[0]
2938+
cdef int rc = me_compile_nd(expression, variables, n, me_dtype, ndims,
2939+
shape, chunkshape, blockshape, &error, &out_expr)
29082940
if rc == ME_COMPILE_ERR_INVALID_ARG_TYPE:
29092941
raise TypeError(f"miniexpr does not support operand or output dtype: {expression}")
29102942
if rc != ME_COMPILE_SUCCESS:

src/blosc2/lazyexpr.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,14 +1284,14 @@ def fast_eval( # noqa: C901
12841284

12851285
# Check whether we can use miniexpr
12861286
if use_miniexpr:
1287-
# Avoid padding issues except for 1D arrays (contiguous along the only axis).
1288-
if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape[1:], chunks[1:], strict=True)):
1287+
# Require aligned NDArray operands with identical chunk/block grid.
1288+
same_shape = all(hasattr(op, "shape") and op.shape == shape for op in operands.values())
1289+
same_chunks = all(hasattr(op, "chunks") and op.chunks == chunks for op in operands.values())
1290+
same_blocks = all(hasattr(op, "blocks") and op.blocks == blocks for op in operands.values())
1291+
if not (same_shape and same_chunks and same_blocks):
1292+
use_miniexpr = False
1293+
if not (all_ndarray and not any_persisted and out is None):
12891294
use_miniexpr = False
1290-
for op in operands.values():
1291-
# Only NDArray in-memory operands
1292-
if not (isinstance(op, blosc2.NDArray) and op.urlpath is None and out is None):
1293-
use_miniexpr = False
1294-
break
12951295

12961296
if use_miniexpr:
12971297
cparams = kwargs.pop("cparams", blosc2.CParams())
@@ -1989,10 +1989,12 @@ def reduce_slices( # noqa: C901
19891989
if reduce_op in (ReduceOp.ARGMAX, ReduceOp.ARGMIN):
19901990
use_miniexpr = False
19911991

1992-
# Only behaved partitions are supported in miniexpr reductions
1992+
# Check whether we can use miniexpr
19931993
if use_miniexpr:
1994-
# Avoid padding issues except for 1D arrays (contiguous along the only axis).
1995-
if len(shape) != 1 and builtins.any(s % c != 0 for s, c in zip(shape[1:], chunks[1:], strict=True)):
1994+
same_shape = all(hasattr(op, "shape") and op.shape == shape for op in operands.values())
1995+
same_chunks = all(hasattr(op, "chunks") and op.chunks == chunks for op in operands.values())
1996+
same_blocks = all(hasattr(op, "blocks") and op.blocks == blocks for op in operands.values())
1997+
if not (same_shape and same_chunks and same_blocks):
19961998
use_miniexpr = False
19971999
if use_miniexpr and isinstance(expression, str):
19982000
has_complex = any(
@@ -2001,12 +2003,6 @@ def reduce_slices( # noqa: C901
20012003
)
20022004
if has_complex and any(tok in expression for tok in ("!=", "==", "<=", ">=", "<", ">")):
20032005
use_miniexpr = False
2004-
for op in operands.values():
2005-
# Ensure blocks fit exactly in chunks for the n-dim case, except for the first dimension
2006-
blocks_fit = builtins.all(c % b == 0 for c, b in zip(op.chunks[1:], op.blocks[1:], strict=True))
2007-
if len(op.shape) != 1 and not blocks_fit:
2008-
use_miniexpr = False
2009-
break
20102006

20112007
if use_miniexpr:
20122008
# Experiments say that not splitting is best (at least on Apple Silicon M4 Pro)

tests/ndarray/test_lazyexpr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ def test_expression_with_constants(array_fixture):
270270
a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
271271
# Test with operands with same chunks and blocks
272272
expr = a1 + 2 - a3 * 3.14
273-
nres = ne_evaluate("na1 + 2 - na3 * 3.14")
273+
nres = na1 + 2 - na3 * 3.14
274274
res = expr.compute()
275275
if na1.dtype == np.float32:
276276
np.testing.assert_allclose(res[:], nres, rtol=1e-5)

0 commit comments

Comments
 (0)