Skip to content

Commit

Permalink
Merge pull request #9 from ameliefroessl/subsample_mask
Browse files Browse the repository at this point in the history
Allowing boolean array in delayed_subsample
  • Loading branch information
rhugonnet authored May 22, 2024
2 parents a9fe770 + 31c7f39 commit 3fd3691
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 15 deletions.
41 changes: 28 additions & 13 deletions geoutils/raster/delayed.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Module for dask-delayed functions for out-of-memory raster operations.
"""

from __future__ import annotations

import warnings
Expand All @@ -15,7 +16,7 @@
from dask.utils import cached_cumsum
from scipy.interpolate import interpn

from geoutils._typing import NDArrayNum
from geoutils._typing import NDArrayBool, NDArrayNum
from geoutils.projtools import _get_bounds_projected, _get_footprint_projected

# 1/ SUBSAMPLING
Expand Down Expand Up @@ -96,28 +97,37 @@ def _get_indices_block_per_subsample(


@dask.delayed # type: ignore
def _delayed_nb_valids(arr_chunk: NDArrayNum) -> NDArrayNum:
def _delayed_nb_valids(arr_chunk: NDArrayNum | NDArrayBool) -> NDArrayNum:
"""Count number of valid values per block."""
if arr_chunk.dtype == "bool":
return np.array([np.count_nonzero(arr_chunk)]).reshape((1, 1))
return np.array([np.count_nonzero(np.isfinite(arr_chunk))]).reshape((1, 1))


@dask.delayed # type: ignore
def _delayed_subsample_block(arr_chunk: NDArrayNum, subsample_indices: NDArrayNum) -> NDArrayNum:
def _delayed_subsample_block(
arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum
) -> NDArrayNum | NDArrayBool:
"""Subsample the valid values at the corresponding 1D valid indices per block."""

s_chunk = arr_chunk[np.isfinite(arr_chunk)][subsample_indices]

return s_chunk
if arr_chunk.dtype == "bool":
return arr_chunk[arr_chunk][subsample_indices]
return arr_chunk[np.isfinite(arr_chunk)][subsample_indices]


@dask.delayed # type: ignore
def _delayed_subsample_indices_block(
arr_chunk: NDArrayNum, subsample_indices: NDArrayNum, block_id: dict[str, Any]
arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum, block_id: dict[str, Any]
) -> NDArrayNum:
"""Return 2D indices from the subsampled 1D valid indices per block."""

# Unravel indices of valid data to the shape of the block
ix, iy = np.unravel_index(np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape)
if arr_chunk.dtype == "bool":
ix, iy = np.unravel_index(np.argwhere(arr_chunk.flatten())[subsample_indices], shape=arr_chunk.shape)
else:
# Unravel indices of valid data to the shape of the block
ix, iy = np.unravel_index(
np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape
)

# Convert to full-array indexes by adding the row and column starting indexes for this block
ix += block_id["xstart"]
Expand Down Expand Up @@ -151,7 +161,10 @@ def delayed_subsample(
return_indices=True, then sample your arrays out-of-memory with .vindex[indices[0], indices[1]]
(this assumes that these arrays have valid values at the same locations).
:param darr: Input dask array.
Only valid values are sampled. If passing a numerical array, then only finite values are considered valid values.
If passing a boolean array, then only True values are considered valid values.
:param darr: Input dask array. This can be a boolean or a numerical array.
:param subsample: Subsample size. If <= 1, will be considered a fraction of valid pixels to extract.
If > 1 will be considered the number of valid pixels to extract.
:param return_indices: If set to True, will return the extracted indices only.
Expand Down Expand Up @@ -722,9 +735,11 @@ def delayed_reproject(
# transform of each tuples of source blocks
src_block_ids = np.array(src_geotiling.get_block_locations())
meta_params = [
_combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid)
if len(sbid) > 0
else ({}, [])
(
_combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid)
if len(sbid) > 0
else ({}, [])
)
for sbid in dest2source
]
# We also add the output transform/shape for this destination chunk in the combined meta
Expand Down
22 changes: 20 additions & 2 deletions tests/test_raster/test_delayed.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Tests for dask-delayed functions."""

from __future__ import annotations

import os
Expand Down Expand Up @@ -292,10 +293,18 @@ class TestDelayed:
# List of in-memory chunksize for small tests
list_small_chunksizes_in_mem = [(10, 10), (7, 19)]

@pytest.mark.parametrize("darr", list_small_darr) # type: ignore
# create a corresponding boolean array for each numerical dask array
# every finite numerical value (valid numerical value) corresponds to True (valid boolean value).
darr_bool = []
for small_darr in list_small_darr:
darr_bool.append(da.where(da.isfinite(small_darr), True, False))

@pytest.mark.parametrize("darr, darr_bool", list(zip(list_small_darr, darr_bool))) # type: ignore
@pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore
@pytest.mark.parametrize("subsample_size", [2, 100, 100000]) # type: ignore
def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int):
def test_delayed_subsample__output(
self, darr: da.Array, darr_bool: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int
):
"""
Checks for delayed subsampling function for output accuracy.
Variables that influence specifically the delayed function and might lead to new errors are:
Expand All @@ -319,6 +328,15 @@ def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tupl
sub2 = np.array(darr.vindex[indices[0], indices[1]])
assert np.array_equal(sub, sub2)

# Finally, to verify that a boolean array, with valid values at the same locations as the numerical array,
# leads to the same results, we compare the samples values and the samples indices.
darr_bool = darr_bool.rechunk(chunksizes_in_mem)
indices_bool = delayed_subsample(darr_bool, subsample=subsample_size, random_state=42, return_indices=True)
sub_bool = np.array(darr.vindex[indices_bool])
assert np.array_equal(sub, sub_bool)
assert np.array_equal(indices, indices_bool)


@pytest.mark.parametrize("darr", list_small_darr) # type: ignore
@pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore
@pytest.mark.parametrize("ninterp", [2, 100]) # type: ignore
Expand Down

0 comments on commit 3fd3691

Please sign in to comment.