From 383c96bf1b69958797ded146c280c0b7463917c0 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Wed, 24 Apr 2024 15:28:22 -0800 Subject: [PATCH 01/22] Add delayed functions written in xDEM* --- geoutils/raster/delayed.py | 653 ++++++++++++++++++++ tests/{ => test_raster}/test_array.py | 0 tests/test_raster/test_delayed.py | 154 +++++ tests/{ => test_raster}/test_multiraster.py | 0 tests/{ => test_raster}/test_raster.py | 0 tests/{ => test_raster}/test_sampling.py | 0 tests/{ => test_raster}/test_satimg.py | 0 7 files changed, 807 insertions(+) create mode 100644 geoutils/raster/delayed.py rename tests/{ => test_raster}/test_array.py (100%) create mode 100644 tests/test_raster/test_delayed.py rename tests/{ => test_raster}/test_multiraster.py (100%) rename tests/{ => test_raster}/test_raster.py (100%) rename tests/{ => test_raster}/test_sampling.py (100%) rename tests/{ => test_raster}/test_satimg.py (100%) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py new file mode 100644 index 00000000..38fd114e --- /dev/null +++ b/geoutils/raster/delayed.py @@ -0,0 +1,653 @@ +""" +Module for dask-delayed functions for out-of-memory raster operations. +""" +import warnings +from typing import Any, Literal + +import numpy as np +import dask.delayed +import dask.array as da +import pandas as pd +import geopandas as gpd + +import rasterio as rio +from scipy.interpolate import interpn + +from geoutils.projtools import _get_footprint_projected, _get_bounds_projected + +# 1/ SUBSAMPLING +# Getting an exact subsample size out-of-memory only for valid values is not supported directly by Dask/Xarray + +# It is not trivial because we don't know where valid values will be in advance, and because of ragged output (varying +# output length considerations), which prevents from using high-level functions with good efficiency +# We thus follow https://blog.dask.org/2021/07/02/ragged-output (the dask.array.map_blocks solution has a larger RAM +# usage by having to drop an axis and re-chunk along 1D of the 2D array, so we use the dask.delayed solution instead) + +def _random_state_from_user_input(random_state: np.random.RandomState | int | None = None) -> np.random.RandomState: + """Define random state based on varied user input.""" + + # Define state for random sampling (to fix results during testing) + # Not using the legacy random call is crucial for RAM usage: https://github.com/numpy/numpy/issues/14169 + if random_state is None: + rnd: np.random.RandomState | np.random.Generator = np.random.default_rng() + elif isinstance(random_state, np.random.RandomState): + rnd = random_state + else: + rnd = np.random.default_rng(seed=42) + + return rnd + +def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: int) -> int: + """Get subsample size based on a user input of either integer size or fraction of the number of valid points.""" + + # If value is between 0 and 1, use a fraction + if (subsample <= 1) & (subsample > 0): + npoints = int(subsample * total_nb_valids) + # Otherwise use the value directly + elif subsample > 1: + # Use the number of valid points if larger than subsample asked by user + npoints = min(int(subsample), total_nb_valids) + if subsample > total_nb_valids: + warnings.warn(f"Subsample value of {subsample} is larger than the number of valid pixels of {total_nb_valids}," + f"using all valid pixels as a subsample.", category=UserWarning) + else: + raise ValueError("Subsample must be > 0.") + + return npoints + +def _get_indices_block_per_subsample(xxs: 
np.ndarray, num_chunks: tuple[int, int], nb_valids_per_block: list[int]) -> list[list[int]]: + """ + Get list of 1D valid subsample indices relative to the block for each block. + + The 1D valid subsample indices correspond to the subsample index to apply for a flattened array of valid values. + Relative to the block means converted so that the block indexes for valid values starts at 0 up to the number of + valid values in that block (while the input indices go from zero to the total number of valid values in the full + array). + + :param xxs: Subsample 1D indexes among a total number of valid values. + :param num_chunks: Number of chunks in X and Y. + :param nb_valids_per_block: Number of valid pixels per block. + + :returns: Relative 1D valid subsample index per block. + """ + + # Apply a cumulative sum to get the first 1D total index of each block + valids_cumsum = np.cumsum(nb_valids_per_block) + + # We can write a faster algorithm by sorting + xxs = np.sort(xxs) + + # We define a list of indices per block + relative_ind_per_block = [[] for _ in range((num_chunks[0] * num_chunks[1]))] + k = 0 # K is the block number + for x in xxs: + + # Move to the next block K where current 1D subsample index is, if not in this one + while x >= valids_cumsum[k]: + k += 1 + + # Add 1D subsample index relative to first subsample index of this block + first_xindex_block = valids_cumsum[k - 1] if k >= 1 else 0 # The first 1D valid subsample index of the block + relative_xindex = x - first_xindex_block + relative_ind_per_block[k].append(relative_xindex) + + return relative_ind_per_block + +@dask.delayed +def _delayed_nb_valids(arr_chunk: np.ndarray) -> np.ndarray: + """Count number of valid values per block.""" + return np.array([np.count_nonzero(np.isfinite(arr_chunk))]).reshape((1, 1)) + +@dask.delayed +def _delayed_subsample_block(arr_chunk: np.ndarray, subsample_indices: np.ndarray) -> np.ndarray: + """Subsample the valid values at the corresponding 1D valid indices per block.""" + + s_chunk = arr_chunk[np.isfinite(arr_chunk)][subsample_indices] + + return s_chunk + +@dask.delayed +def _delayed_subsample_indices_block(arr_chunk: np.ndarray, subsample_indices: np.ndarray, block_id: dict[str, Any]) -> np.ndarray: + """Return 2D indices from the subsampled 1D valid indices per block.""" + + # Unravel indices of valid data to the shape of the block + ix, iy = np.unravel_index(np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape) + + # Convert to full-array indexes by adding the row and column starting indexes for this block + ix += block_id["xstart"] + iy += block_id["ystart"] + + return np.hstack((ix, iy)) + +def delayed_subsample(darr: da.Array, + subsample: int | float = 1, + return_indices: bool = False, + random_state: np.random.RandomState | int | None = None) -> np.ndarray: + """ + Subsample a raster at valid values on out-of-memory chunks. + + Optionally, this function can return the 2D indices of the subsample of valid values instead. + + The random subsample is distributed evenly across valid values, no matter which chunk they belong to. + First, the number of valid values in each chunk are computed out-of-memory. Then, a subsample is defined among + the total number of valid values, which are then indexed sequentially along the chunk valid values out-of-memory. + + A random state will give a fixed subsample for a delayed array with a fixed chunksize. 
However, the subsample + will vary with changing chunksize because the 1D delayed indexing depends on it (indexing per valid value per + flattened chunk). For this reason, a loaded array will also have a different subsample due to its direct 1D + indexing (per valid value for the entire flattened array). + + To ensure you re-use a similar subsample of valid values for several arrays, call this function with + return_indices=True, then sample your arrays out-of-memory with .vindex[indices[0], indices[1]] + (this assumes that these arrays have valid values at the same locations). + + :param darr: Input dask array. + :param subsample: Subsample size. If <= 1, will be considered a fraction of valid pixels to extract. + If > 1 will be considered the number of valid pixels to extract. + :param return_indices: If set to True, will return the extracted indices only. + :param random_state: Random state, or seed number to use for random calculations. + + :return: Subsample of values from the array (optionally, their indexes). + """ + + # Get random state + rnd = _random_state_from_user_input(random_state=random_state) + + # Compute number of valid points for each block out-of-memory + blocks = darr.to_delayed().ravel() + list_delayed_valids = [da.from_delayed(_delayed_nb_valids(b), shape=(1, 1), dtype=np.dtype("int32")) for b in blocks] + nb_valids_per_block = np.concatenate([dask.compute(*list_delayed_valids)]) + + # Sum to get total number of valid points + total_nb_valids = np.sum(nb_valids_per_block) + + # Get subsample size (depending on user input) + subsample_size = _get_subsample_size_from_user_input(subsample=subsample, total_nb_valids=total_nb_valids) + + # Get random 1D indexes for the subsample size + indices_1d = rnd.choice(total_nb_valids, subsample_size, replace=False) + + # Sort which indexes belong to which chunk + ind_per_block = _get_indices_block_per_subsample(indices_1d, num_chunks=darr.numblocks, nb_valids_per_block=nb_valids_per_block) + + # To just get the subsample without indices + if not return_indices: + # Task a delayed subsample to be computed for each block, skipping blocks with no values to sample + list_subsamples = [ + _delayed_subsample_block(b, ind) + for i, (b, ind) in enumerate(zip(blocks, ind_per_block)) + if len(ind_per_block[i]) > 0 + ] + # Cast output to the right expected dtype and length, then compute and concatenate + list_subsamples_delayed = [da.from_delayed(s, shape=(nb_valids_per_block[i]), dtype=darr.dtype) + for i, s in enumerate(list_subsamples)] + subsamples = np.concatenate(dask.compute(*list_subsamples_delayed), axis=0) + + return subsamples + + # To return indices + else: + # Get robust list of starts (using what is done in block_id of dask.array.map_blocks) + # https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 + from dask.utils import cached_cumsum + starts = [cached_cumsum(c, initial_zero=True) for c in darr.chunks] + num_chunks = darr.numblocks + # Get the starts per 1D block ID by unravelling starting indexes for each block + indexes_xi, indexes_yi = np.unravel_index(np.arange(len(blocks)), shape=(num_chunks[0], num_chunks[1])) + block_ids = [{"xstart": starts[0][indexes_xi[i]], "ystart": starts[1][indexes_yi[i]]} for i in range(len(blocks))] + + # Task delayed subsample indices to be computed for each block, skipping blocks with no values to sample + list_subsample_indices = [ + _delayed_subsample_indices_block(b, ind, block_id=block_ids[i]) + for i, (b, ind) in enumerate(zip(blocks, 
ind_per_block)) + if len(ind_per_block[i]) > 0 + ] + # Cast output to the right expected dtype and length, then compute and concatenate + list_subsamples_indices_delayed = [da.from_delayed(s, shape=(2, len(ind_per_block[i])), dtype=np.dtype("int32")) + for i, s in enumerate(list_subsample_indices)] + indices = np.concatenate(dask.compute(*list_subsamples_indices_delayed), axis=0) + + return indices[:, 0], indices[:, 1] + + +# 2/ POINT INTERPOLATION ON REGULAR OR EQUAL GRID +# This functionality is not covered efficiently by Dask/Xarray, because they need to support rectilinear grids, which +# is difficult when interpolating in the chunked dimensions, and loads nearly all array memory when using .interp(). + +# Here we harness the fact that rasters are always on regular (or sometimes equal) grids to efficiently map +# the location of the blocks required for interpolation, which requires little memory usage. + +# Code structure inspired by https://blog.dask.org/2021/07/02/ragged-output and the "block_id" in map_blocks + +def _get_interp_indices_per_block(interp_x, interp_y, starts, num_chunks, chunksize, xres, yres): + """Map blocks where each pair of interpolation coordinates will have to be computed.""" + + # TODO 1: Check the robustness for chunksize different and X and Y + + # TODO 2: Check if computing block_i_id matricially + using an == comparison (possibly delayed) to get index + # per block is not more computationally efficient? + # (as it uses array instead of nested lists, and nested lists grow in RAM very fast) + + # We use one bucket per block, assuming a flattened blocks shape + ind_per_block = [[] for _ in range((num_chunks[0] * num_chunks[1]))] + for i, (x, y) in enumerate(zip(interp_x, interp_y)): + # Because it is a regular grid, we know exactly in which block ID the coordinate will fall + block_i_1d = int((x - starts[0][0]) / (xres * chunksize[0])) * num_chunks[1] + int((y - starts[1][0])/ (yres * chunksize[1])) + ind_per_block[block_i_1d].append(i) + + return ind_per_block + + +@dask.delayed +def _delayed_interp_points_block(arr_chunk: np.ndarray, block_id: dict[str, Any], interp_coords: np.ndarray) -> np.ndarray: + """ + Interpolate block in 2D out-of-memory for a regular or equal grid. + """ + + # Extract information out of block_id dictionary + xs, ys, xres, yres = (block_id["xstart"], block_id["ystart"], block_id["xres"], block_id["yres"]) + + # Reconstruct the coordinates from xi/yi/xres/yres (as it has to be a regular grid) + x_coords = np.arange(xs, xs + xres*arr_chunk.shape[0], xres) + y_coords = np.arange(ys, ys + yres*arr_chunk.shape[1], yres) + + # TODO: Use scipy.map_coordinates for an equal grid as in Raster.interp_points? + + # Interpolate to points + interp_chunk = interpn(points=(x_coords, y_coords), values=arr_chunk, xi=(interp_coords[0, :], interp_coords[1, :])) + + # And return the interpolated array + return interp_chunk + +def delayed_interp_points(darr: da.Array, + points: tuple[list[float], list[float]], + resolution: tuple[float, float], + method: Literal["nearest", "linear", "cubic", "quintic"] = "linear") -> np.ndarray: + """ + Interpolate raster at point coordinates on out-of-memory chunks. + + This function harnesses the fact that a raster is defined on a regular (or equal) grid, and it is therefore + faster than Xarray.interpn (especially for small sample sizes) and uses only a fraction of the memory usage. + + :param darr: Input dask array. + :param points: Point(s) at which to interpolate raster value. 
If points fall outside of image, value + returned is nan. Shape should be (N,2). + :param resolution: Resolution of the raster (xres, yres). + :param method: Interpolation method, one of 'nearest', 'linear', 'cubic', or 'quintic'. For more information, + see scipy.ndimage.map_coordinates and scipy.interpolate.interpn. Default is linear. + + :return: Array of raster value(s) for the given points. + """ + + # Convert input to 2D array + points = np.vstack((points[0], points[1])) + + # Map depth of overlap required for each interpolation method + # TODO: Double-check this window somewhere in SciPy's documentation + map_depth = {"nearest": 1, "linear": 2, "cubic": 3, "quintic": 5} + + # Expand dask array for overlapping computations + chunksize = darr.chunksize + expanded = da.overlap.overlap(darr, depth=map_depth[method], boundary=np.nan) + + # Get robust list of starts (using what is done in block_id of dask.array.map_blocks) + # https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 + from dask.utils import cached_cumsum + starts = [cached_cumsum(c, initial_zero=True) for c in darr.chunks] + num_chunks = expanded.numblocks + + # Get samples indices per blocks + ind_per_block = _get_interp_indices_per_block(points[0, :], points[1, :], starts, num_chunks, chunksize, resolution[0], resolution[1]) + + # Create a delayed object for each block, and flatten the blocks into a 1d shape + blocks = expanded.to_delayed().ravel() + + # Build the block IDs by unravelling starting indexes for each block + indexes_xi, indexes_yi = np.unravel_index(np.arange(len(blocks)), shape=(num_chunks[0], num_chunks[1])) + block_ids = [{"xstart": starts[0][indexes_xi[i]] - map_depth[method], + "ystart": starts[1][indexes_yi[i]] - map_depth[method], + "xres": resolution[0], + "yres": resolution[1]} + for i in range(len(blocks))] + + # Compute values delayed + list_interp = [_delayed_interp_points_block(blocks[i], + block_ids[i], + points[:, ind_per_block[i]]) + for i, data_chunk in enumerate(blocks) + if len(ind_per_block[i]) > 0 + ] + + # We define the expected output shape and dtype to simplify things for Dask + list_interp_delayed = [da.from_delayed(p, shape=(1, len(ind_per_block[i])), dtype=darr.dtype) for i, p in enumerate(list_interp)] + interp_points = np.concatenate(dask.compute(*list_interp_delayed), axis=0) + + # Re-order per-block output points to match their original indices + indices = np.concatenate(ind_per_block).astype(int) + argsort = np.argsort(indices) + interp_points = np.array(interp_points)[argsort] + + return interp_points + +# 3/ REPROJECT +# Part of the code (defining a GeoGrid and GeoTiling classes) in inspired by +# https://github.com/opendatacube/odc-geo/pull/88, modified to be more concise and rely only on Rasterio/GeoPandas +# Could be submitted as a PR to Rioxarray? (but not sure the dependency to GeoPandas would work there) + +# We define a GeoGrid and GeoTiling class (which composes GeoGrid) to consistently deal with georeferenced footprints +# of chunked grids +class GeoGrid: + """ + Georeferenced grid class. + + Describes a georeferenced grid through a geotransform (one-sided bounds and resolution), shape and CRS. 
+ """ + + def __init__(self, transform: rio.transform.Affine, shape: tuple[int, int], crs: rio.crs.CRS | None): + + self._transform = transform + self._shape = shape + self._crs = crs + + @property + def transform(self) -> rio.transform.Affine: + return self._transform + + @property + def crs(self) -> rio.crs.CRS: + return self._crs + + @property + def shape(self) -> tuple[int, int]: + return self._shape + + @property + def height(self) -> int: + return self.shape[0] + + @property + def width(self) -> int: + return self.shape[1] + + @property + def res(self) -> tuple[int, int]: + return self.transform[0], abs(self.transform[4]) + + @property + def bounds(self) -> rio.coords.BoundingBox: + return rio.coords.BoundingBox(*rio.transform.array_bounds(self.height, self.width, self.transform)) + + def bounds_projected(self, crs: rio.crs.CRS = None) -> rio.coords.BoundingBox: + if crs is None: + crs = self.crs + return _get_bounds_projected(bounds=self.bounds, in_crs=self.crs, out_crs=crs) + + @property + def footprint(self) -> gpd.GeoDataFrame: + return _get_footprint_projected(self.bounds, in_crs=self.crs, out_crs=self.crs, densify_points=100) + + def footprint_projected(self, crs: rio.crs.CRS = None): + if crs is None: + crs = self.crs + return _get_footprint_projected(self.bounds, in_crs=self.crs, out_crs=crs, densify_points=100) + + def shift(self, + xoff: float, + yoff: float, + distance_unit: Literal["georeferenced"] | Literal["pixel"] = "pixel"): + """Shift geogrid, not inplace.""" + + if distance_unit not in["georeferenced", "pixel"]: + raise ValueError("Argument 'distance_unit' should be either 'pixel' or 'georeferenced'.") + + # Get transform + dx, b, xmin, d, dy, ymax = list(self.transform)[:6] + + # Convert pixel offsets to georeferenced units + if distance_unit == "pixel": + xoff *= self.res[0] + yoff *= self.res[1] + + shifted_transform = rio.transform.Affine(dx, b, xmin + xoff, d, dy, ymax + yoff) + + return GeoGrid(transform=shifted_transform, crs=self.crs, shape=self.shape) + + +def _get_block_ids_per_chunk(chunks: tuple[tuple[int, ...], tuple[int, ...]]) -> list[dict[str, int]]: + """Get location of chunks based on array shape and list of chunk sizes.""" + + # Get number of chunks + num_chunks = (len(chunks[0]), len(chunks[1])) + + # Get robust list of chunk locations (using what is done in block_id of dask.array.map_blocks) + # https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 + from dask.utils import cached_cumsum + starts = [cached_cumsum(c, initial_zero=True) for c in chunks] + nb_blocks = num_chunks[0] * num_chunks[1] + ixi, iyi = np.unravel_index(np.arange(nb_blocks), shape=(num_chunks[0], num_chunks[1])) + block_ids = [{"num_block": i, "ys": starts[0][ixi[i]], "xs": starts[1][iyi[i]], + "ye": starts[0][ixi[i] + 1], "xe": starts[1][iyi[i] + 1]} + for i in range(nb_blocks)] + + return block_ids + +class ChunkedGeoGrid: + """ + Chunked georeferenced grid class. + + Associates a georeferenced grid to chunks (possibly of varying sizes). 
+ """ + + def __init__(self, grid: GeoGrid, chunks: tuple[tuple[int, ...], tuple[int, ...]]): + + self._grid = grid + self._chunks = chunks + + @property + def grid(self) -> GeoGrid: + return self._grid + + @property + def chunks(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return self._chunks + + def get_block_locations(self) -> list[dict[str, int]]: + """Get block locations in 2D: xstart, xend, ystart, yend.""" + return _get_block_ids_per_chunk(self._chunks) + + def get_blocks_as_geogrids(self) -> list[GeoGrid]: + """Get blocks as geogrids with updated transform/shape.""" + + block_ids = self.get_block_locations() + + list_geogrids = [] + for bid in block_ids: + # We get the block size + block_shape = (bid["ye"] - bid["ys"], bid["xe"] - bid["xs"]) + # Build a temporary geogrid with the same transform as the full grid + geogrid_tmp = GeoGrid(transform=self.grid.transform, crs=self.grid.crs, shape=block_shape) + # And shift it to the right location (X is positive in index direction, Y is negative) + geogrid_block = geogrid_tmp.shift(xoff=bid["xs"], yoff=-bid["ys"]) + list_geogrids.append(geogrid_block) + + return list_geogrids + + def get_block_footprints(self, crs: rio.crs.CRS = None) -> gpd.GeoDataFrame: + """Get block projected footprints as a single geodataframe.""" + + geogrids = self.get_blocks_as_geogrids() + footprints = [gg.footprint_projected(crs=crs) if crs is not None else gg.footprint for gg in geogrids] + + return pd.concat(footprints) + + +def _chunks2d_from_chunksizes_shape(chunksizes: tuple[int, int], shape: tuple[int, int]): + """Get tuples of chunk sizes for X/Y dimensions based on chunksizes and array shape.""" + + # Chunksize is fixed, except for the last chunk depending on the shape + chunks_y = tuple(min(chunksizes[0], shape[0] - i*chunksizes[0],) for i in range(int(np.ceil(shape[0]/chunksizes[0])))) + chunks_x = tuple(min(chunksizes[1], shape[1] - i*chunksizes[1],) for i in range(int(np.ceil(shape[1]/chunksizes[1])))) + + return chunks_y, chunks_x + +def _combined_blocks_shape_transform(sub_block_ids: list[dict[str, int]], src_geogrid: GeoGrid) -> tuple[dict[str, Any], list[dict[str, int]]]: + """Derive combined shape and transform from a subset of several blocks (for source input during reprojection).""" + + # Get combined shape by taking min of X/Y starting indices, max of X/Y ending indices + all_xs, all_ys, all_xe, all_ye = ([b[s] for b in sub_block_ids] for s in ["xs", "ys", "xe", "ye"]) + minmaxs = {"min_xs": np.min(all_xs), "max_xe": np.max(all_xe), "min_ys": np.min(all_ys), "max_ye": np.max(all_ye)} + combined_shape = (minmaxs["max_ye"] - minmaxs["min_ys"], minmaxs["max_xe"] - minmaxs["min_xs"]) + + # Shift source transform with start indexes to get the one for combined block location + combined_transform = src_geogrid.shift(xoff=minmaxs["min_xs"], yoff=-minmaxs["min_ys"]).transform + + # Compute relative block indexes that will be needed to reconstruct a square array in the delayed function, + # by subtracting the minimum starting indices in X/Y + relative_block_indexes = [{"r"+s1+s2: b[s1+s2] - minmaxs["min_"+s1+"s"] for s1 in ["x", "y"] for s2 in ["s", "e"]} for b in sub_block_ids] + + combined_meta = {"src_shape": combined_shape, "src_transform": tuple(combined_transform)} + + return combined_meta, relative_block_indexes + + +@dask.delayed +def _delayed_reproject_per_block(*src_arrs: tuple[np.ndarray], block_ids: list[dict[str, int]], combined_meta: dict[str, Any], **kwargs: Any) -> np.ndarray: + """ + Delayed reprojection per destination block 
(need to rebuild a square source array combined from intersecting blocks). + """ + + # If no source chunk intersects, we return a chunk of destination nodata values + if len(src_arrs) == 0: + dst_arr = np.zeros(combined_meta["dst_shape"], dtype=np.dtype("float64")) + dst_arr[:] = kwargs["dst_nodata"] + return dst_arr + + # First, we build an empty array with the combined shape, only with nodata values + comb_src_arr = np.ones((combined_meta["src_shape"]), dtype=src_arrs[0].dtype) + comb_src_arr[:] = kwargs["src_nodata"] + + # Then fill it with the source chunks values + for i, arr in enumerate(src_arrs): + bid = block_ids[i] + comb_src_arr[bid["rys"]:bid["rye"], bid["rxs"]:bid["rxe"]] = arr + + # Now, we can simply call Rasterio! + + # We build the combined transform from tuple + src_transform = rio.transform.Affine(*combined_meta["src_transform"]) + dst_transform = rio.transform.Affine(*combined_meta["dst_transform"]) + + # Reproject + dst_arr = np.zeros(combined_meta["dst_shape"], dtype=comb_src_arr.dtype) + + _ = rio.warp.reproject( + comb_src_arr, + dst_arr, + src_transform=src_transform, + src_crs=kwargs["src_crs"], + dst_transform=dst_transform, + dst_crs=kwargs["dst_crs"], + resampling=kwargs["resampling"], + src_nodata=kwargs["src_nodata"], + dst_nodata=kwargs["dst_nodata"], + ) + + return dst_arr + + +def delayed_reproject(darr: da.Array, + src_transform: rio.transform.Affine, + src_crs: rio.crs.CRS, + dst_transform: rio.transform.Affine, + dst_shape: tuple[int, int], + dst_crs: rio.crs.CRS, + resampling: rio.enums.Resampling, + src_nodata: int | float = None, + dst_nodata: int | float = None, + dst_chunksizes: tuple[int, int] = None, + **kwargs: Any) -> da.Array: + """ + Reproject georeferenced raster on out-of-memory chunks. + + Each chunk of the destination array is mapped to one or several intersecting chunks of the source array, and + reprojection is performed using rio.warp.reproject for each mapping. + + Part of the code is inspired by https://github.com/opendatacube/odc-geo/pull/88. + + :param darr: Input dask array for source raster. + :param src_transform: Geotransform of source raster. + :param src_crs: Coordinate reference system of source raster. + :param dst_transform: Geotransform of destination raster. + :param dst_shape: Shape of destination raster. + :param dst_crs: Coordinate reference system of destination raster. + :param resampling: Resampling method. + :param src_nodata: Nodata value of source raster. + :param dst_nodata: Nodata value of destination raster. + :param dst_chunksizes: Chunksizes for destination raster. + :param kwargs: Other arguments to pass to rio.warp.reproject(). + + :return: Dask array of reprojected raster. 
+ """ + + # 1/ Define source and destination chunked georeferenced grid through simple classes storing CRS/transform/shape, + # which allow to consistently derive shape/transform for each block and their CRS-projected footprints + + # Define georeferenced grids for source/destination array + src_geogrid = GeoGrid(transform=src_transform, shape=darr.shape, crs=src_crs) + dst_geogrid = GeoGrid(transform=dst_transform, shape=dst_shape, crs=dst_crs) + + # Add the chunking + # For source, we can use the .chunks attribute + src_chunks = darr.chunks + src_geotiling = ChunkedGeoGrid(grid=src_geogrid, chunks=src_chunks) + + # For destination, we need to create the chunks based on destination chunksizes + dst_chunks = _chunks2d_from_chunksizes_shape(chunksizes=dst_chunksizes, shape=dst_shape) + dst_geotiling = ChunkedGeoGrid(grid=dst_geogrid, chunks=dst_chunks) + + # 2/ Get footprints of tiles in CRS of destination array, with a buffer of 2 pixels for destination ones to ensure + # overlap, then map indexes of source blocks that intersect a given destination block + src_footprints = src_geotiling.get_block_footprints(crs=dst_crs) + dst_footprints = dst_geotiling.get_block_footprints().buffer(2 * max(dst_geogrid.res)) + dest2source = [np.where(dst.intersects(src_footprints).geometry.values)[0] for dst in dst_footprints] + + # 3/ To reconstruct a square source array during chunked reprojection, we need to derive the combined shape and + # transform of each tuples of source blocks + src_block_ids = np.array(src_geotiling.get_block_locations()) + meta_params = [_combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid) + if len(sbid) > 0 else ({}, []) for sbid in dest2source] + # We also add the output transform/shape for this destination chunk in the combined meta + # (those are the only two that are chunk-specific) + dst_block_geogrids = dst_geotiling.get_blocks_as_geogrids() + for i, (c, _) in enumerate(meta_params): + c.update({"dst_shape": dst_block_geogrids[i].shape, "dst_transform": tuple(dst_block_geogrids[i].transform)}) + + # 4/ Call a delayed function that uses rio.warp to reproject the combined source block(s) to each destination block + + # Add fixed arguments to keywords + kwargs.update({"src_nodata": src_nodata, "dst_nodata": dst_nodata, "resampling": resampling, + "src_crs": src_crs, "dst_crs": dst_crs}) + + # Create a delayed object for each block, and flatten the blocks into a 1d shape + blocks = darr.to_delayed().ravel() + # Run the delayed reprojection, looping for each destination block + list_reproj = [_delayed_reproject_per_block(*blocks[dest2source[i]], + block_ids=meta_params[i][1], + combined_meta=meta_params[i][0], + **kwargs) + for i in range(len(dest2source)) + ] + + # We define the expected output shape and dtype to simplify things for Dask + list_reproj_delayed = [da.from_delayed(r, shape=dst_block_geogrids[i].shape, dtype=darr.dtype) for i, r in + enumerate(list_reproj)] + + # Array comes out as flat blocks x chunksize0 (varying) x chunksize1 (varying), so we can't reshape directly + # We need to unravel the flattened blocks indices to align X/Y, then concatenate all columns, then rows + indexes_xi, indexes_yi = np.unravel_index(np.arange(len(dest2source)), shape=(len(dst_chunks[0]), len(dst_chunks[1]))) + + lists_columns = [[l for i, l in enumerate(list_reproj_delayed) if j == indexes_xi[i]] for j in range(len(dst_chunks[0]))] + concat_columns = [da.concatenate(c, axis=1) for c in lists_columns] + concat_all = 
da.concatenate(concat_columns, axis=0) + + return concat_all diff --git a/tests/test_array.py b/tests/test_raster/test_array.py similarity index 100% rename from tests/test_array.py rename to tests/test_raster/test_array.py diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py new file mode 100644 index 00000000..06972759 --- /dev/null +++ b/tests/test_raster/test_delayed.py @@ -0,0 +1,154 @@ +"""Tests for dask functions (temporarily in xDEM, might move to GeoUtils).""" +import os + +import numpy as np +import pytest +import xarray as xr +import dask.array as da +from pyproj import CRS +import rasterio as rio +import dask + +from geoutils.examples import _EXAMPLES_DIRECTORY +from geoutils.raster.delayed import delayed_subsample, delayed_interp_points, delayed_reproject + +class TestDelayed: + + # Create test file (replace with data in time?) + fn_tmp = os.path.join(_EXAMPLES_DIRECTORY, "test.nc") + if not os.path.exists(fn_tmp): + data = np.random.normal(size=400000000).reshape(20000, 20000) + data_arr = xr.DataArray(data=data, dims=["x", "y"]) + ds = xr.Dataset(data_vars={"test": data_arr}) + encoding_kwargs = {"test": {"chunksizes": (100, 100)}} + ds.to_netcdf(fn_tmp, encoding=encoding_kwargs) + del ds, da, data + + # Chunk size in memory + chunksize = 500 + + @pytest.mark.parametrize("subsample_size", [2, 100, 100000]) + def test_delayed_subsample(self, subsample_size: int): + """Checks for delayed subsampling function.""" + + # Open dataset with chunks + ds = xr.open_dataset(self.fn_tmp, chunks={"x": self.chunksize, "y": self.chunksize}) + darr = ds["test"].data + + # Derive subsample from delayed function + sub = delayed_subsample(darr, subsample=subsample_size, random_state=42) + + # The subsample should have exactly the prescribed length, with only valid values + assert len(sub) == subsample_size + assert all(np.isfinite(sub)) + + # To verify the sampling works correctly, we can get its subsample indices with the argument return_indices + # And compare to the same subsample with vindex (now that we know the coordinates of valid values sampled) + indices = delayed_subsample(darr, subsample=subsample_size, random_state=42, return_indices=True) + sub2 = np.array(darr.vindex[indices[0], indices[1]]) + assert np.array_equal(sub, sub2) + + @pytest.mark.parametrize("ninterp", [2, 100, 100000]) + def test_delayed_interp_points(self, ninterp: int): + """Checks for delayed interpolate points function.""" + + # Open dataset with chunks + ds = xr.open_dataset(self.fn_tmp, chunks={"x": self.chunksize, "y": self.chunksize}) + darr = ds["test"].data + + # Create random point coordinates to interpolate + rng = np.random.default_rng(seed=42) + interp_x = rng.choice(ds.x.size, ninterp) + rng.random(ninterp) + interp_y = rng.choice(ds.y.size, ninterp) + rng.random(ninterp) + + # Interpolate with delayed function + interp1 = delayed_interp_points(darr=darr, points=(interp_x, interp_y), resolution=(1, 1)) + + # Interpolate directly with Xarray and compare results are the same + xx = xr.DataArray(interp_x, dims='z', name='x') + yy = xr.DataArray(interp_y, dims='z', name='y') + interp2 = ds.test.interp(x=xx, y=yy) + interp2.compute() + interp2 = np.array(interp2.values) + + assert np.array_equal(interp1, interp2, equal_nan=True) + + # Let's check a lot of different scenarios + def test_delayed_reproject(self): + + # Open dataset with chunks + ds = xr.open_dataset(self.fn_tmp, chunks={"x": self.chunksize, "y": self.chunksize}) + darr = ds["test"].data + + 
dask.config.set(scheduler='single-threaded') + + src_shape = darr.shape + + # src_shape = (150, 100) + # src_chunksizes = (25, 10) + # rng = da.random.default_rng(seed=42) + # darr = rng.normal(size=src_shape, chunks=src_chunksizes) + # darr = da.ones(src_shape, chunks=src_chunksizes) + src_crs = CRS(4326) + src_transform = rio.transform.from_bounds(0, 0, 5, 5, src_shape[0], src_shape[1]) + + dst_shape = (77, 50) + dst_crs = CRS(32631) + dst_chunksizes = (7, 5) + + # Build an intersecting dst_transform that is not aligned + src_res = (src_transform[0], abs(src_transform[4])) + bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(src_shape[0], src_shape[1], src_transform)) + # First, an aligned transform in the new CRS that allows to get + # temporary new bounds and resolution in the units of the new CRS + tmp_transform = rio.warp.calculate_default_transform( + src_crs, + dst_crs, + src_shape[1], + src_shape[0], + left=bounds.left, + right=bounds.right, + top=bounds.top, + bottom=bounds.bottom, + dst_width=dst_shape[1], + dst_height=dst_shape[0], + )[0] + tmp_res = (tmp_transform[0], abs(tmp_transform[4])) + tmp_bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(dst_shape[0], dst_shape[1], tmp_transform)) + # Now we modify the destination grid by changing bounds by a bit + the resolution + dst_transform = rio.transform.from_origin(tmp_bounds.left + 100*tmp_res[0], tmp_bounds.top + 150*tmp_res[0], + tmp_res[0]*2.5, tmp_res[1]*0.7) + + # Other arguments + src_nodata = -9999 + dst_nodata = 99999 + resampling = rio.enums.Resampling.bilinear + + # Run delayed reproject + reproj_arr = delayed_reproject(darr, src_transform=src_transform, src_crs=src_crs, dst_transform=dst_transform, + dst_crs=dst_crs, dst_shape=dst_shape, src_nodata=src_nodata, dst_nodata=dst_nodata, + resampling=resampling, dst_chunksizes=dst_chunksizes) + + # Save file out-of-memory + # TODO: Would need to wrap the georef data in the netCDF, but not needed to test this + fn_tmp_out = os.path.join(_EXAMPLES_DIRECTORY, "test_reproj.nc") + data_arr = xr.DataArray(data=reproj_arr, dims=["x", "y"]) + ds_out = xr.Dataset(data_vars={"test_reproj": data_arr}) + write_delayed = ds_out.to_netcdf(fn_tmp_out, compute=False) + write_delayed.compute() + + # Load in-memory and compare with a direct reproject + dst_arr = np.zeros(dst_shape) + _ = rio.warp.reproject( + np.array(darr), + dst_arr, + src_transform=src_transform, + src_crs=src_crs, + dst_transform=dst_transform, + dst_crs=dst_crs, + resampling=resampling, + src_nodata=src_nodata, + dst_nodata=dst_nodata, + ) + + assert np.allclose(reproj_arr.compute(), dst_arr, atol=0.02) diff --git a/tests/test_multiraster.py b/tests/test_raster/test_multiraster.py similarity index 100% rename from tests/test_multiraster.py rename to tests/test_raster/test_multiraster.py diff --git a/tests/test_raster.py b/tests/test_raster/test_raster.py similarity index 100% rename from tests/test_raster.py rename to tests/test_raster/test_raster.py diff --git a/tests/test_sampling.py b/tests/test_raster/test_sampling.py similarity index 100% rename from tests/test_sampling.py rename to tests/test_raster/test_sampling.py diff --git a/tests/test_satimg.py b/tests/test_raster/test_satimg.py similarity index 100% rename from tests/test_satimg.py rename to tests/test_raster/test_satimg.py From c2cdf8b57c741b977ed199105d388170a13eab39 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Wed, 24 Apr 2024 16:42:44 -0800 Subject: [PATCH 02/22] Add dask to environment --- dev-environment.yml | 1 + 
environment.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/dev-environment.yml b/dev-environment.yml index c0720f47..50f65391 100644 --- a/dev-environment.yml +++ b/dev-environment.yml @@ -12,6 +12,7 @@ dependencies: - scipy=1.* - tqdm - xarray + - dask - rioxarray=0.* # Development-specific, to mirror manually in setup.cfg [options.extras_require]. diff --git a/environment.yml b/environment.yml index 5f2c5d50..92daeaba 100644 --- a/environment.yml +++ b/environment.yml @@ -12,4 +12,5 @@ dependencies: - scipy=1.* - tqdm - xarray + - dask - rioxarray=0.* From 8d96c61ba624e4bb971b20bf50cae475a315c686 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Wed, 24 Apr 2024 17:01:06 -0800 Subject: [PATCH 03/22] Fix tests --- tests/test_raster/test_delayed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 06972759..975b1033 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -22,7 +22,7 @@ class TestDelayed: ds = xr.Dataset(data_vars={"test": data_arr}) encoding_kwargs = {"test": {"chunksizes": (100, 100)}} ds.to_netcdf(fn_tmp, encoding=encoding_kwargs) - del ds, da, data + del ds, data_arr, data # Chunk size in memory chunksize = 500 From 1d078a2aa3e19bced56f138a837c46c0f8319458 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Wed, 24 Apr 2024 17:03:11 -0800 Subject: [PATCH 04/22] Linting all but mypy for now --- geoutils/raster/delayed.py | 276 ++++++++++++++++++++---------- requirements.txt | 1 + tests/test_raster/test_delayed.py | 39 +++-- 3 files changed, 208 insertions(+), 108 deletions(-) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index 38fd114e..6fc8b668 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -4,25 +4,25 @@ import warnings from typing import Any, Literal -import numpy as np -import dask.delayed import dask.array as da -import pandas as pd +import dask.delayed import geopandas as gpd - +import numpy as np +import pandas as pd import rasterio as rio from scipy.interpolate import interpn -from geoutils.projtools import _get_footprint_projected, _get_bounds_projected +from geoutils.projtools import _get_bounds_projected, _get_footprint_projected # 1/ SUBSAMPLING # Getting an exact subsample size out-of-memory only for valid values is not supported directly by Dask/Xarray -# It is not trivial because we don't know where valid values will be in advance, and because of ragged output (varying +# It is not trivial because we don't know where valid values will be in advance, and because of ragged output (varying # output length considerations), which prevents from using high-level functions with good efficiency -# We thus follow https://blog.dask.org/2021/07/02/ragged-output (the dask.array.map_blocks solution has a larger RAM +# We thus follow https://blog.dask.org/2021/07/02/ragged-output (the dask.array.map_blocks solution has a larger RAM # usage by having to drop an axis and re-chunk along 1D of the 2D array, so we use the dask.delayed solution instead) + def _random_state_from_user_input(random_state: np.random.RandomState | int | None = None) -> np.random.RandomState: """Define random state based on varied user input.""" @@ -37,6 +37,7 @@ def _random_state_from_user_input(random_state: np.random.RandomState | int | No return rnd + def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: int) -> int: """Get subsample size based on a user input of either integer 
size or fraction of the number of valid points.""" @@ -48,14 +49,20 @@ def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: # Use the number of valid points if larger than subsample asked by user npoints = min(int(subsample), total_nb_valids) if subsample > total_nb_valids: - warnings.warn(f"Subsample value of {subsample} is larger than the number of valid pixels of {total_nb_valids}," - f"using all valid pixels as a subsample.", category=UserWarning) + warnings.warn( + f"Subsample value of {subsample} is larger than the number of valid pixels of {total_nb_valids}," + f"using all valid pixels as a subsample.", + category=UserWarning, + ) else: raise ValueError("Subsample must be > 0.") return npoints -def _get_indices_block_per_subsample(xxs: np.ndarray, num_chunks: tuple[int, int], nb_valids_per_block: list[int]) -> list[list[int]]: + +def _get_indices_block_per_subsample( + xxs: np.ndarray, num_chunks: tuple[int, int], nb_valids_per_block: list[int] +) -> list[list[int]]: """ Get list of 1D valid subsample indices relative to the block for each block. @@ -78,7 +85,7 @@ def _get_indices_block_per_subsample(xxs: np.ndarray, num_chunks: tuple[int, int xxs = np.sort(xxs) # We define a list of indices per block - relative_ind_per_block = [[] for _ in range((num_chunks[0] * num_chunks[1]))] + relative_ind_per_block = [[] for _ in range(num_chunks[0] * num_chunks[1])] k = 0 # K is the block number for x in xxs: @@ -93,11 +100,13 @@ def _get_indices_block_per_subsample(xxs: np.ndarray, num_chunks: tuple[int, int return relative_ind_per_block + @dask.delayed def _delayed_nb_valids(arr_chunk: np.ndarray) -> np.ndarray: """Count number of valid values per block.""" return np.array([np.count_nonzero(np.isfinite(arr_chunk))]).reshape((1, 1)) + @dask.delayed def _delayed_subsample_block(arr_chunk: np.ndarray, subsample_indices: np.ndarray) -> np.ndarray: """Subsample the valid values at the corresponding 1D valid indices per block.""" @@ -106,8 +115,11 @@ def _delayed_subsample_block(arr_chunk: np.ndarray, subsample_indices: np.ndarra return s_chunk + @dask.delayed -def _delayed_subsample_indices_block(arr_chunk: np.ndarray, subsample_indices: np.ndarray, block_id: dict[str, Any]) -> np.ndarray: +def _delayed_subsample_indices_block( + arr_chunk: np.ndarray, subsample_indices: np.ndarray, block_id: dict[str, Any] +) -> np.ndarray: """Return 2D indices from the subsampled 1D valid indices per block.""" # Unravel indices of valid data to the shape of the block @@ -119,10 +131,13 @@ def _delayed_subsample_indices_block(arr_chunk: np.ndarray, subsample_indices: n return np.hstack((ix, iy)) -def delayed_subsample(darr: da.Array, - subsample: int | float = 1, - return_indices: bool = False, - random_state: np.random.RandomState | int | None = None) -> np.ndarray: + +def delayed_subsample( + darr: da.Array, + subsample: int | float = 1, + return_indices: bool = False, + random_state: np.random.RandomState | int | None = None, +) -> np.ndarray: """ Subsample a raster at valid values on out-of-memory chunks. 
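A minimal usage sketch of delayed_subsample, assuming a chunked dask array in which invalid values are NaN (the pattern exercised in tests/test_raster/test_delayed.py); values here are illustrative:

import dask.array as da
import numpy as np

from geoutils.raster.delayed import delayed_subsample

# Chunked array with NaNs marking invalid values
darr = da.random.random((1000, 1000), chunks=(250, 250))
darr = da.where(darr < 0.1, np.nan, darr)

# Random subsample of 500 valid values, computed chunk-wise
sub = delayed_subsample(darr, subsample=500, random_state=42)

# The same subsample recovered through its 2D indices, e.g. to sample a second
# array that has valid values at the same locations
i, j = delayed_subsample(darr, subsample=500, random_state=42, return_indices=True)
sub2 = np.array(darr.vindex[i, j])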
@@ -155,7 +170,9 @@ def delayed_subsample(darr: da.Array, # Compute number of valid points for each block out-of-memory blocks = darr.to_delayed().ravel() - list_delayed_valids = [da.from_delayed(_delayed_nb_valids(b), shape=(1, 1), dtype=np.dtype("int32")) for b in blocks] + list_delayed_valids = [ + da.from_delayed(_delayed_nb_valids(b), shape=(1, 1), dtype=np.dtype("int32")) for b in blocks + ] nb_valids_per_block = np.concatenate([dask.compute(*list_delayed_valids)]) # Sum to get total number of valid points @@ -168,7 +185,9 @@ def delayed_subsample(darr: da.Array, indices_1d = rnd.choice(total_nb_valids, subsample_size, replace=False) # Sort which indexes belong to which chunk - ind_per_block = _get_indices_block_per_subsample(indices_1d, num_chunks=darr.numblocks, nb_valids_per_block=nb_valids_per_block) + ind_per_block = _get_indices_block_per_subsample( + indices_1d, num_chunks=darr.numblocks, nb_valids_per_block=nb_valids_per_block + ) # To just get the subsample without indices if not return_indices: @@ -179,8 +198,9 @@ def delayed_subsample(darr: da.Array, if len(ind_per_block[i]) > 0 ] # Cast output to the right expected dtype and length, then compute and concatenate - list_subsamples_delayed = [da.from_delayed(s, shape=(nb_valids_per_block[i]), dtype=darr.dtype) - for i, s in enumerate(list_subsamples)] + list_subsamples_delayed = [ + da.from_delayed(s, shape=(nb_valids_per_block[i]), dtype=darr.dtype) for i, s in enumerate(list_subsamples) + ] subsamples = np.concatenate(dask.compute(*list_subsamples_delayed), axis=0) return subsamples @@ -190,11 +210,14 @@ def delayed_subsample(darr: da.Array, # Get robust list of starts (using what is done in block_id of dask.array.map_blocks) # https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 from dask.utils import cached_cumsum + starts = [cached_cumsum(c, initial_zero=True) for c in darr.chunks] num_chunks = darr.numblocks # Get the starts per 1D block ID by unravelling starting indexes for each block indexes_xi, indexes_yi = np.unravel_index(np.arange(len(blocks)), shape=(num_chunks[0], num_chunks[1])) - block_ids = [{"xstart": starts[0][indexes_xi[i]], "ystart": starts[1][indexes_yi[i]]} for i in range(len(blocks))] + block_ids = [ + {"xstart": starts[0][indexes_xi[i]], "ystart": starts[1][indexes_yi[i]]} for i in range(len(blocks)) + ] # Task delayed subsample indices to be computed for each block, skipping blocks with no values to sample list_subsample_indices = [ @@ -203,8 +226,10 @@ def delayed_subsample(darr: da.Array, if len(ind_per_block[i]) > 0 ] # Cast output to the right expected dtype and length, then compute and concatenate - list_subsamples_indices_delayed = [da.from_delayed(s, shape=(2, len(ind_per_block[i])), dtype=np.dtype("int32")) - for i, s in enumerate(list_subsample_indices)] + list_subsamples_indices_delayed = [ + da.from_delayed(s, shape=(2, len(ind_per_block[i])), dtype=np.dtype("int32")) + for i, s in enumerate(list_subsample_indices) + ] indices = np.concatenate(dask.compute(*list_subsamples_indices_delayed), axis=0) return indices[:, 0], indices[:, 1] @@ -219,6 +244,7 @@ def delayed_subsample(darr: da.Array, # Code structure inspired by https://blog.dask.org/2021/07/02/ragged-output and the "block_id" in map_blocks + def _get_interp_indices_per_block(interp_x, interp_y, starts, num_chunks, chunksize, xres, yres): """Map blocks where each pair of interpolation coordinates will have to be computed.""" @@ -229,17 +255,21 @@ def 
_get_interp_indices_per_block(interp_x, interp_y, starts, num_chunks, chunks # (as it uses array instead of nested lists, and nested lists grow in RAM very fast) # We use one bucket per block, assuming a flattened blocks shape - ind_per_block = [[] for _ in range((num_chunks[0] * num_chunks[1]))] + ind_per_block = [[] for _ in range(num_chunks[0] * num_chunks[1])] for i, (x, y) in enumerate(zip(interp_x, interp_y)): # Because it is a regular grid, we know exactly in which block ID the coordinate will fall - block_i_1d = int((x - starts[0][0]) / (xres * chunksize[0])) * num_chunks[1] + int((y - starts[1][0])/ (yres * chunksize[1])) + block_i_1d = int((x - starts[0][0]) / (xres * chunksize[0])) * num_chunks[1] + int( + (y - starts[1][0]) / (yres * chunksize[1]) + ) ind_per_block[block_i_1d].append(i) return ind_per_block @dask.delayed -def _delayed_interp_points_block(arr_chunk: np.ndarray, block_id: dict[str, Any], interp_coords: np.ndarray) -> np.ndarray: +def _delayed_interp_points_block( + arr_chunk: np.ndarray, block_id: dict[str, Any], interp_coords: np.ndarray +) -> np.ndarray: """ Interpolate block in 2D out-of-memory for a regular or equal grid. """ @@ -248,8 +278,8 @@ def _delayed_interp_points_block(arr_chunk: np.ndarray, block_id: dict[str, Any] xs, ys, xres, yres = (block_id["xstart"], block_id["ystart"], block_id["xres"], block_id["yres"]) # Reconstruct the coordinates from xi/yi/xres/yres (as it has to be a regular grid) - x_coords = np.arange(xs, xs + xres*arr_chunk.shape[0], xres) - y_coords = np.arange(ys, ys + yres*arr_chunk.shape[1], yres) + x_coords = np.arange(xs, xs + xres * arr_chunk.shape[0], xres) + y_coords = np.arange(ys, ys + yres * arr_chunk.shape[1], yres) # TODO: Use scipy.map_coordinates for an equal grid as in Raster.interp_points? @@ -259,10 +289,13 @@ def _delayed_interp_points_block(arr_chunk: np.ndarray, block_id: dict[str, Any] # And return the interpolated array return interp_chunk -def delayed_interp_points(darr: da.Array, - points: tuple[list[float], list[float]], - resolution: tuple[float, float], - method: Literal["nearest", "linear", "cubic", "quintic"] = "linear") -> np.ndarray: + +def delayed_interp_points( + darr: da.Array, + points: tuple[list[float], list[float]], + resolution: tuple[float, float], + method: Literal["nearest", "linear", "cubic", "quintic"] = "linear", +) -> np.ndarray: """ Interpolate raster at point coordinates on out-of-memory chunks. 
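A minimal usage sketch of delayed_interp_points, assuming a grid whose coordinates start at (0, 0) with unit resolution, as in the corresponding test; sizes and point counts are illustrative:

import dask.array as da
import numpy as np

from geoutils.raster.delayed import delayed_interp_points

darr = da.random.random((1000, 1000), chunks=(250, 250))

# Point coordinates within the array extent, in georeferenced units
rng = np.random.default_rng(seed=42)
x = rng.uniform(0, darr.shape[0] - 1, 100)
y = rng.uniform(0, darr.shape[1] - 1, 100)

# Linear interpolation at the points, computed chunk by chunk
vals = delayed_interp_points(darr, points=(x, y), resolution=(1, 1), method="linear")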
@@ -293,33 +326,41 @@ def delayed_interp_points(darr: da.Array, # Get robust list of starts (using what is done in block_id of dask.array.map_blocks) # https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 from dask.utils import cached_cumsum + starts = [cached_cumsum(c, initial_zero=True) for c in darr.chunks] num_chunks = expanded.numblocks # Get samples indices per blocks - ind_per_block = _get_interp_indices_per_block(points[0, :], points[1, :], starts, num_chunks, chunksize, resolution[0], resolution[1]) + ind_per_block = _get_interp_indices_per_block( + points[0, :], points[1, :], starts, num_chunks, chunksize, resolution[0], resolution[1] + ) # Create a delayed object for each block, and flatten the blocks into a 1d shape blocks = expanded.to_delayed().ravel() # Build the block IDs by unravelling starting indexes for each block indexes_xi, indexes_yi = np.unravel_index(np.arange(len(blocks)), shape=(num_chunks[0], num_chunks[1])) - block_ids = [{"xstart": starts[0][indexes_xi[i]] - map_depth[method], - "ystart": starts[1][indexes_yi[i]] - map_depth[method], - "xres": resolution[0], - "yres": resolution[1]} - for i in range(len(blocks))] + block_ids = [ + { + "xstart": starts[0][indexes_xi[i]] - map_depth[method], + "ystart": starts[1][indexes_yi[i]] - map_depth[method], + "xres": resolution[0], + "yres": resolution[1], + } + for i in range(len(blocks)) + ] # Compute values delayed - list_interp = [_delayed_interp_points_block(blocks[i], - block_ids[i], - points[:, ind_per_block[i]]) - for i, data_chunk in enumerate(blocks) - if len(ind_per_block[i]) > 0 - ] + list_interp = [ + _delayed_interp_points_block(blocks[i], block_ids[i], points[:, ind_per_block[i]]) + for i, data_chunk in enumerate(blocks) + if len(ind_per_block[i]) > 0 + ] # We define the expected output shape and dtype to simplify things for Dask - list_interp_delayed = [da.from_delayed(p, shape=(1, len(ind_per_block[i])), dtype=darr.dtype) for i, p in enumerate(list_interp)] + list_interp_delayed = [ + da.from_delayed(p, shape=(1, len(ind_per_block[i])), dtype=darr.dtype) for i, p in enumerate(list_interp) + ] interp_points = np.concatenate(dask.compute(*list_interp_delayed), axis=0) # Re-order per-block output points to match their original indices @@ -329,6 +370,7 @@ def delayed_interp_points(darr: da.Array, return interp_points + # 3/ REPROJECT # Part of the code (defining a GeoGrid and GeoTiling classes) in inspired by # https://github.com/opendatacube/odc-geo/pull/88, modified to be more concise and rely only on Rasterio/GeoPandas @@ -391,13 +433,10 @@ def footprint_projected(self, crs: rio.crs.CRS = None): crs = self.crs return _get_footprint_projected(self.bounds, in_crs=self.crs, out_crs=crs, densify_points=100) - def shift(self, - xoff: float, - yoff: float, - distance_unit: Literal["georeferenced"] | Literal["pixel"] = "pixel"): + def shift(self, xoff: float, yoff: float, distance_unit: Literal["georeferenced"] | Literal["pixel"] = "pixel"): """Shift geogrid, not inplace.""" - if distance_unit not in["georeferenced", "pixel"]: + if distance_unit not in ["georeferenced", "pixel"]: raise ValueError("Argument 'distance_unit' should be either 'pixel' or 'georeferenced'.") # Get transform @@ -422,15 +461,24 @@ def _get_block_ids_per_chunk(chunks: tuple[tuple[int, ...], tuple[int, ...]]) -> # Get robust list of chunk locations (using what is done in block_id of dask.array.map_blocks) # 
https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 from dask.utils import cached_cumsum + starts = [cached_cumsum(c, initial_zero=True) for c in chunks] nb_blocks = num_chunks[0] * num_chunks[1] ixi, iyi = np.unravel_index(np.arange(nb_blocks), shape=(num_chunks[0], num_chunks[1])) - block_ids = [{"num_block": i, "ys": starts[0][ixi[i]], "xs": starts[1][iyi[i]], - "ye": starts[0][ixi[i] + 1], "xe": starts[1][iyi[i] + 1]} - for i in range(nb_blocks)] + block_ids = [ + { + "num_block": i, + "ys": starts[0][ixi[i]], + "xs": starts[1][iyi[i]], + "ye": starts[0][ixi[i] + 1], + "xe": starts[1][iyi[i] + 1], + } + for i in range(nb_blocks) + ] return block_ids + class ChunkedGeoGrid: """ Chunked georeferenced grid class. @@ -485,12 +533,27 @@ def _chunks2d_from_chunksizes_shape(chunksizes: tuple[int, int], shape: tuple[in """Get tuples of chunk sizes for X/Y dimensions based on chunksizes and array shape.""" # Chunksize is fixed, except for the last chunk depending on the shape - chunks_y = tuple(min(chunksizes[0], shape[0] - i*chunksizes[0],) for i in range(int(np.ceil(shape[0]/chunksizes[0])))) - chunks_x = tuple(min(chunksizes[1], shape[1] - i*chunksizes[1],) for i in range(int(np.ceil(shape[1]/chunksizes[1])))) + chunks_y = tuple( + min( + chunksizes[0], + shape[0] - i * chunksizes[0], + ) + for i in range(int(np.ceil(shape[0] / chunksizes[0]))) + ) + chunks_x = tuple( + min( + chunksizes[1], + shape[1] - i * chunksizes[1], + ) + for i in range(int(np.ceil(shape[1] / chunksizes[1]))) + ) return chunks_y, chunks_x -def _combined_blocks_shape_transform(sub_block_ids: list[dict[str, int]], src_geogrid: GeoGrid) -> tuple[dict[str, Any], list[dict[str, int]]]: + +def _combined_blocks_shape_transform( + sub_block_ids: list[dict[str, int]], src_geogrid: GeoGrid +) -> tuple[dict[str, Any], list[dict[str, int]]]: """Derive combined shape and transform from a subset of several blocks (for source input during reprojection).""" # Get combined shape by taking min of X/Y starting indices, max of X/Y ending indices @@ -503,7 +566,10 @@ def _combined_blocks_shape_transform(sub_block_ids: list[dict[str, int]], src_ge # Compute relative block indexes that will be needed to reconstruct a square array in the delayed function, # by subtracting the minimum starting indices in X/Y - relative_block_indexes = [{"r"+s1+s2: b[s1+s2] - minmaxs["min_"+s1+"s"] for s1 in ["x", "y"] for s2 in ["s", "e"]} for b in sub_block_ids] + relative_block_indexes = [ + {"r" + s1 + s2: b[s1 + s2] - minmaxs["min_" + s1 + "s"] for s1 in ["x", "y"] for s2 in ["s", "e"]} + for b in sub_block_ids + ] combined_meta = {"src_shape": combined_shape, "src_transform": tuple(combined_transform)} @@ -511,9 +577,11 @@ def _combined_blocks_shape_transform(sub_block_ids: list[dict[str, int]], src_ge @dask.delayed -def _delayed_reproject_per_block(*src_arrs: tuple[np.ndarray], block_ids: list[dict[str, int]], combined_meta: dict[str, Any], **kwargs: Any) -> np.ndarray: +def _delayed_reproject_per_block( + *src_arrs: tuple[np.ndarray], block_ids: list[dict[str, int]], combined_meta: dict[str, Any], **kwargs: Any +) -> np.ndarray: """ - Delayed reprojection per destination block (need to rebuild a square source array combined from intersecting blocks). + Delayed reprojection per destination block (also rebuilds a square array combined from intersecting source blocks). 
""" # If no source chunk intersects, we return a chunk of destination nodata values @@ -529,7 +597,7 @@ def _delayed_reproject_per_block(*src_arrs: tuple[np.ndarray], block_ids: list[d # Then fill it with the source chunks values for i, arr in enumerate(src_arrs): bid = block_ids[i] - comb_src_arr[bid["rys"]:bid["rye"], bid["rxs"]:bid["rxe"]] = arr + comb_src_arr[bid["rys"] : bid["rye"], bid["rxs"] : bid["rxe"]] = arr # Now, we can simply call Rasterio! @@ -541,31 +609,33 @@ def _delayed_reproject_per_block(*src_arrs: tuple[np.ndarray], block_ids: list[d dst_arr = np.zeros(combined_meta["dst_shape"], dtype=comb_src_arr.dtype) _ = rio.warp.reproject( - comb_src_arr, - dst_arr, - src_transform=src_transform, - src_crs=kwargs["src_crs"], - dst_transform=dst_transform, - dst_crs=kwargs["dst_crs"], - resampling=kwargs["resampling"], - src_nodata=kwargs["src_nodata"], - dst_nodata=kwargs["dst_nodata"], - ) + comb_src_arr, + dst_arr, + src_transform=src_transform, + src_crs=kwargs["src_crs"], + dst_transform=dst_transform, + dst_crs=kwargs["dst_crs"], + resampling=kwargs["resampling"], + src_nodata=kwargs["src_nodata"], + dst_nodata=kwargs["dst_nodata"], + ) return dst_arr -def delayed_reproject(darr: da.Array, - src_transform: rio.transform.Affine, - src_crs: rio.crs.CRS, - dst_transform: rio.transform.Affine, - dst_shape: tuple[int, int], - dst_crs: rio.crs.CRS, - resampling: rio.enums.Resampling, - src_nodata: int | float = None, - dst_nodata: int | float = None, - dst_chunksizes: tuple[int, int] = None, - **kwargs: Any) -> da.Array: +def delayed_reproject( + darr: da.Array, + src_transform: rio.transform.Affine, + src_crs: rio.crs.CRS, + dst_transform: rio.transform.Affine, + dst_shape: tuple[int, int], + dst_crs: rio.crs.CRS, + resampling: rio.enums.Resampling, + src_nodata: int | float = None, + dst_nodata: int | float = None, + dst_chunksizes: tuple[int, int] = None, + **kwargs: Any, +) -> da.Array: """ Reproject georeferenced raster on out-of-memory chunks. 
@@ -614,8 +684,12 @@ def delayed_reproject(darr: da.Array, # 3/ To reconstruct a square source array during chunked reprojection, we need to derive the combined shape and # transform of each tuples of source blocks src_block_ids = np.array(src_geotiling.get_block_locations()) - meta_params = [_combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid) - if len(sbid) > 0 else ({}, []) for sbid in dest2source] + meta_params = [ + _combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid) + if len(sbid) > 0 + else ({}, []) + for sbid in dest2source + ] # We also add the output transform/shape for this destination chunk in the combined meta # (those are the only two that are chunk-specific) dst_block_geogrids = dst_geotiling.get_blocks_as_geogrids() @@ -625,28 +699,40 @@ def delayed_reproject(darr: da.Array, # 4/ Call a delayed function that uses rio.warp to reproject the combined source block(s) to each destination block # Add fixed arguments to keywords - kwargs.update({"src_nodata": src_nodata, "dst_nodata": dst_nodata, "resampling": resampling, - "src_crs": src_crs, "dst_crs": dst_crs}) + kwargs.update( + { + "src_nodata": src_nodata, + "dst_nodata": dst_nodata, + "resampling": resampling, + "src_crs": src_crs, + "dst_crs": dst_crs, + } + ) # Create a delayed object for each block, and flatten the blocks into a 1d shape blocks = darr.to_delayed().ravel() # Run the delayed reprojection, looping for each destination block - list_reproj = [_delayed_reproject_per_block(*blocks[dest2source[i]], - block_ids=meta_params[i][1], - combined_meta=meta_params[i][0], - **kwargs) - for i in range(len(dest2source)) - ] + list_reproj = [ + _delayed_reproject_per_block( + *blocks[dest2source[i]], block_ids=meta_params[i][1], combined_meta=meta_params[i][0], **kwargs + ) + for i in range(len(dest2source)) + ] # We define the expected output shape and dtype to simplify things for Dask - list_reproj_delayed = [da.from_delayed(r, shape=dst_block_geogrids[i].shape, dtype=darr.dtype) for i, r in - enumerate(list_reproj)] + list_reproj_delayed = [ + da.from_delayed(r, shape=dst_block_geogrids[i].shape, dtype=darr.dtype) for i, r in enumerate(list_reproj) + ] # Array comes out as flat blocks x chunksize0 (varying) x chunksize1 (varying), so we can't reshape directly # We need to unravel the flattened blocks indices to align X/Y, then concatenate all columns, then rows - indexes_xi, indexes_yi = np.unravel_index(np.arange(len(dest2source)), shape=(len(dst_chunks[0]), len(dst_chunks[1]))) + indexes_xi, indexes_yi = np.unravel_index( + np.arange(len(dest2source)), shape=(len(dst_chunks[0]), len(dst_chunks[1])) + ) - lists_columns = [[l for i, l in enumerate(list_reproj_delayed) if j == indexes_xi[i]] for j in range(len(dst_chunks[0]))] + lists_columns = [ + [l for i, l in enumerate(list_reproj_delayed) if j == indexes_xi[i]] for j in range(len(dst_chunks[0])) + ] concat_columns = [da.concatenate(c, axis=1) for c in lists_columns] concat_all = da.concatenate(concat_columns, axis=0) diff --git a/requirements.txt b/requirements.txt index 68bac6e0..149d6fe3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ numpy==1.* scipy==1.* tqdm xarray +dask rioxarray==0.* diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 975b1033..b0e3c118 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -1,16 +1,20 @@ """Tests for dask functions (temporarily in xDEM, might move to 
GeoUtils).""" import os +import dask import numpy as np import pytest +import rasterio as rio import xarray as xr -import dask.array as da from pyproj import CRS -import rasterio as rio -import dask from geoutils.examples import _EXAMPLES_DIRECTORY -from geoutils.raster.delayed import delayed_subsample, delayed_interp_points, delayed_reproject +from geoutils.raster.delayed import ( + delayed_interp_points, + delayed_reproject, + delayed_subsample, +) + class TestDelayed: @@ -65,8 +69,8 @@ def test_delayed_interp_points(self, ninterp: int): interp1 = delayed_interp_points(darr=darr, points=(interp_x, interp_y), resolution=(1, 1)) # Interpolate directly with Xarray and compare results are the same - xx = xr.DataArray(interp_x, dims='z', name='x') - yy = xr.DataArray(interp_y, dims='z', name='y') + xx = xr.DataArray(interp_x, dims="z", name="x") + yy = xr.DataArray(interp_y, dims="z", name="y") interp2 = ds.test.interp(x=xx, y=yy) interp2.compute() interp2 = np.array(interp2.values) @@ -80,7 +84,7 @@ def test_delayed_reproject(self): ds = xr.open_dataset(self.fn_tmp, chunks={"x": self.chunksize, "y": self.chunksize}) darr = ds["test"].data - dask.config.set(scheduler='single-threaded') + dask.config.set(scheduler="single-threaded") src_shape = darr.shape @@ -97,7 +101,6 @@ def test_delayed_reproject(self): dst_chunksizes = (7, 5) # Build an intersecting dst_transform that is not aligned - src_res = (src_transform[0], abs(src_transform[4])) bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(src_shape[0], src_shape[1], src_transform)) # First, an aligned transform in the new CRS that allows to get # temporary new bounds and resolution in the units of the new CRS @@ -116,8 +119,9 @@ def test_delayed_reproject(self): tmp_res = (tmp_transform[0], abs(tmp_transform[4])) tmp_bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(dst_shape[0], dst_shape[1], tmp_transform)) # Now we modify the destination grid by changing bounds by a bit + the resolution - dst_transform = rio.transform.from_origin(tmp_bounds.left + 100*tmp_res[0], tmp_bounds.top + 150*tmp_res[0], - tmp_res[0]*2.5, tmp_res[1]*0.7) + dst_transform = rio.transform.from_origin( + tmp_bounds.left + 100 * tmp_res[0], tmp_bounds.top + 150 * tmp_res[0], tmp_res[0] * 2.5, tmp_res[1] * 0.7 + ) # Other arguments src_nodata = -9999 @@ -125,9 +129,18 @@ def test_delayed_reproject(self): resampling = rio.enums.Resampling.bilinear # Run delayed reproject - reproj_arr = delayed_reproject(darr, src_transform=src_transform, src_crs=src_crs, dst_transform=dst_transform, - dst_crs=dst_crs, dst_shape=dst_shape, src_nodata=src_nodata, dst_nodata=dst_nodata, - resampling=resampling, dst_chunksizes=dst_chunksizes) + reproj_arr = delayed_reproject( + darr, + src_transform=src_transform, + src_crs=src_crs, + dst_transform=dst_transform, + dst_crs=dst_crs, + dst_shape=dst_shape, + src_nodata=src_nodata, + dst_nodata=dst_nodata, + resampling=resampling, + dst_chunksizes=dst_chunksizes, + ) # Save file out-of-memory # TODO: Would need to wrap the georef data in the netCDF, but not needed to test this From 57ad08df7ee6fd3c4fae3ffc5871b7d432d42ce8 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Thu, 25 Apr 2024 22:19:21 -0800 Subject: [PATCH 05/22] Finalize tests --- dev-environment.yml | 2 + geoutils/raster/delayed.py | 17 +- tests/test_raster/test_delayed.py | 482 ++++++++++++++++++++++++------ 3 files changed, 412 insertions(+), 89 deletions(-) diff --git a/dev-environment.yml b/dev-environment.yml index 50f65391..04256a02 100644 --- 
a/dev-environment.yml +++ b/dev-environment.yml @@ -28,6 +28,8 @@ dependencies: - pyyaml - flake8 - pylint + - netcdf4 # To write synthetic data with chunksizes + - dask-memusage # Doc dependencies - sphinx diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index 6fc8b668..bc118564 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -84,6 +84,7 @@ def _get_indices_block_per_subsample( # We can write a faster algorithm by sorting xxs = np.sort(xxs) + # TODO: Write nested lists into array format to further save RAM? # We define a list of indices per block relative_ind_per_block = [[] for _ in range(num_chunks[0] * num_chunks[1])] k = 0 # K is the block number @@ -372,8 +373,9 @@ def delayed_interp_points( # 3/ REPROJECT -# Part of the code (defining a GeoGrid and GeoTiling classes) in inspired by -# https://github.com/opendatacube/odc-geo/pull/88, modified to be more concise and rely only on Rasterio/GeoPandas +# Part of the code (defining a GeoGrid and GeoTiling classes) is inspired by +# https://github.com/opendatacube/odc-geo/pull/88, modified to be concise, stand-alone and rely only on +# Rasterio/GeoPandas # Could be submitted as a PR to Rioxarray? (but not sure the dependency to GeoPandas would work there) # We define a GeoGrid and GeoTiling class (which composes GeoGrid) to consistently deal with georeferenced footprints @@ -444,8 +446,13 @@ def shift(self, xoff: float, yoff: float, distance_unit: Literal["georeferenced" # Convert pixel offsets to georeferenced units if distance_unit == "pixel": - xoff *= self.res[0] - yoff *= self.res[1] + # Multiplying the offset by the resolution might amplify floating precision issues + # xoff *= self.res[0] + # yoff *= self.res[1] + + # Using the boundaries instead! + xoff = xoff / self.shape[1] * (self.bounds.right - self.bounds.left) + yoff = yoff / self.shape[0] * (self.bounds.top - self.bounds.bottom) shifted_transform = rio.transform.Affine(dx, b, xmin + xoff, d, dy, ymax + yoff) @@ -597,7 +604,7 @@ def _delayed_reproject_per_block( # Then fill it with the source chunks values for i, arr in enumerate(src_arrs): bid = block_ids[i] - comb_src_arr[bid["rys"] : bid["rye"], bid["rxs"] : bid["rxe"]] = arr + comb_src_arr[bid["rys"]:bid["rye"], bid["rxs"]:bid["rxe"]] = arr # Now, we can simply call Rasterio! 
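The shift() change above replaces the resolution-based conversion of pixel offsets by one recomputed from the grid extent; a small sketch of the two formulations, with hypothetical bounds and shape standing in for the GeoGrid attributes:

    # Hypothetical grid: bounds and shape standing in for self.bounds and self.shape
    left, bottom, right, top = 0.0, 0.0, 10.0, 10.0
    nrows, ncols = 1000, 1000
    xoff_pix, yoff_pix = 3.0, -2.0  # Offsets expressed in pixels

    # Resolution-based conversion (the previous formulation)
    xres, yres = (right - left) / ncols, (top - bottom) / nrows
    xoff_geo = xoff_pix * xres
    yoff_geo = yoff_pix * yres

    # Extent-based conversion (the formulation introduced above)
    xoff_geo2 = xoff_pix / ncols * (right - left)
    yoff_geo2 = yoff_pix / nrows * (top - bottom)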
diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index b0e3c118..a59b7d91 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -1,12 +1,16 @@ -"""Tests for dask functions (temporarily in xDEM, might move to GeoUtils).""" +"""Tests for dask-delayed functions.""" import os +from tempfile import NamedTemporaryFile +from typing import Callable, Any -import dask +import dask.array as da import numpy as np +import pandas as pd import pytest import rasterio as rio import xarray as xr from pyproj import CRS +from dask.distributed import Client, LocalCluster from geoutils.examples import _EXAMPLES_DIRECTORY from geoutils.raster.delayed import ( @@ -15,34 +19,275 @@ delayed_subsample, ) +from pluggy import PluggyTeardownRaisedWarning +from dask_memusage import install + +# Ignore teardown warning given by Dask when closing the local cluster (due to dask-memusage plugin) +pytestmark = pytest.mark.filterwarnings("ignore", category=PluggyTeardownRaisedWarning) + +# Fixture to use a single cluster for the entire module +@pytest.fixture(scope='module') +def cluster(): + # Need cluster to be single-threaded to use dask-memusage confidently + dask_cluster = LocalCluster(n_workers=2, threads_per_worker=1, dashboard_address=None) + yield dask_cluster + dask_cluster.close() + +def _run_dask_measuring_memusage(cluster, dask_func: Callable, *args_dask_func: Any, **kwargs_dask_func: Any) -> tuple[Any, float]: + """Run a dask function monitoring its memory usage.""" + + # Create a name temporary file that won't delete immediatley + fn_tmp_csv = NamedTemporaryFile(suffix=".csv", delete=False).name + + # Setup cluster and client within context managers for a clean shutdown + install(cluster.scheduler, fn_tmp_csv) + with Client(cluster) as client: + outputs = dask_func(*args_dask_func, **kwargs_dask_func) + + # Read memusage file and cleanup + df = pd.read_csv(fn_tmp_csv) + os.remove(fn_tmp_csv) + + # Keep only non-zero memory usage + ind_nonzero = df.max_memory_mb != 0 + + # Compute peak additional memory usage from min baseline + memusage_mb = np.max(df.max_memory_mb[ind_nonzero]) - np.min(df.max_memory_mb[ind_nonzero]) + + return outputs, memusage_mb + +def _estimate_subsample_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int) -> float: + """ + Estimate the theoretical memory usage of the delayed subsampling method. 
+ (we don't need to be super precise, just within a factor of ~2 to check memory usage performs as expected) + """ + + # TOTAL SIZE = Single chunk operations + Subsample indexes + Metadata passed to dask + Outputs + + # On top of the rest is added the Dask graph, we will multiply by a factor of 2 to get a good safety margin + fac_dask_margin = 2 + num_chunks = np.prod(darr.numblocks) + + # Single chunk operation = (data type bytes + boolean from np.isfinite) * chunksize **2 + chunk_memusage = (darr.dtype.itemsize + np.dtype("bool").itemsize) * np.prod(chunksizes_in_mem) + + # 1D index subsample size: integer type * subsample_size + sample_memusage = np.dtype("int32").itemsize * subsample_size + + # Outputs: number of valid pixels + subsample + valids_memusage = np.dtype("int32").itemsize * num_chunks + subout_memusage = np.dtype(darr.dtype).itemsize * subsample_size + out_memusage = valids_memusage + subout_memusage + + # Size of metadata passed to dask: number of blocks times its content + # Content of a metadata block = list (block size) of list (subsample size) of integer indexes + size_index_int = 28 # Python size for int + list_all_blocks = 64 + 8 * num_chunks # A list is 64 + 8 bits per element, without memory of contained elements + list_per_block = 64 * num_chunks + 8 * subsample_size + size_index_int * subsample_size # Rough max estimate + meta_memusage = list_per_block + list_all_blocks + + # Final estimate of memory usage of operation in MB + max_op_memusage = fac_dask_margin * (chunk_memusage + sample_memusage + out_memusage + meta_memusage) / (2 ** 20) + # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) + max_op_memusage += 50 + 10 * (num_chunks / 1000) + + return max_op_memusage + +def _estimate_interp_points_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, int], ninterp: int) -> float: + """ + Estimate the theoretical memory usage of the delayed interpolation method. 
+ (we don't need to be super precise, just within a factor of ~2 to check memory usage performs as expected) + """ + + # TOTAL SIZE = Single chunk operations + Chunk overlap + Metadata passed to dask + Outputs + + # On top of the rest is added the Dask graph, we will multiply by a factor of 2 to get a good safety margin + fac_dask_margin = 2 + num_chunks = np.prod(darr.numblocks) + + # Single chunk operation = (data type bytes + boolean from np.isfinite) * chunksize **2 + chunk_memusage = (darr.dtype.itemsize + np.dtype("bool").itemsize) * np.prod(chunksizes_in_mem) + # For interpolation, chunks have to overlap and temporarily load each neighbouring chunk, + # we add 8 neighbouring chunks + chunk_memusage += darr.dtype.itemsize * np.prod(chunksizes_in_mem) * 8 + + # Outputs: interpolate coordinates + out_memusage = np.dtype(darr.dtype).itemsize * ninterp * 2 + + # Size of metadata passed to dask: number of blocks times its content + # Content of a metadata block = list (block size) of list (subsample size) of integer + size_index_int = 28 # Python size for int + size_index_float = 24 # Python size for float + list_all_blocks = 64 + 8 * num_chunks # A list is 64 + 8 bits per element, without memory of contained elements + list_per_block = 64 * num_chunks + 8 * ninterp + size_index_int * ninterp # Rough max estimate + # And a list for each block of dict with 4 floats (xres, yres, xstart, ystart) + dict_all_blocks = 64 * num_chunks + 4 * size_index_float * 64 * num_chunks + meta_memusage = list_per_block + list_all_blocks + dict_all_blocks + + # Final estimate of memory usage of operation in MB + max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2 ** 20) + # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) + max_op_memusage += 50 + 10 * (num_chunks / 1000) + + return max_op_memusage + +def _estimate_reproject_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, int], dst_chunksizes: tuple[int, int], + rel_res_fac: tuple[float, float]) -> float: + """ + Estimate the theoretical memory usage of the delayed reprojection method. + (we don't need to be super precise, just within a factor of ~2 to check memory usage performs as expected) + """ + + # TOTAL SIZE = Combined source chunk operations + Building geopandas mapping + Metadata passed to dask + Outputs + + # On top of the rest is added the Dask graph, we will multiply by a factor of 2 to get a good safety margin + fac_dask_margin = 2 + num_chunks = np.prod(darr.numblocks) + + # THE BIG QUESTION: how many maximum source chunks might be loaded for a single destination chunk? 
+ # It depends on the relative ratio of input chunksizes to destination chunksizes, accounting for resolution change + x_rel_source_chunks = dst_chunksizes[0] / chunksizes_in_mem[0] * rel_res_fac[0] + y_rel_source_chunks = dst_chunksizes[1] / chunksizes_in_mem[1] * rel_res_fac[1] + # There is also some overlap needed for resampling and due to warping in different CRS, so let's multiply this by 8 + # (all neighbouring tiles) + nb_source_chunks_per_dest = 8 * x_rel_source_chunks * y_rel_source_chunks + + # Combined memory usage of one chunk operation = squared array made from combined chunksize + original chunks + total_nb = np.ceil(np.sqrt(nb_source_chunks_per_dest))**2 + nb_source_chunks_per_dest + # We multiply the memory usage of a single chunk to the number of loaded/combined chunks + chunk_memusage = darr.dtype.itemsize * np.prod(chunksizes_in_mem) * total_nb + + # Outputs: reprojected raster + out_memusage = np.dtype(darr.dtype).itemsize * np.prod(dst_chunksizes) + + # Size of metadata passed to dask: number of blocks times its content + # For each block, we pass a dict with 4 integers per source chunk (rxs, rxe, rys, rye) + size_index_float = 24 # Python size for float + size_index_int = 28 # Python size for float + dict_all_blocks = (64 + 4 * size_index_int * nb_source_chunks_per_dest) * num_chunks + # Passing the 2 CRS, 2 transforms, resampling methods and 2 nodatas + combined_meta = (112 + 112 + 56 + 56 + 44 + 28 + 28) * size_index_float * num_chunks + meta_memusage = combined_meta + dict_all_blocks + + # Final estimate of memory usage of operation in MB + max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2 ** 20) + # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) + max_op_memusage += 50 + 10 * (num_chunks / 1000) + + return max_op_memusage + + +def _build_dst_transform_shifted_newres(src_transform: rio.transform.Affine, + src_shape: tuple[int, int], + src_crs: CRS, + dst_crs: CRS, + bounds_rel_shift: tuple[float, float], + res_rel_fac: tuple[float, float]) -> rio.transform.Affine: + """ + Build a destination transform intersecting the source transform given source/destination shapes, + and possibly introducing a relative shift in upper-left bound and multiplicative change in resolution. + """ + + # Get bounding box in source CRS + bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(src_shape[0], src_shape[1], src_transform)) + + # Construct an aligned transform in the destination CRS assuming the same resolution + tmp_transform = rio.warp.calculate_default_transform( + src_crs, + dst_crs, + src_shape[1], + src_shape[0], + left=bounds.left, + right=bounds.right, + top=bounds.top, + bottom=bounds.bottom, + dst_width=src_shape[1], + dst_height=src_shape[0], + )[0] + # This allows us to get bounds and resolution in the units of the new CRS + tmp_res = (tmp_transform[0], abs(tmp_transform[4])) + tmp_bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(src_shape[0], src_shape[1], tmp_transform)) + # Now we can create a shifted/different-res destination grid + dst_transform = rio.transform.from_origin( + west=tmp_bounds.left + bounds_rel_shift[0] * tmp_res[0] * src_shape[1], + north=tmp_bounds.top + 150 * bounds_rel_shift[0] * tmp_res[1] * src_shape[0], + xsize=tmp_res[0] / res_rel_fac[0], + ysize=tmp_res[1] / res_rel_fac[1]) + + return dst_transform class TestDelayed: + """ + Testing delayed functions is pretty straightforward. 
+ + We test on rasters big enough to clearly monitor the memory usage, but small enough to fit in-memory to check + the function outputs against the ones from in-memory methods. + + In details: + 1. We compare to the in-memory function output for the set of input variables that are used to build the delayed + algorithm and might lead to new errors (for example: array shape to get subsample/points locations for + subsample and interp_points, or destination chunksizes to map output of reproject). + 2. During execution, we capture memory usage and check that only the expected amount of memory + (one or several chunk combinations + metadata) is used during the compute() call. + """ + + # Write big test files on disk out-of-memory, with different input shapes not necessarily aligned between themselves + # or with chunks + fn_nc_shape = {"test_square.nc": (10000, 10000), + "test_complex.nc": (5511, 6768)} + # We can use a constant value for storage chunks, it doesn't have any influence on the accuracy of delayed methods + # (can change slightly RAM usage, but pretty stable as long as chunksizes in memory are larger and + # significantly bigger) + chunksizes_on_disk = (200, 200) + list_fn = [] + for fn_basename in fn_nc_shape.keys(): + fn = os.path.join(_EXAMPLES_DIRECTORY, fn_basename) + list_fn.append(fn) + if not os.path.exists(fn): + # Create random array in the right shape + test_shape = fn_nc_shape[fn_basename] + rng = da.random.default_rng(seed=42) + data = rng.normal(size=test_shape[0] * test_shape[1]).reshape(test_shape[0], test_shape[1]) + data_arr = xr.DataArray(data=data, dims=["x", "y"]) + ds = xr.Dataset(data_vars={"test": data_arr}) + encoding_kwargs = {"test": {"chunksizes": chunksizes_on_disk}} + writer = ds.to_netcdf(fn, encoding=encoding_kwargs, compute=False) + writer.compute() + + @pytest.mark.parametrize("fn", list_fn) + @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) + @pytest.mark.parametrize("subsample_size", [100, 100000]) + def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, cluster: Any): + """ + Checks for delayed subsampling function, both for output and memory usage. + Variables that influence specifically the delayed function tested here: + - Input chunksizes, + - Input array shape, + - Number of subsampled points. + """ - # Create test file (replace with data in time?) 
- fn_tmp = os.path.join(_EXAMPLES_DIRECTORY, "test.nc") - if not os.path.exists(fn_tmp): - data = np.random.normal(size=400000000).reshape(20000, 20000) - data_arr = xr.DataArray(data=data, dims=["x", "y"]) - ds = xr.Dataset(data_vars={"test": data_arr}) - encoding_kwargs = {"test": {"chunksizes": (100, 100)}} - ds.to_netcdf(fn_tmp, encoding=encoding_kwargs) - del ds, data_arr, data - - # Chunk size in memory - chunksize = 500 - - @pytest.mark.parametrize("subsample_size", [2, 100, 100000]) - def test_delayed_subsample(self, subsample_size: int): - """Checks for delayed subsampling function.""" - - # Open dataset with chunks - ds = xr.open_dataset(self.fn_tmp, chunks={"x": self.chunksize, "y": self.chunksize}) + # 0/ Open dataset with chunks + ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) darr = ds["test"].data + # 1/ Estimation of theoretical memory usage of the subsampling script + + max_op_memusage = _estimate_subsample_memusage(darr=darr, chunksizes_in_mem=chunksizes_in_mem, + subsample_size=subsample_size) + + # 2/ Run delayed subsample with dask memory usage monitoring + # Derive subsample from delayed function - sub = delayed_subsample(darr, subsample=subsample_size, random_state=42) + # (passed to wrapper function to measure memory usage during execution) + sub, measured_op_memusage = _run_dask_measuring_memusage(cluster, delayed_subsample, darr, + subsample=subsample_size, random_state=42) + + # Check the measured memory usage is smaller than the maximum estimated one + assert measured_op_memusage < max_op_memusage - # The subsample should have exactly the prescribed length, with only valid values + # 3/ Output checks + + # # The subsample should have exactly the prescribed length, with only valid values assert len(sub) == subsample_size assert all(np.isfinite(sub)) @@ -52,23 +297,41 @@ def test_delayed_subsample(self, subsample_size: int): sub2 = np.array(darr.vindex[indices[0], indices[1]]) assert np.array_equal(sub, sub2) - @pytest.mark.parametrize("ninterp", [2, 100, 100000]) - def test_delayed_interp_points(self, ninterp: int): - """Checks for delayed interpolate points function.""" + @pytest.mark.parametrize("fn", list_fn) + @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) + @pytest.mark.parametrize("ninterp", [100, 100000]) + def test_delayed_interp_points(self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, cluster: Any): + """ + Checks for delayed interpolate points function. + Variables that influence specifically the delayed function tested here: + - Input chunksizes, + - Input array shape, + - Number of interpolated points. 
+ """ - # Open dataset with chunks - ds = xr.open_dataset(self.fn_tmp, chunks={"x": self.chunksize, "y": self.chunksize}) + # 0/ Open dataset with chunks and create random point locations to interpolate + ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) darr = ds["test"].data - # Create random point coordinates to interpolate rng = np.random.default_rng(seed=42) interp_x = rng.choice(ds.x.size, ninterp) + rng.random(ninterp) interp_y = rng.choice(ds.y.size, ninterp) + rng.random(ninterp) - # Interpolate with delayed function - interp1 = delayed_interp_points(darr=darr, points=(interp_x, interp_y), resolution=(1, 1)) + # 1/ Estimation of theoretical memory usage of the subsampling script + max_op_memusage = _estimate_interp_points_memusage(darr=darr, chunksizes_in_mem=chunksizes_in_mem, + ninterp=ninterp) + + + # 2/ Run interpolation of random point coordinates with memory monitoring + + interp1, measured_op_memusage = _run_dask_measuring_memusage(cluster, delayed_interp_points, darr, + points=(interp_x, interp_y), resolution=(1, 1)) + # Check the measured memory usage is smaller than the maximum estimated one + assert measured_op_memusage < max_op_memusage - # Interpolate directly with Xarray and compare results are the same + # 3/ Output checks + + # Interpolate directly with Xarray (loads a lot in memory) and check results are exactly the same xx = xr.DataArray(interp_x, dims="z", name="x") yy = xr.DataArray(interp_y, dims="z", name="y") interp2 = ds.test.interp(x=xx, y=yy) @@ -77,59 +340,88 @@ def test_delayed_interp_points(self, ninterp: int): assert np.array_equal(interp1, interp2, equal_nan=True) - # Let's check a lot of different scenarios - def test_delayed_reproject(self): + @pytest.mark.parametrize("fn", list_fn) + @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) + @pytest.mark.parametrize("dst_chunksizes", [(2000, 2000), (1398, 2983)]) + # Shift upper left corner of output bounds (relative to projected input bounds) by fractions of the raster size + @pytest.mark.parametrize("dst_bounds_rel_shift", [(0, 0), (-0.2, 0.5)]) + # Modify output resolution (relative to projected input resolution) by a factor + @pytest.mark.parametrize("dst_res_rel_fac", [(1, 1), (2.1, 0.54)]) + # Same for shape + @pytest.mark.parametrize("dst_shape_diff", [(0, 0), (-28, 117)]) + def test_delayed_reproject(self, fn: str, chunksizes_in_mem: tuple[int, int], + dst_chunksizes: tuple[int, int], dst_bounds_rel_shift: tuple[float, float], + dst_res_rel_fac: tuple[float, float], dst_shape_diff: tuple[int, int], + cluster: Any): + """ + Checks for the delayed reproject function. + Variables that influence specifically the delayed function tested here: + - Input/output chunksizes, + - Input array shape, + - Output geotransform relative to projected input geotransform, + - Output array shape relative to input. 
+ """ - # Open dataset with chunks - ds = xr.open_dataset(self.fn_tmp, chunks={"x": self.chunksize, "y": self.chunksize}) - darr = ds["test"].data + fn = list_fn[0] + chunksizes_in_mem = (2000, 2000) + dst_chunksizes = (1398, 2983) # (2000, 2000) + dst_bounds_rel_shift = (0, 0) + dst_res_rel_fac = (0.45, 0.45) # (1, 1) + dst_shape_diff = (0, 0) + cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None) - dask.config.set(scheduler="single-threaded") + # 0/ Open dataset with chunks and define variables + ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) + darr = ds["test"].data + # Get input and output shape src_shape = darr.shape + dst_shape = (src_shape[0] + dst_shape_diff[0], src_shape[1] + dst_shape_diff[1]) - # src_shape = (150, 100) - # src_chunksizes = (25, 10) - # rng = da.random.default_rng(seed=42) - # darr = rng.normal(size=src_shape, chunks=src_chunksizes) - # darr = da.ones(src_shape, chunks=src_chunksizes) + # Define arbitrary input/output CRS, they don't have a direct influence on the delayed method + # (as long as the input/output transforms intersect if projected in the same CRS) src_crs = CRS(4326) - src_transform = rio.transform.from_bounds(0, 0, 5, 5, src_shape[0], src_shape[1]) - - dst_shape = (77, 50) - dst_crs = CRS(32631) - dst_chunksizes = (7, 5) - - # Build an intersecting dst_transform that is not aligned - bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(src_shape[0], src_shape[1], src_transform)) - # First, an aligned transform in the new CRS that allows to get - # temporary new bounds and resolution in the units of the new CRS - tmp_transform = rio.warp.calculate_default_transform( - src_crs, - dst_crs, - src_shape[1], - src_shape[0], - left=bounds.left, - right=bounds.right, - top=bounds.top, - bottom=bounds.bottom, - dst_width=dst_shape[1], - dst_height=dst_shape[0], - )[0] - tmp_res = (tmp_transform[0], abs(tmp_transform[4])) - tmp_bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(dst_shape[0], dst_shape[1], tmp_transform)) - # Now we modify the destination grid by changing bounds by a bit + the resolution - dst_transform = rio.transform.from_origin( - tmp_bounds.left + 100 * tmp_res[0], tmp_bounds.top + 150 * tmp_res[0], tmp_res[0] * 2.5, tmp_res[1] * 0.7 - ) + dst_crs = CRS(32630) + + # Define arbitrary input transform, as we only care about the relative difference of the output transform + src_transform = rio.transform.from_bounds(10, 10, 15, 15, src_shape[0], src_shape[1]) - # Other arguments + # Other arguments having no influence src_nodata = -9999 dst_nodata = 99999 - resampling = rio.enums.Resampling.bilinear + resampling = rio.enums.Resampling.cubic + + # Get shifted dst_transform with new resolution + dst_transform = _build_dst_transform_shifted_newres(src_transform=src_transform, src_crs=src_crs, dst_crs=dst_crs, + src_shape=src_shape, bounds_rel_shift=dst_bounds_rel_shift, + res_rel_fac=dst_res_rel_fac) + + # 1/ Estimation of theoretical memory usage of the subsampling script + + max_op_memusage = _estimate_reproject_memusage(darr, chunksizes_in_mem=chunksizes_in_mem, dst_chunksizes=dst_chunksizes, + rel_res_fac=dst_res_rel_fac) + + # 2/ Run delayed reproject with memory monitoring - # Run delayed reproject - reproj_arr = delayed_reproject( + # We define a function where computes happens during writing to be able to measure memory usage + # (delayed_reproject returns a delayed array that might not fit in memory, unlike subsampling/interpolation) + fn_tmp_out = 
os.path.join(_EXAMPLES_DIRECTORY, os.path.splitext(os.path.basename(fn))[0] + "_reproj.nc") + + def reproject_and_write(*args, **kwargs): + + # Run delayed reprojection + reproj_arr_tmp = delayed_reproject(*args, **kwargs) + + # Save file out-of-memory and compute + data_arr = xr.DataArray(data=reproj_arr_tmp, dims=["x", "y"]) + ds_out = xr.Dataset(data_vars={"test_reproj": data_arr}) + write_delayed = ds_out.to_netcdf(fn_tmp_out, compute=False) + write_delayed.compute() + + # And call this function with memory usage monitoring + _, measured_op_memusage = _run_dask_measuring_memusage( + cluster, + reproject_and_write, darr, src_transform=src_transform, src_crs=src_crs, @@ -142,15 +434,12 @@ def test_delayed_reproject(self): dst_chunksizes=dst_chunksizes, ) - # Save file out-of-memory - # TODO: Would need to wrap the georef data in the netCDF, but not needed to test this - fn_tmp_out = os.path.join(_EXAMPLES_DIRECTORY, "test_reproj.nc") - data_arr = xr.DataArray(data=reproj_arr, dims=["x", "y"]) - ds_out = xr.Dataset(data_vars={"test_reproj": data_arr}) - write_delayed = ds_out.to_netcdf(fn_tmp_out, compute=False) - write_delayed.compute() + # Check the measured memory usage is smaller than the maximum estimated one + assert measured_op_memusage < max_op_memusage + + # 3/ Outputs check: load in memory and compare with a direct Rasterio reproject + reproj_arr = xr.open_dataset(fn_tmp_out)["test_reproj"].values - # Load in-memory and compare with a direct reproject dst_arr = np.zeros(dst_shape) _ = rio.warp.reproject( np.array(darr), @@ -164,4 +453,29 @@ def test_delayed_reproject(self): dst_nodata=dst_nodata, ) - assert np.allclose(reproj_arr.compute(), dst_arr, atol=0.02) + # Keeping this to debug in case this is not only a Rasterio issue + # if PLOT: + # import matplotlib.pyplot as plt + # plt.figure() + # plt.imshow((reproj_arr - dst_arr), cmap="RdYlBu", vmin=-0.2, vmax=0.2, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/diff_close_zero.png", dpi=500) + # plt.figure() + # plt.imshow(np.abs(reproj_arr - dst_arr), cmap="RdYlBu", vmin=99997, vmax=100001, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/diff_nodata.png", dpi=500) + # plt.figure() + # plt.imshow(dst_arr, cmap="RdYlBu", vmin=-1, vmax=1, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/dst.png", dpi=500) + + # Check that very little data (less than 0.01% of pixels) are significantly different + # (it seems to be mostly some pixels that are nodata in one and not the other) + ind_signif_diff = np.abs(reproj_arr - dst_arr) > 0.5 + assert np.count_nonzero(ind_signif_diff) < 0.01 / 100 * reproj_arr.size + + # The median difference is negligible compared to the amplitude of the signal (+/- 1 std) + assert np.nanmedian(np.abs(reproj_arr - dst_arr)) < 0.02 + + # # Replace with allclose once Rasterio issue fixed? 
+ # assert np.allclose(reproj_arr[~ind_both_nodata], dst_arr[~ind_both_nodata], atol=0.02) From 1dd508b0b884e7b6e62af6ebb16bf96f115a34fa Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 11:02:39 -0800 Subject: [PATCH 06/22] Clarify test description slightly more --- tests/test_raster/test_delayed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index a59b7d91..3ba617ea 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -224,11 +224,11 @@ class TestDelayed: the function outputs against the ones from in-memory methods. In details: - 1. We compare to the in-memory function output for the set of input variables that are used to build the delayed + 1. We compare with the in-memory function output only for the set of input variables that influence the delayed algorithm and might lead to new errors (for example: array shape to get subsample/points locations for - subsample and interp_points, or destination chunksizes to map output of reproject). + subsample and interp_points, or destination chunksizes to map output of reproject). 2. During execution, we capture memory usage and check that only the expected amount of memory - (one or several chunk combinations + metadata) is used during the compute() call. + (one or several chunk combinations + metadata) is indeed used during the compute() call. """ # Write big test files on disk out-of-memory, with different input shapes not necessarily aligned between themselves @@ -260,7 +260,7 @@ class TestDelayed: def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, cluster: Any): """ Checks for delayed subsampling function, both for output and memory usage. - Variables that influence specifically the delayed function tested here: + Variables that influence specifically the delayed function are: - Input chunksizes, - Input array shape, - Number of subsampled points. @@ -303,7 +303,7 @@ def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], su def test_delayed_interp_points(self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, cluster: Any): """ Checks for delayed interpolate points function. - Variables that influence specifically the delayed function tested here: + Variables that influence specifically the delayed function are: - Input chunksizes, - Input array shape, - Number of interpolated points. @@ -355,7 +355,7 @@ def test_delayed_reproject(self, fn: str, chunksizes_in_mem: tuple[int, int], cluster: Any): """ Checks for the delayed reproject function. 
- Variables that influence specifically the delayed function tested here: + Variables that influence specifically the delayed function are: - Input/output chunksizes, - Input array shape, - Output geotransform relative to projected input geotransform, From 03e370ad9801769b9148a000fee2e3e4a4d5a060 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 11:11:04 -0800 Subject: [PATCH 07/22] Add exceptions due to rasterio errors --- tests/test_raster/test_delayed.py | 43 +++++++++++++++++-------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 3ba617ea..f0c9c142 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -64,8 +64,8 @@ def _estimate_subsample_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, i # TOTAL SIZE = Single chunk operations + Subsample indexes + Metadata passed to dask + Outputs - # On top of the rest is added the Dask graph, we will multiply by a factor of 2 to get a good safety margin - fac_dask_margin = 2 + # On top of the rest is added the Dask graph, we will multiply by a factor of 2.5 to get a good safety margin + fac_dask_margin = 2.5 num_chunks = np.prod(darr.numblocks) # Single chunk operation = (data type bytes + boolean from np.isfinite) * chunksize **2 @@ -101,8 +101,8 @@ def _estimate_interp_points_memusage(darr: da.Array, chunksizes_in_mem: tuple[in # TOTAL SIZE = Single chunk operations + Chunk overlap + Metadata passed to dask + Outputs - # On top of the rest is added the Dask graph, we will multiply by a factor of 2 to get a good safety margin - fac_dask_margin = 2 + # On top of the rest is added the Dask graph, we will multiply by a factor of 2.5 to get a good safety margin + fac_dask_margin = 2.5 num_chunks = np.prod(darr.numblocks) # Single chunk operation = (data type bytes + boolean from np.isfinite) * chunksize **2 @@ -140,8 +140,8 @@ def _estimate_reproject_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, i # TOTAL SIZE = Combined source chunk operations + Building geopandas mapping + Metadata passed to dask + Outputs - # On top of the rest is added the Dask graph, we will multiply by a factor of 2 to get a good safety margin - fac_dask_margin = 2 + # On top of the rest is added the Dask graph, we will multiply by a factor of 2.5 to get a good safety margin + fac_dask_margin = 2.5 num_chunks = np.prod(darr.numblocks) # THE BIG QUESTION: how many maximum source chunks might be loaded for a single destination chunk? @@ -362,13 +362,14 @@ def test_delayed_reproject(self, fn: str, chunksizes_in_mem: tuple[int, int], - Output array shape relative to input. 
""" - fn = list_fn[0] - chunksizes_in_mem = (2000, 2000) - dst_chunksizes = (1398, 2983) # (2000, 2000) - dst_bounds_rel_shift = (0, 0) - dst_res_rel_fac = (0.45, 0.45) # (1, 1) - dst_shape_diff = (0, 0) - cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None) + # Keeping this commented here if we need to redo local tests due to Rasterio errors + # fn = list_fn[0] + # chunksizes_in_mem = (2000, 2000) + # dst_chunksizes = (1398, 2983) # (2000, 2000) + # dst_bounds_rel_shift = (0, 0) + # dst_res_rel_fac = (0.45, 0.45) # (1, 1) + # dst_shape_diff = (0, 0) + # cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None) # 0/ Open dataset with chunks and define variables ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) @@ -389,7 +390,7 @@ def test_delayed_reproject(self, fn: str, chunksizes_in_mem: tuple[int, int], # Other arguments having no influence src_nodata = -9999 dst_nodata = 99999 - resampling = rio.enums.Resampling.cubic + resampling = rio.enums.Resampling.bilinear # Get shifted dst_transform with new resolution dst_transform = _build_dst_transform_shifted_newres(src_transform=src_transform, src_crs=src_crs, dst_crs=dst_crs, @@ -453,7 +454,7 @@ def reproject_and_write(*args, **kwargs): dst_nodata=dst_nodata, ) - # Keeping this to debug in case this is not only a Rasterio issue + # Keeping this to visualize Rasterio resampling issue # if PLOT: # import matplotlib.pyplot as plt # plt.figure() @@ -469,13 +470,15 @@ def reproject_and_write(*args, **kwargs): # plt.colorbar() # plt.savefig("/home/atom/ongoing/dst.png", dpi=500) - # Check that very little data (less than 0.01% of pixels) are significantly different - # (it seems to be mostly some pixels that are nodata in one and not the other) + + # Due to (what appears to be) Rasterio errors, we have to remain large here: even though some reprojections + # are pretty good, some can get a bit nasty + # Check that little data (less than 1% of pixels) are significantly different ind_signif_diff = np.abs(reproj_arr - dst_arr) > 0.5 - assert np.count_nonzero(ind_signif_diff) < 0.01 / 100 * reproj_arr.size + assert np.count_nonzero(ind_signif_diff) < 0.01 * reproj_arr.size - # The median difference is negligible compared to the amplitude of the signal (+/- 1 std) - assert np.nanmedian(np.abs(reproj_arr - dst_arr)) < 0.02 + # The median difference should be negligible compared to the amplitude of the signal (+/- 1 std) + assert np.nanmedian(np.abs(reproj_arr - dst_arr)) < 0.1 # # Replace with allclose once Rasterio issue fixed? # assert np.allclose(reproj_arr[~ind_both_nodata], dst_arr[~ind_both_nodata], atol=0.02) From fafd2399d35ff22bf79bc631a9d805ab19a18a94 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 12:56:26 -0800 Subject: [PATCH 08/22] Linting --- geoutils/raster/delayed.py | 91 ++++++++++------ tests/test_raster/test_delayed.py | 169 +++++++++++++++++------------- 2 files changed, 158 insertions(+), 102 deletions(-) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index bc118564..76221cee 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -2,7 +2,7 @@ Module for dask-delayed functions for out-of-memory raster operations. 
""" import warnings -from typing import Any, Literal +from typing import Any, Literal, TypeVar import dask.array as da import dask.delayed @@ -12,6 +12,7 @@ import rasterio as rio from scipy.interpolate import interpn +from geoutils._typing import NDArrayNum from geoutils.projtools import _get_bounds_projected, _get_footprint_projected # 1/ SUBSAMPLING @@ -23,7 +24,9 @@ # usage by having to drop an axis and re-chunk along 1D of the 2D array, so we use the dask.delayed solution instead) -def _random_state_from_user_input(random_state: np.random.RandomState | int | None = None) -> np.random.RandomState: +def _random_state_from_user_input( + random_state: np.random.RandomState | int | None = None, +) -> np.random.RandomState | np.random.Generator: """Define random state based on varied user input.""" # Define state for random sampling (to fix results during testing) @@ -61,7 +64,7 @@ def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: def _get_indices_block_per_subsample( - xxs: np.ndarray, num_chunks: tuple[int, int], nb_valids_per_block: list[int] + xxs: NDArrayNum, num_chunks: tuple[int, int], nb_valids_per_block: list[int] ) -> list[list[int]]: """ Get list of 1D valid subsample indices relative to the block for each block. @@ -102,14 +105,14 @@ def _get_indices_block_per_subsample( return relative_ind_per_block -@dask.delayed -def _delayed_nb_valids(arr_chunk: np.ndarray) -> np.ndarray: +@dask.delayed # type: ignore +def _delayed_nb_valids(arr_chunk: NDArrayNum) -> NDArrayNum: """Count number of valid values per block.""" return np.array([np.count_nonzero(np.isfinite(arr_chunk))]).reshape((1, 1)) -@dask.delayed -def _delayed_subsample_block(arr_chunk: np.ndarray, subsample_indices: np.ndarray) -> np.ndarray: +@dask.delayed # type: ignore +def _delayed_subsample_block(arr_chunk: NDArrayNum, subsample_indices: NDArrayNum) -> NDArrayNum: """Subsample the valid values at the corresponding 1D valid indices per block.""" s_chunk = arr_chunk[np.isfinite(arr_chunk)][subsample_indices] @@ -117,10 +120,10 @@ def _delayed_subsample_block(arr_chunk: np.ndarray, subsample_indices: np.ndarra return s_chunk -@dask.delayed +@dask.delayed # type: ignore def _delayed_subsample_indices_block( - arr_chunk: np.ndarray, subsample_indices: np.ndarray, block_id: dict[str, Any] -) -> np.ndarray: + arr_chunk: NDArrayNum, subsample_indices: NDArrayNum, block_id: dict[str, Any] +) -> NDArrayNum: """Return 2D indices from the subsampled 1D valid indices per block.""" # Unravel indices of valid data to the shape of the block @@ -138,7 +141,7 @@ def delayed_subsample( subsample: int | float = 1, return_indices: bool = False, random_state: np.random.RandomState | int | None = None, -) -> np.ndarray: +) -> NDArrayNum | tuple[NDArrayNum, NDArrayNum]: """ Subsample a raster at valid values on out-of-memory chunks. 
@@ -246,7 +249,15 @@ def delayed_subsample( # Code structure inspired by https://blog.dask.org/2021/07/02/ragged-output and the "block_id" in map_blocks -def _get_interp_indices_per_block(interp_x, interp_y, starts, num_chunks, chunksize, xres, yres): +def _get_interp_indices_per_block( + interp_x: NDArrayNum, + interp_y: NDArrayNum, + starts: list[tuple[int, ...]], + num_chunks: tuple[int, int], + chunksize: tuple[int, int], + xres: float, + yres: float, +) -> list[list[int]]: """Map blocks where each pair of interpolation coordinates will have to be computed.""" # TODO 1: Check the robustness for chunksize different and X and Y @@ -267,10 +278,10 @@ def _get_interp_indices_per_block(interp_x, interp_y, starts, num_chunks, chunks return ind_per_block -@dask.delayed +@dask.delayed # type: ignore def _delayed_interp_points_block( - arr_chunk: np.ndarray, block_id: dict[str, Any], interp_coords: np.ndarray -) -> np.ndarray: + arr_chunk: NDArrayNum, block_id: dict[str, Any], interp_coords: NDArrayNum +) -> NDArrayNum: """ Interpolate block in 2D out-of-memory for a regular or equal grid. """ @@ -296,7 +307,7 @@ def delayed_interp_points( points: tuple[list[float], list[float]], resolution: tuple[float, float], method: Literal["nearest", "linear", "cubic", "quintic"] = "linear", -) -> np.ndarray: +) -> NDArrayNum: """ Interpolate raster at point coordinates on out-of-memory chunks. @@ -313,8 +324,9 @@ def delayed_interp_points( :return: Array of raster value(s) for the given points. """ + # TODO: Replace by a generic 2D point casting function accepting multiple inputs (living outside this function) # Convert input to 2D array - points = np.vstack((points[0], points[1])) + points_arr = np.vstack((points[0], points[1])) # Map depth of overlap required for each interpolation method # TODO: Double-check this window somewhere in SciPy's documentation @@ -333,7 +345,7 @@ def delayed_interp_points( # Get samples indices per blocks ind_per_block = _get_interp_indices_per_block( - points[0, :], points[1, :], starts, num_chunks, chunksize, resolution[0], resolution[1] + points_arr[0, :], points_arr[1, :], starts, num_chunks, chunksize, resolution[0], resolution[1] ) # Create a delayed object for each block, and flatten the blocks into a 1d shape @@ -353,7 +365,7 @@ def delayed_interp_points( # Compute values delayed list_interp = [ - _delayed_interp_points_block(blocks[i], block_ids[i], points[:, ind_per_block[i]]) + _delayed_interp_points_block(blocks[i], block_ids[i], points_arr[:, ind_per_block[i]]) for i, data_chunk in enumerate(blocks) if len(ind_per_block[i]) > 0 ] @@ -380,6 +392,9 @@ def delayed_interp_points( # We define a GeoGrid and GeoTiling class (which composes GeoGrid) to consistently deal with georeferenced footprints # of chunked grids +GeoGridType = TypeVar("GeoGridType", bound="GeoGrid") + + class GeoGrid: """ Georeferenced grid class. 
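Similarly, a hypothetical call to delayed_interp_points (editor's sketch, not from the patch), with point coordinates expressed on a unit-resolution grid as in the tests:

    import dask.array as da
    import numpy as np

    from geoutils.raster.delayed import delayed_interp_points

    darr = da.random.random((1000, 1000), chunks=(250, 250))
    rng = np.random.default_rng(seed=42)
    interp_x = rng.uniform(0, darr.shape[0] - 1, 100)
    interp_y = rng.uniform(0, darr.shape[1] - 1, 100)

    vals = delayed_interp_points(darr, points=(interp_x, interp_y), resolution=(1, 1))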
@@ -430,13 +445,23 @@ def bounds_projected(self, crs: rio.crs.CRS = None) -> rio.coords.BoundingBox: def footprint(self) -> gpd.GeoDataFrame: return _get_footprint_projected(self.bounds, in_crs=self.crs, out_crs=self.crs, densify_points=100) - def footprint_projected(self, crs: rio.crs.CRS = None): + def footprint_projected(self, crs: rio.crs.CRS = None) -> gpd.GeoDataFrame: if crs is None: crs = self.crs return _get_footprint_projected(self.bounds, in_crs=self.crs, out_crs=crs, densify_points=100) - def shift(self, xoff: float, yoff: float, distance_unit: Literal["georeferenced"] | Literal["pixel"] = "pixel"): - """Shift geogrid, not inplace.""" + @classmethod + def from_dict(cls: type[GeoGridType], dict_meta: dict[str, Any]) -> GeoGridType: + """Create a GeoGrid from a dictionary containing transform, shape and CRS.""" + return cls(**dict_meta) + + def shift( + self: GeoGridType, + xoff: float, + yoff: float, + distance_unit: Literal["georeferenced"] | Literal["pixel"] = "pixel", + ) -> GeoGridType: + """Shift into a new geogrid (not inplace).""" if distance_unit not in ["georeferenced", "pixel"]: raise ValueError("Argument 'distance_unit' should be either 'pixel' or 'georeferenced'.") @@ -456,7 +481,7 @@ def shift(self, xoff: float, yoff: float, distance_unit: Literal["georeferenced" shifted_transform = rio.transform.Affine(dx, b, xmin + xoff, d, dy, ymax + yoff) - return GeoGrid(transform=shifted_transform, crs=self.crs, shape=self.shape) + return self.from_dict({"transform": shifted_transform, "crs": self.crs, "shape": self.shape}) def _get_block_ids_per_chunk(chunks: tuple[tuple[int, ...], tuple[int, ...]]) -> list[dict[str, int]]: @@ -536,7 +561,9 @@ def get_block_footprints(self, crs: rio.crs.CRS = None) -> gpd.GeoDataFrame: return pd.concat(footprints) -def _chunks2d_from_chunksizes_shape(chunksizes: tuple[int, int], shape: tuple[int, int]): +def _chunks2d_from_chunksizes_shape( + chunksizes: tuple[int, int], shape: tuple[int, int] +) -> tuple[tuple[int, ...], tuple[int, ...]]: """Get tuples of chunk sizes for X/Y dimensions based on chunksizes and array shape.""" # Chunksize is fixed, except for the last chunk depending on the shape @@ -583,10 +610,10 @@ def _combined_blocks_shape_transform( return combined_meta, relative_block_indexes -@dask.delayed +@dask.delayed # type: ignore def _delayed_reproject_per_block( - *src_arrs: tuple[np.ndarray], block_ids: list[dict[str, int]], combined_meta: dict[str, Any], **kwargs: Any -) -> np.ndarray: + *src_arrs: tuple[NDArrayNum], block_ids: list[dict[str, int]], combined_meta: dict[str, Any], **kwargs: Any +) -> NDArrayNum: """ Delayed reprojection per destination block (also rebuilds a square array combined from intersecting source blocks). """ @@ -604,7 +631,7 @@ def _delayed_reproject_per_block( # Then fill it with the source chunks values for i, arr in enumerate(src_arrs): bid = block_ids[i] - comb_src_arr[bid["rys"]:bid["rye"], bid["rxs"]:bid["rxe"]] = arr + comb_src_arr[bid["rys"] : bid["rye"], bid["rxs"] : bid["rxe"]] = arr # Now, we can simply call Rasterio! 
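To make the relative block indexes above concrete, a small NumPy sketch (illustration only, with hypothetical chunk shapes) of how _delayed_reproject_per_block pastes intersecting source chunks into one combined array before warping:

    import numpy as np

    # Two hypothetical source chunks and their positions relative to the combined array
    src_arrs = [np.ones((4, 3)), 2 * np.ones((4, 3))]
    block_ids = [
        {"rys": 0, "rye": 4, "rxs": 0, "rxe": 3},
        {"rys": 0, "rye": 4, "rxs": 3, "rxe": 6},
    ]

    comb_src_arr = np.zeros((4, 6))
    for arr, bid in zip(src_arrs, block_ids):
        comb_src_arr[bid["rys"]:bid["rye"], bid["rxs"]:bid["rxe"]] = arr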
@@ -638,9 +665,9 @@ def delayed_reproject( dst_shape: tuple[int, int], dst_crs: rio.crs.CRS, resampling: rio.enums.Resampling, - src_nodata: int | float = None, - dst_nodata: int | float = None, - dst_chunksizes: tuple[int, int] = None, + src_nodata: int | float | None = None, + dst_nodata: int | float | None = None, + dst_chunksizes: tuple[int, int] | None = None, **kwargs: Any, ) -> da.Array: """ @@ -679,6 +706,8 @@ def delayed_reproject( src_geotiling = ChunkedGeoGrid(grid=src_geogrid, chunks=src_chunks) # For destination, we need to create the chunks based on destination chunksizes + if dst_chunksizes is None: + dst_chunksizes = darr.chunksize dst_chunks = _chunks2d_from_chunksizes_shape(chunksizes=dst_chunksizes, shape=dst_shape) dst_geotiling = ChunkedGeoGrid(grid=dst_geogrid, chunks=dst_chunks) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index f0c9c142..e387ad4a 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -1,7 +1,7 @@ """Tests for dask-delayed functions.""" import os from tempfile import NamedTemporaryFile -from typing import Callable, Any +from typing import Any, Callable import dask.array as da import numpy as np @@ -9,8 +9,10 @@ import pytest import rasterio as rio import xarray as xr -from pyproj import CRS from dask.distributed import Client, LocalCluster +from dask_memusage import install +from pluggy import PluggyTeardownRaisedWarning +from pyproj import CRS from geoutils.examples import _EXAMPLES_DIRECTORY from geoutils.raster.delayed import ( @@ -19,29 +21,30 @@ delayed_subsample, ) -from pluggy import PluggyTeardownRaisedWarning -from dask_memusage import install - # Ignore teardown warning given by Dask when closing the local cluster (due to dask-memusage plugin) pytestmark = pytest.mark.filterwarnings("ignore", category=PluggyTeardownRaisedWarning) -# Fixture to use a single cluster for the entire module -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") # type: ignore def cluster(): + """Fixture to use a single cluster for the entire module (otherwise raise runtime errors).""" # Need cluster to be single-threaded to use dask-memusage confidently dask_cluster = LocalCluster(n_workers=2, threads_per_worker=1, dashboard_address=None) yield dask_cluster dask_cluster.close() -def _run_dask_measuring_memusage(cluster, dask_func: Callable, *args_dask_func: Any, **kwargs_dask_func: Any) -> tuple[Any, float]: + +def _run_dask_measuring_memusage( + cluster: Any, dask_func: Callable[..., Any], *args_dask_func: Any, **kwargs_dask_func: Any +) -> tuple[Any, float]: """Run a dask function monitoring its memory usage.""" - # Create a name temporary file that won't delete immediatley + # Create a name temporary file that won't delete immediately fn_tmp_csv = NamedTemporaryFile(suffix=".csv", delete=False).name # Setup cluster and client within context managers for a clean shutdown install(cluster.scheduler, fn_tmp_csv) - with Client(cluster) as client: + with Client(cluster) as _: outputs = dask_func(*args_dask_func, **kwargs_dask_func) # Read memusage file and cleanup @@ -56,6 +59,7 @@ def _run_dask_measuring_memusage(cluster, dask_func: Callable, *args_dask_func: return outputs, memusage_mb + def _estimate_subsample_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int) -> float: """ Estimate the theoretical memory usage of the delayed subsampling method. 
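For context, how the memory-monitoring helper above is meant to be driven (a sketch assuming the module-level _run_dask_measuring_memusage helper and the dask-memusage plugin set up earlier in this file):

    import dask.array as da
    from dask.distributed import LocalCluster

    from geoutils.raster.delayed import delayed_subsample

    darr = da.random.random((1000, 1000), chunks=(250, 250))

    # Single-threaded workers so that dask-memusage can be used confidently
    cluster = LocalCluster(n_workers=2, threads_per_worker=1, dashboard_address=None)
    try:
        sub, peak_mb = _run_dask_measuring_memusage(
            cluster, delayed_subsample, darr, subsample=100, random_state=42
        )
    finally:
        cluster.close()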
@@ -87,12 +91,13 @@ def _estimate_subsample_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, i meta_memusage = list_per_block + list_all_blocks # Final estimate of memory usage of operation in MB - max_op_memusage = fac_dask_margin * (chunk_memusage + sample_memusage + out_memusage + meta_memusage) / (2 ** 20) + max_op_memusage = fac_dask_margin * (chunk_memusage + sample_memusage + out_memusage + meta_memusage) / (2**20) # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) max_op_memusage += 50 + 10 * (num_chunks / 1000) return max_op_memusage + def _estimate_interp_points_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, int], ninterp: int) -> float: """ Estimate the theoretical memory usage of the delayed interpolation method. @@ -125,14 +130,19 @@ def _estimate_interp_points_memusage(darr: da.Array, chunksizes_in_mem: tuple[in meta_memusage = list_per_block + list_all_blocks + dict_all_blocks # Final estimate of memory usage of operation in MB - max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2 ** 20) + max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2**20) # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) max_op_memusage += 50 + 10 * (num_chunks / 1000) return max_op_memusage -def _estimate_reproject_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, int], dst_chunksizes: tuple[int, int], - rel_res_fac: tuple[float, float]) -> float: + +def _estimate_reproject_memusage( + darr: da.Array, + chunksizes_in_mem: tuple[int, int], + dst_chunksizes: tuple[int, int], + rel_res_fac: tuple[float, float], +) -> float: """ Estimate the theoretical memory usage of the delayed reprojection method. 
(we don't need to be super precise, just within a factor of ~2 to check memory usage performs as expected) @@ -153,7 +163,7 @@ def _estimate_reproject_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, i nb_source_chunks_per_dest = 8 * x_rel_source_chunks * y_rel_source_chunks # Combined memory usage of one chunk operation = squared array made from combined chunksize + original chunks - total_nb = np.ceil(np.sqrt(nb_source_chunks_per_dest))**2 + nb_source_chunks_per_dest + total_nb = np.ceil(np.sqrt(nb_source_chunks_per_dest)) ** 2 + nb_source_chunks_per_dest # We multiply the memory usage of a single chunk to the number of loaded/combined chunks chunk_memusage = darr.dtype.itemsize * np.prod(chunksizes_in_mem) * total_nb @@ -170,19 +180,21 @@ def _estimate_reproject_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, i meta_memusage = combined_meta + dict_all_blocks # Final estimate of memory usage of operation in MB - max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2 ** 20) + max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2**20) # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) max_op_memusage += 50 + 10 * (num_chunks / 1000) return max_op_memusage -def _build_dst_transform_shifted_newres(src_transform: rio.transform.Affine, - src_shape: tuple[int, int], - src_crs: CRS, - dst_crs: CRS, - bounds_rel_shift: tuple[float, float], - res_rel_fac: tuple[float, float]) -> rio.transform.Affine: +def _build_dst_transform_shifted_newres( + src_transform: rio.transform.Affine, + src_shape: tuple[int, int], + src_crs: CRS, + dst_crs: CRS, + bounds_rel_shift: tuple[float, float], + res_rel_fac: tuple[float, float], +) -> rio.transform.Affine: """ Build a destination transform intersecting the source transform given source/destination shapes, and possibly introducing a relative shift in upper-left bound and multiplicative change in resolution. @@ -212,10 +224,12 @@ def _build_dst_transform_shifted_newres(src_transform: rio.transform.Affine, west=tmp_bounds.left + bounds_rel_shift[0] * tmp_res[0] * src_shape[1], north=tmp_bounds.top + 150 * bounds_rel_shift[0] * tmp_res[1] * src_shape[0], xsize=tmp_res[0] / res_rel_fac[0], - ysize=tmp_res[1] / res_rel_fac[1]) + ysize=tmp_res[1] / res_rel_fac[1], + ) return dst_transform + class TestDelayed: """ Testing delayed functions is pretty straightforward. @@ -229,12 +243,11 @@ class TestDelayed: subsample and interp_points, or destination chunksizes to map output of reproject). 2. During execution, we capture memory usage and check that only the expected amount of memory (one or several chunk combinations + metadata) is indeed used during the compute() call. 
- """ + """ # Write big test files on disk out-of-memory, with different input shapes not necessarily aligned between themselves # or with chunks - fn_nc_shape = {"test_square.nc": (10000, 10000), - "test_complex.nc": (5511, 6768)} + fn_nc_shape = {"test_square.nc": (10000, 10000), "test_complex.nc": (5511, 6768)} # We can use a constant value for storage chunks, it doesn't have any influence on the accuracy of delayed methods # (can change slightly RAM usage, but pretty stable as long as chunksizes in memory are larger and # significantly bigger) @@ -254,9 +267,9 @@ class TestDelayed: writer = ds.to_netcdf(fn, encoding=encoding_kwargs, compute=False) writer.compute() - @pytest.mark.parametrize("fn", list_fn) - @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) - @pytest.mark.parametrize("subsample_size", [100, 100000]) + @pytest.mark.parametrize("fn", list_fn) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) # type: ignore + @pytest.mark.parametrize("subsample_size", [100, 100000]) # type: ignore def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, cluster: Any): """ Checks for delayed subsampling function, both for output and memory usage. @@ -272,15 +285,17 @@ def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], su # 1/ Estimation of theoretical memory usage of the subsampling script - max_op_memusage = _estimate_subsample_memusage(darr=darr, chunksizes_in_mem=chunksizes_in_mem, - subsample_size=subsample_size) + max_op_memusage = _estimate_subsample_memusage( + darr=darr, chunksizes_in_mem=chunksizes_in_mem, subsample_size=subsample_size + ) # 2/ Run delayed subsample with dask memory usage monitoring # Derive subsample from delayed function # (passed to wrapper function to measure memory usage during execution) - sub, measured_op_memusage = _run_dask_measuring_memusage(cluster, delayed_subsample, darr, - subsample=subsample_size, random_state=42) + sub, measured_op_memusage = _run_dask_measuring_memusage( + cluster, delayed_subsample, darr, subsample=subsample_size, random_state=42 + ) # Check the measured memory usage is smaller than the maximum estimated one assert measured_op_memusage < max_op_memusage @@ -297,9 +312,9 @@ def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], su sub2 = np.array(darr.vindex[indices[0], indices[1]]) assert np.array_equal(sub, sub2) - @pytest.mark.parametrize("fn", list_fn) - @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) - @pytest.mark.parametrize("ninterp", [100, 100000]) + @pytest.mark.parametrize("fn", list_fn) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) # type: ignore + @pytest.mark.parametrize("ninterp", [100, 100000]) # type: ignore def test_delayed_interp_points(self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, cluster: Any): """ Checks for delayed interpolate points function. 
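The subsampling behaviour exercised in the test above boils down to the following sketch (array shape, chunksize and subsample size are illustrative only):

import dask.array as da
import numpy as np

from geoutils.raster.delayed import delayed_subsample

rng = da.random.default_rng(seed=42)
darr = rng.normal(size=1000 * 1000).reshape(1000, 1000).rechunk((200, 200))

# Exact-size subsample of valid (finite) values, reproducible through random_state
sub = delayed_subsample(darr, subsample=500, random_state=42)

# The same subsample as 2D indices, usable to lazily index the dask array
ind = delayed_subsample(darr, subsample=500, random_state=42, return_indices=True)
sub2 = np.array(darr.vindex[ind[0], ind[1]])
assert np.array_equal(sub, sub2)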
@@ -318,14 +333,15 @@ def test_delayed_interp_points(self, fn: str, chunksizes_in_mem: tuple[int, int] interp_y = rng.choice(ds.y.size, ninterp) + rng.random(ninterp) # 1/ Estimation of theoretical memory usage of the subsampling script - max_op_memusage = _estimate_interp_points_memusage(darr=darr, chunksizes_in_mem=chunksizes_in_mem, - ninterp=ninterp) - + max_op_memusage = _estimate_interp_points_memusage( + darr=darr, chunksizes_in_mem=chunksizes_in_mem, ninterp=ninterp + ) # 2/ Run interpolation of random point coordinates with memory monitoring - interp1, measured_op_memusage = _run_dask_measuring_memusage(cluster, delayed_interp_points, darr, - points=(interp_x, interp_y), resolution=(1, 1)) + interp1, measured_op_memusage = _run_dask_measuring_memusage( + cluster, delayed_interp_points, darr, points=(interp_x, interp_y), resolution=(1, 1) + ) # Check the measured memory usage is smaller than the maximum estimated one assert measured_op_memusage < max_op_memusage @@ -340,19 +356,25 @@ def test_delayed_interp_points(self, fn: str, chunksizes_in_mem: tuple[int, int] assert np.array_equal(interp1, interp2, equal_nan=True) - @pytest.mark.parametrize("fn", list_fn) - @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) - @pytest.mark.parametrize("dst_chunksizes", [(2000, 2000), (1398, 2983)]) + @pytest.mark.parametrize("fn", list_fn) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) # type: ignore + @pytest.mark.parametrize("dst_chunksizes", [(2000, 2000), (1398, 2983)]) # type: ignore # Shift upper left corner of output bounds (relative to projected input bounds) by fractions of the raster size - @pytest.mark.parametrize("dst_bounds_rel_shift", [(0, 0), (-0.2, 0.5)]) + @pytest.mark.parametrize("dst_bounds_rel_shift", [(0, 0), (-0.2, 0.5)]) # type: ignore # Modify output resolution (relative to projected input resolution) by a factor - @pytest.mark.parametrize("dst_res_rel_fac", [(1, 1), (2.1, 0.54)]) + @pytest.mark.parametrize("dst_res_rel_fac", [(1, 1), (2.1, 0.54)]) # type: ignore # Same for shape - @pytest.mark.parametrize("dst_shape_diff", [(0, 0), (-28, 117)]) - def test_delayed_reproject(self, fn: str, chunksizes_in_mem: tuple[int, int], - dst_chunksizes: tuple[int, int], dst_bounds_rel_shift: tuple[float, float], - dst_res_rel_fac: tuple[float, float], dst_shape_diff: tuple[int, int], - cluster: Any): + @pytest.mark.parametrize("dst_shape_diff", [(0, 0), (-28, 117)]) # type: ignore + def test_delayed_reproject( + self, + fn: str, + chunksizes_in_mem: tuple[int, int], + dst_chunksizes: tuple[int, int], + dst_bounds_rel_shift: tuple[float, float], + dst_res_rel_fac: tuple[float, float], + dst_shape_diff: tuple[int, int], + cluster: Any, + ): """ Checks for the delayed reproject function. 
Variables that influence specifically the delayed function are: @@ -393,14 +415,20 @@ def test_delayed_reproject(self, fn: str, chunksizes_in_mem: tuple[int, int], resampling = rio.enums.Resampling.bilinear # Get shifted dst_transform with new resolution - dst_transform = _build_dst_transform_shifted_newres(src_transform=src_transform, src_crs=src_crs, dst_crs=dst_crs, - src_shape=src_shape, bounds_rel_shift=dst_bounds_rel_shift, - res_rel_fac=dst_res_rel_fac) + dst_transform = _build_dst_transform_shifted_newres( + src_transform=src_transform, + src_crs=src_crs, + dst_crs=dst_crs, + src_shape=src_shape, + bounds_rel_shift=dst_bounds_rel_shift, + res_rel_fac=dst_res_rel_fac, + ) # 1/ Estimation of theoretical memory usage of the subsampling script - max_op_memusage = _estimate_reproject_memusage(darr, chunksizes_in_mem=chunksizes_in_mem, dst_chunksizes=dst_chunksizes, - rel_res_fac=dst_res_rel_fac) + max_op_memusage = _estimate_reproject_memusage( + darr, chunksizes_in_mem=chunksizes_in_mem, dst_chunksizes=dst_chunksizes, rel_res_fac=dst_res_rel_fac + ) # 2/ Run delayed reproject with memory monitoring @@ -408,7 +436,7 @@ def test_delayed_reproject(self, fn: str, chunksizes_in_mem: tuple[int, int], # (delayed_reproject returns a delayed array that might not fit in memory, unlike subsampling/interpolation) fn_tmp_out = os.path.join(_EXAMPLES_DIRECTORY, os.path.splitext(os.path.basename(fn))[0] + "_reproj.nc") - def reproject_and_write(*args, **kwargs): + def reproject_and_write(*args: Any, **kwargs: Any) -> None: # Run delayed reprojection reproj_arr_tmp = delayed_reproject(*args, **kwargs) @@ -456,20 +484,19 @@ def reproject_and_write(*args, **kwargs): # Keeping this to visualize Rasterio resampling issue # if PLOT: - # import matplotlib.pyplot as plt - # plt.figure() - # plt.imshow((reproj_arr - dst_arr), cmap="RdYlBu", vmin=-0.2, vmax=0.2, interpolation="None") - # plt.colorbar() - # plt.savefig("/home/atom/ongoing/diff_close_zero.png", dpi=500) - # plt.figure() - # plt.imshow(np.abs(reproj_arr - dst_arr), cmap="RdYlBu", vmin=99997, vmax=100001, interpolation="None") - # plt.colorbar() - # plt.savefig("/home/atom/ongoing/diff_nodata.png", dpi=500) - # plt.figure() - # plt.imshow(dst_arr, cmap="RdYlBu", vmin=-1, vmax=1, interpolation="None") - # plt.colorbar() - # plt.savefig("/home/atom/ongoing/dst.png", dpi=500) - + # import matplotlib.pyplot as plt + # plt.figure() + # plt.imshow((reproj_arr - dst_arr), cmap="RdYlBu", vmin=-0.2, vmax=0.2, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/diff_close_zero.png", dpi=500) + # plt.figure() + # plt.imshow(np.abs(reproj_arr - dst_arr), cmap="RdYlBu", vmin=99997, vmax=100001, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/diff_nodata.png", dpi=500) + # plt.figure() + # plt.imshow(dst_arr, cmap="RdYlBu", vmin=-1, vmax=1, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/dst.png", dpi=500) # Due to (what appears to be) Rasterio errors, we have to remain large here: even though some reprojections # are pretty good, some can get a bit nasty From 2e5b69fa400c5269f37abd183da498644b40078d Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 14:07:04 -0800 Subject: [PATCH 09/22] The future is not here yet --- geoutils/raster/delayed.py | 1 + tests/test_raster/test_delayed.py | 1 + 2 files changed, 2 insertions(+) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index 76221cee..9c150762 100644 --- a/geoutils/raster/delayed.py +++ 
b/geoutils/raster/delayed.py @@ -1,6 +1,7 @@ """ Module for dask-delayed functions for out-of-memory raster operations. """ +from __future__ import annotations import warnings from typing import Any, Literal, TypeVar diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index e387ad4a..85800947 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -1,4 +1,5 @@ """Tests for dask-delayed functions.""" +from __future__ import annotations import os from tempfile import NamedTemporaryFile from typing import Any, Callable From 7225f57273a1ca5f7beb71d8f33a6b160191565e Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 14:09:53 -0800 Subject: [PATCH 10/22] Linting --- geoutils/raster/delayed.py | 1 + tests/test_raster/test_delayed.py | 1 + 2 files changed, 2 insertions(+) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index 9c150762..07da19d1 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -2,6 +2,7 @@ Module for dask-delayed functions for out-of-memory raster operations. """ from __future__ import annotations + import warnings from typing import Any, Literal, TypeVar diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 85800947..a8d19c53 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -1,5 +1,6 @@ """Tests for dask-delayed functions.""" from __future__ import annotations + import os from tempfile import NamedTemporaryFile from typing import Any, Callable From 9e141602c97591486e100dadef5d55500422a0ab Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 17:48:31 -0800 Subject: [PATCH 11/22] Split output and memory tests and re-order them for speed --- tests/test_raster/test_delayed.py | 410 ++++++++++++++++++++---------- 1 file changed, 274 insertions(+), 136 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index a8d19c53..1478efec 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -31,7 +31,7 @@ def cluster(): """Fixture to use a single cluster for the entire module (otherwise raise runtime errors).""" # Need cluster to be single-threaded to use dask-memusage confidently - dask_cluster = LocalCluster(n_workers=2, threads_per_worker=1, dashboard_address=None) + dask_cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None) yield dask_cluster dask_cluster.close() @@ -234,53 +234,260 @@ def _build_dst_transform_shifted_newres( class TestDelayed: """ - Testing delayed functions is pretty straightforward. + Testing class for delayed functions. - We test on rasters big enough to clearly monitor the memory usage, but small enough to fit in-memory to check - the function outputs against the ones from in-memory methods. + We test on a first set of rasters big enough to clearly monitor the memory usage, and a second set small enough + to run fast to check a wide range of input parameters. In details: - 1. We compare with the in-memory function output only for the set of input variables that influence the delayed + Set 1. We capture memory usage during the .compute() calls and check that only the expected amount of memory that + we estimate independently (bytes used by one or several chunk combinations + metadata) is indeed used. + Set 2. 
We compare the outputs with the in-memory function specifically for input variables that influence the delayed algorithm and might lead to new errors (for example: array shape to get subsample/points locations for subsample and interp_points, or destination chunksizes to map output of reproject). - 2. During execution, we capture memory usage and check that only the expected amount of memory - (one or several chunk combinations + metadata) is indeed used during the compute() call. + + We start with output checks which run faster when ordered before (maybe due to the cluster memory monitoring after). """ - # Write big test files on disk out-of-memory, with different input shapes not necessarily aligned between themselves - # or with chunks - fn_nc_shape = {"test_square.nc": (10000, 10000), "test_complex.nc": (5511, 6768)} - # We can use a constant value for storage chunks, it doesn't have any influence on the accuracy of delayed methods - # (can change slightly RAM usage, but pretty stable as long as chunksizes in memory are larger and + # Define random seed for generating test data + rng = da.random.default_rng(seed=42) + + # 1/ Set 1: Memory usage checks + + # Big test files written on disk in an out-of-memory fashion, + # with different input shapes not necessarily aligned between themselves + large_shape = (10000, 10000) + # We can use a constant value for storage chunks, as it doesn't have any influence on the accuracy of delayed + # methods (can change slightly RAM usage, but pretty stable as long as chunksizes in memory are larger and # significantly bigger) - chunksizes_on_disk = (200, 200) - list_fn = [] - for fn_basename in fn_nc_shape.keys(): - fn = os.path.join(_EXAMPLES_DIRECTORY, fn_basename) - list_fn.append(fn) - if not os.path.exists(fn): - # Create random array in the right shape - test_shape = fn_nc_shape[fn_basename] - rng = da.random.default_rng(seed=42) - data = rng.normal(size=test_shape[0] * test_shape[1]).reshape(test_shape[0], test_shape[1]) - data_arr = xr.DataArray(data=data, dims=["x", "y"]) - ds = xr.Dataset(data_vars={"test": data_arr}) - encoding_kwargs = {"test": {"chunksizes": chunksizes_on_disk}} - writer = ds.to_netcdf(fn, encoding=encoding_kwargs, compute=False) - writer.compute() - - @pytest.mark.parametrize("fn", list_fn) # type: ignore - @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) # type: ignore - @pytest.mark.parametrize("subsample_size", [100, 100000]) # type: ignore - def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, cluster: Any): + chunksizes_on_disk = (500, 500) + fn_large = os.path.join(_EXAMPLES_DIRECTORY, "test_large.nc") + if not os.path.exists(fn_large): + # Create random array in the right shape + data = rng.normal(size=large_shape[0] * large_shape[1]).reshape(large_shape[0], large_shape[1]) + data_arr = xr.DataArray(data=data, dims=["x", "y"]) + ds = xr.Dataset(data_vars={"test": data_arr}) + encoding_kwargs = {"test": {"chunksizes": chunksizes_on_disk}} + # Write to disk out-of-memory + writer = ds.to_netcdf(fn_large, encoding=encoding_kwargs, compute=False) + writer.compute() + + # 2. 
Set 2 + # Smaller test files for fast checks, with various shapes and with/without nodata + list_small_shapes = [(50, 50), (51, 47)] + with_nodata = [False, True] + list_small_darr = [] + for small_shape in list_small_shapes: + for w in with_nodata: + small_darr = rng.normal(size=small_shape[0] * small_shape[1]) + # Add about half nodata values + if w: + ind_nodata = rng.choice(small_darr.size, size=int(small_darr.size/2), replace=False) + small_darr[list(ind_nodata)] = np.nan + small_darr = small_darr.reshape(small_shape[0], small_shape[1]) + list_small_darr.append(small_darr) + + # List of in-memory chunksize for small tests + list_small_chunksizes_in_mem = [(10, 10), (7, 19)] + + @pytest.mark.parametrize("darr", list_small_darr) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore + @pytest.mark.parametrize("subsample_size", [2, 100, 100000]) # type: ignore + def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int): """ - Checks for delayed subsampling function, both for output and memory usage. - Variables that influence specifically the delayed function are: + Checks for delayed subsampling function for output accuracy. + Variables that influence specifically the delayed function and might lead to new errors are: - Input chunksizes, - Input array shape, - Number of subsampled points. """ + # 1/ We run the delayed function after re-chunking + darr = darr.rechunk(chunksizes_in_mem) + sub = delayed_subsample(darr, subsample=subsample_size, random_state=42) + + # 2/ Output checks + + # # The subsample should have exactly the prescribed length, with only valid values + assert len(sub) == min(subsample_size, np.count_nonzero(np.isfinite(darr))) + assert all(np.isfinite(sub)) + + # To verify the sampling works correctly, we can get its subsample indices with the argument return_indices + # And compare to the same subsample with vindex (now that we know the coordinates of valid values sampled) + indices = delayed_subsample(darr, subsample=subsample_size, random_state=42, return_indices=True) + sub2 = np.array(darr.vindex[indices[0], indices[1]]) + assert np.array_equal(sub, sub2) + + @pytest.mark.parametrize("darr", list_small_darr) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore + @pytest.mark.parametrize("ninterp", [2, 100]) # type: ignore + @pytest.mark.parametrize("res", [(0.5, 2), (1, 1)]) # type: ignore + def test_delayed_interp_points__output(self, darr: da.Array, chunksizes_in_mem: tuple[int, int], ninterp: int, + res: tuple[float, float]): + """ + Checks for delayed interpolate points function. + Variables that influence specifically the delayed function are: + - Input chunksizes, + - Input array shape, + - Number of interpolated points, + - The resolution of the regular grid. 
+ """ + + # 1/ Define points to interpolate given the size and resolution + darr = darr.rechunk(chunksizes_in_mem) + rng = np.random.default_rng(seed=42) + interp_x = (rng.choice(darr.shape[0], ninterp) + rng.random(ninterp)) * res[0] + interp_y = (rng.choice(darr.shape[1], ninterp) + rng.random(ninterp)) * res[1] + + interp1 = delayed_interp_points(darr, points=(interp_x, interp_y), resolution=res) + + # 2/ Output checks + + # Interpolate directly with Xarray (loads a lot in memory) and check results are exactly the same + xx = xr.DataArray(interp_x, dims="z", name="x") + yy = xr.DataArray(interp_y, dims="z", name="y") + ds = xr.DataArray(data=darr, dims=["x", "y"]) + interp2 = ds.interp(x=xx, y=yy) + interp2.compute() + interp2 = np.array(interp2.values) + + assert np.array_equal(interp1, interp2, equal_nan=True) + + @pytest.mark.parametrize("darr", list_small_darr) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore + @pytest.mark.parametrize("dst_chunksizes", list_small_chunksizes_in_mem) # type: ignore + # Shift upper left corner of output bounds (relative to projected input bounds) by fractions of the raster size + @pytest.mark.parametrize("dst_bounds_rel_shift", [(0, 0), (-0.2, 0.5)]) # type: ignore + # Modify output resolution (relative to projected input resolution) by a factor + @pytest.mark.parametrize("dst_res_rel_fac", [(1, 1), (2.1, 0.54)]) # type: ignore + # Same for shape + @pytest.mark.parametrize("dst_shape_diff", [(0, 0), (-28, 117)]) # type: ignore + def test_delayed_reproject__output( + self, + darr: da.Array, + chunksizes_in_mem: tuple[int, int], + dst_chunksizes: tuple[int, int], + dst_bounds_rel_shift: tuple[float, float], + dst_res_rel_fac: tuple[float, float], + dst_shape_diff: tuple[int, int], + ): + """ + Checks for the delayed reproject function. + Variables that influence specifically the delayed function are: + - Input/output chunksizes, + - Input array shape, + - Output geotransform relative to projected input geotransform, + - Output array shape relative to input. 
+ """ + + # Keeping this commented here if we need to redo local tests due to Rasterio errors + # fn = list_fn[0] + # chunksizes_in_mem = (2000, 2000) + # dst_chunksizes = (1398, 2983) # (2000, 2000) + # dst_bounds_rel_shift = (0, 0) + # dst_res_rel_fac = (0.45, 0.45) # (1, 1) + # dst_shape_diff = (0, 0) + # cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None) + + # 0/ Define input parameters + + # Get input and output shape + src_shape = darr.shape + dst_shape = (src_shape[0] + dst_shape_diff[0], src_shape[1] + dst_shape_diff[1]) + + # Define arbitrary input transform, as we only care about the relative difference with the output transform + src_transform = rio.transform.from_bounds(10, 10, 15, 15, src_shape[0], src_shape[1]) + + # Other arguments having (normally) no influence + src_crs = CRS(4326) + dst_crs = CRS(32630) + src_nodata = -9999 + dst_nodata = 99999 + resampling = rio.enums.Resampling.bilinear + + # Get shifted dst_transform with new resolution + dst_transform = _build_dst_transform_shifted_newres( + src_transform=src_transform, + src_crs=src_crs, + dst_crs=dst_crs, + src_shape=src_shape, + bounds_rel_shift=dst_bounds_rel_shift, + res_rel_fac=dst_res_rel_fac, + ) + + # 2/ Run delayed reproject with memory monitoring + + reproj_arr = delayed_reproject( + darr, + src_transform=src_transform, + src_crs=src_crs, + dst_transform=dst_transform, + dst_crs=dst_crs, + dst_shape=dst_shape, + src_nodata=src_nodata, + dst_nodata=dst_nodata, + resampling=resampling, + dst_chunksizes=dst_chunksizes, ) + + # 3/ Outputs check: load in memory and compare with a direct Rasterio reproject + reproj_arr = np.array(reproj_arr) + + dst_arr = np.zeros(dst_shape) + _ = rio.warp.reproject( + np.array(darr), + dst_arr, + src_transform=src_transform, + src_crs=src_crs, + dst_transform=dst_transform, + dst_crs=dst_crs, + resampling=resampling, + src_nodata=src_nodata, + dst_nodata=dst_nodata, + ) + + # Keeping this to visualize Rasterio resampling issue + # if PLOT: + # import matplotlib.pyplot as plt + # plt.figure() + # plt.imshow((reproj_arr - dst_arr), cmap="RdYlBu", vmin=-0.2, vmax=0.2, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/diff_close_zero.png", dpi=500) + # plt.figure() + # plt.imshow(np.abs(reproj_arr - dst_arr), cmap="RdYlBu", vmin=99997, vmax=100001, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/diff_nodata.png", dpi=500) + # plt.figure() + # plt.imshow(dst_arr, cmap="RdYlBu", vmin=-1, vmax=1, interpolation="None") + # plt.colorbar() + # plt.savefig("/home/atom/ongoing/dst.png", dpi=500) + + # Due to (what appears to be) Rasterio errors, we have to remain unprecise for the checks here: + # even though some reprojections are pretty good, some can get a bit nasty + + # Check that little data (less than 10% of pixels) are significantly different + ind_signif_diff = np.abs(reproj_arr - dst_arr) > 0.5 + assert np.count_nonzero(ind_signif_diff) < 0.1 * reproj_arr.size + + # The median difference should be negligible compared to the amplitude of the signal (+/- 1 std) + assert np.nanmedian(np.abs(reproj_arr - dst_arr)) < 0.1 + + # # Replace with allclose once Rasterio issue fixed? 
For some cases we get a good match + # (less than 0.01 for all pixels) + # assert np.allclose(reproj_arr[~ind_both_nodata], dst_arr[~ind_both_nodata], atol=0.02) + + @pytest.mark.parametrize("fn", [fn_large]) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore + @pytest.mark.parametrize("subsample_size", [100, 100000]) # type: ignore + def test_delayed_subsample__memusage(self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, + cluster: Any): + """ + Checks for delayed subsampling function for memory usage on big file. + (and also runs output checks as not long or too memory intensive in this case) + Variables that influence memory usage are: + - Subsample sizes, + - Chunksizes in memory. + """ + # 0/ Open dataset with chunks ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) darr = ds["test"].data @@ -303,8 +510,7 @@ def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], su assert measured_op_memusage < max_op_memusage # 3/ Output checks - - # # The subsample should have exactly the prescribed length, with only valid values + # The subsample should have exactly the prescribed length, with only valid values assert len(sub) == subsample_size assert all(np.isfinite(sub)) @@ -314,16 +520,16 @@ def test_delayed_subsample(self, fn: str, chunksizes_in_mem: tuple[int, int], su sub2 = np.array(darr.vindex[indices[0], indices[1]]) assert np.array_equal(sub, sub2) - @pytest.mark.parametrize("fn", list_fn) # type: ignore - @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) # type: ignore - @pytest.mark.parametrize("ninterp", [100, 100000]) # type: ignore - def test_delayed_interp_points(self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, cluster: Any): + @pytest.mark.parametrize("fn", [fn_large]) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore + @pytest.mark.parametrize("ninterp", [100, 1000000]) # type: ignore + def test_delayed_interp_points__memusage(self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, + cluster: Any): """ - Checks for delayed interpolate points function. - Variables that influence specifically the delayed function are: - - Input chunksizes, - - Input array shape, - - Number of interpolated points. + Checks for delayed interpolate points function for memory usage on a big file. + Variables that influence memory usage are: + - Number of interpolated points, + - Chunksizes in memory. 
""" # 0/ Open dataset with chunks and create random point locations to interpolate @@ -347,53 +553,30 @@ def test_delayed_interp_points(self, fn: str, chunksizes_in_mem: tuple[int, int] # Check the measured memory usage is smaller than the maximum estimated one assert measured_op_memusage < max_op_memusage - # 3/ Output checks - - # Interpolate directly with Xarray (loads a lot in memory) and check results are exactly the same - xx = xr.DataArray(interp_x, dims="z", name="x") - yy = xr.DataArray(interp_y, dims="z", name="y") - interp2 = ds.test.interp(x=xx, y=yy) - interp2.compute() - interp2 = np.array(interp2.values) - - assert np.array_equal(interp1, interp2, equal_nan=True) - - @pytest.mark.parametrize("fn", list_fn) # type: ignore - @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000), (1241, 3221)]) # type: ignore - @pytest.mark.parametrize("dst_chunksizes", [(2000, 2000), (1398, 2983)]) # type: ignore - # Shift upper left corner of output bounds (relative to projected input bounds) by fractions of the raster size - @pytest.mark.parametrize("dst_bounds_rel_shift", [(0, 0), (-0.2, 0.5)]) # type: ignore - # Modify output resolution (relative to projected input resolution) by a factor - @pytest.mark.parametrize("dst_res_rel_fac", [(1, 1), (2.1, 0.54)]) # type: ignore - # Same for shape - @pytest.mark.parametrize("dst_shape_diff", [(0, 0), (-28, 117)]) # type: ignore - def test_delayed_reproject( - self, - fn: str, - chunksizes_in_mem: tuple[int, int], - dst_chunksizes: tuple[int, int], - dst_bounds_rel_shift: tuple[float, float], - dst_res_rel_fac: tuple[float, float], - dst_shape_diff: tuple[int, int], - cluster: Any, + @pytest.mark.parametrize("fn", [fn_large]) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore + @pytest.mark.parametrize("dst_chunksizes", [(1000, 1000), (2500, 2500)]) # type: ignore + @pytest.mark.parametrize("dst_bounds_rel_shift", [(1, 1), (2, 2)]) # type: ignore + def test_delayed_reproject__memusage( + self, + fn: str, + chunksizes_in_mem: tuple[int, int], + dst_chunksizes: tuple[int, int], + dst_bounds_rel_shift: tuple[float, float], + cluster: Any, ): """ - Checks for the delayed reproject function. - Variables that influence specifically the delayed function are: - - Input/output chunksizes, - - Input array shape, - - Output geotransform relative to projected input geotransform, - - Output array shape relative to input. + Checks for the delayed reproject function for memory usage on a big file. + Variables that influence memory usage are: + - Source chunksizes in memory, + - Destination chunksizes in memory, + - Relative difference in resolution (potentially more/less source chunks to load for a destination chunk). 
""" - # Keeping this commented here if we need to redo local tests due to Rasterio errors - # fn = list_fn[0] - # chunksizes_in_mem = (2000, 2000) - # dst_chunksizes = (1398, 2983) # (2000, 2000) - # dst_bounds_rel_shift = (0, 0) - # dst_res_rel_fac = (0.45, 0.45) # (1, 1) - # dst_shape_diff = (0, 0) - # cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None) + # We fix arbitrary changes to the destination shape/resolution/bounds + # (already checked in details in the output tests) + dst_shape_diff = (25, -25) + dst_res_rel_fac = (1.5, 0.5) # 0/ Open dataset with chunks and define variables ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) @@ -401,14 +584,14 @@ def test_delayed_reproject( # Get input and output shape src_shape = darr.shape - dst_shape = (src_shape[0] + dst_shape_diff[0], src_shape[1] + dst_shape_diff[1]) + dst_shape = (src_shape[0], src_shape[1] + dst_shape_diff[1]) # Define arbitrary input/output CRS, they don't have a direct influence on the delayed method # (as long as the input/output transforms intersect if projected in the same CRS) src_crs = CRS(4326) dst_crs = CRS(32630) - # Define arbitrary input transform, as we only care about the relative difference of the output transform + # Define arbitrary input transform, as we only care about the relative difference with the output transform src_transform = rio.transform.from_bounds(10, 10, 15, 15, src_shape[0], src_shape[1]) # Other arguments having no influence @@ -439,7 +622,6 @@ def test_delayed_reproject( fn_tmp_out = os.path.join(_EXAMPLES_DIRECTORY, os.path.splitext(os.path.basename(fn))[0] + "_reproj.nc") def reproject_and_write(*args: Any, **kwargs: Any) -> None: - # Run delayed reprojection reproj_arr_tmp = delayed_reproject(*args, **kwargs) @@ -466,48 +648,4 @@ def reproject_and_write(*args: Any, **kwargs: Any) -> None: ) # Check the measured memory usage is smaller than the maximum estimated one - assert measured_op_memusage < max_op_memusage - - # 3/ Outputs check: load in memory and compare with a direct Rasterio reproject - reproj_arr = xr.open_dataset(fn_tmp_out)["test_reproj"].values - - dst_arr = np.zeros(dst_shape) - _ = rio.warp.reproject( - np.array(darr), - dst_arr, - src_transform=src_transform, - src_crs=src_crs, - dst_transform=dst_transform, - dst_crs=dst_crs, - resampling=resampling, - src_nodata=src_nodata, - dst_nodata=dst_nodata, - ) - - # Keeping this to visualize Rasterio resampling issue - # if PLOT: - # import matplotlib.pyplot as plt - # plt.figure() - # plt.imshow((reproj_arr - dst_arr), cmap="RdYlBu", vmin=-0.2, vmax=0.2, interpolation="None") - # plt.colorbar() - # plt.savefig("/home/atom/ongoing/diff_close_zero.png", dpi=500) - # plt.figure() - # plt.imshow(np.abs(reproj_arr - dst_arr), cmap="RdYlBu", vmin=99997, vmax=100001, interpolation="None") - # plt.colorbar() - # plt.savefig("/home/atom/ongoing/diff_nodata.png", dpi=500) - # plt.figure() - # plt.imshow(dst_arr, cmap="RdYlBu", vmin=-1, vmax=1, interpolation="None") - # plt.colorbar() - # plt.savefig("/home/atom/ongoing/dst.png", dpi=500) - - # Due to (what appears to be) Rasterio errors, we have to remain large here: even though some reprojections - # are pretty good, some can get a bit nasty - # Check that little data (less than 1% of pixels) are significantly different - ind_signif_diff = np.abs(reproj_arr - dst_arr) > 0.5 - assert np.count_nonzero(ind_signif_diff) < 0.01 * reproj_arr.size - - # The median difference should be negligible compared to the amplitude 
of the signal (+/- 1 std) - assert np.nanmedian(np.abs(reproj_arr - dst_arr)) < 0.1 - - # # Replace with allclose once Rasterio issue fixed? - # assert np.allclose(reproj_arr[~ind_both_nodata], dst_arr[~ind_both_nodata], atol=0.02) + assert measured_op_memusage < max_op_memusage \ No newline at end of file From a635c138b057fcdf533654c5948f9a0f2f4ab0af Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 17:49:42 -0800 Subject: [PATCH 12/22] Linting --- tests/test_raster/test_delayed.py | 52 +++++++++++++++++-------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 1478efec..42b16dad 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -242,7 +242,7 @@ class TestDelayed: In details: Set 1. We capture memory usage during the .compute() calls and check that only the expected amount of memory that we estimate independently (bytes used by one or several chunk combinations + metadata) is indeed used. - Set 2. We compare the outputs with the in-memory function specifically for input variables that influence the delayed + Set 2. We compare outputs with the in-memory function specifically for input variables that influence the delayed algorithm and might lead to new errors (for example: array shape to get subsample/points locations for subsample and interp_points, or destination chunksizes to map output of reproject). @@ -282,7 +282,7 @@ class TestDelayed: small_darr = rng.normal(size=small_shape[0] * small_shape[1]) # Add about half nodata values if w: - ind_nodata = rng.choice(small_darr.size, size=int(small_darr.size/2), replace=False) + ind_nodata = rng.choice(small_darr.size, size=int(small_darr.size / 2), replace=False) small_darr[list(ind_nodata)] = np.nan small_darr = small_darr.reshape(small_shape[0], small_shape[1]) list_small_darr.append(small_darr) @@ -322,8 +322,9 @@ def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tupl @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore @pytest.mark.parametrize("ninterp", [2, 100]) # type: ignore @pytest.mark.parametrize("res", [(0.5, 2), (1, 1)]) # type: ignore - def test_delayed_interp_points__output(self, darr: da.Array, chunksizes_in_mem: tuple[int, int], ninterp: int, - res: tuple[float, float]): + def test_delayed_interp_points__output( + self, darr: da.Array, chunksizes_in_mem: tuple[int, int], ninterp: int, res: tuple[float, float] + ): """ Checks for delayed interpolate points function. Variables that influence specifically the delayed function are: @@ -363,13 +364,13 @@ def test_delayed_interp_points__output(self, darr: da.Array, chunksizes_in_mem: # Same for shape @pytest.mark.parametrize("dst_shape_diff", [(0, 0), (-28, 117)]) # type: ignore def test_delayed_reproject__output( - self, - darr: da.Array, - chunksizes_in_mem: tuple[int, int], - dst_chunksizes: tuple[int, int], - dst_bounds_rel_shift: tuple[float, float], - dst_res_rel_fac: tuple[float, float], - dst_shape_diff: tuple[int, int], + self, + darr: da.Array, + chunksizes_in_mem: tuple[int, int], + dst_chunksizes: tuple[int, int], + dst_bounds_rel_shift: tuple[float, float], + dst_res_rel_fac: tuple[float, float], + dst_shape_diff: tuple[int, int], ): """ Checks for the delayed reproject function. 
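For reference, delayed_interp_points follows the same calling pattern as in the output tests (shapes, chunks and point coordinates are illustrative only; coordinates are expressed in the georeferenced units implied by resolution, with the origin at zero as in the tests):

import dask.array as da
import numpy as np

from geoutils.raster.delayed import delayed_interp_points

rng_da = da.random.default_rng(seed=42)
darr = rng_da.normal(size=100 * 100).reshape(100, 100).rechunk((25, 25))

rng_np = np.random.default_rng(seed=42)
pts_x = rng_np.uniform(0, 99, 50)
pts_y = rng_np.uniform(0, 99, 50)

# One interpolated value per point, in the same order as the input coordinates
vals = delayed_interp_points(darr, points=(pts_x, pts_y), resolution=(1, 1))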
@@ -427,7 +428,8 @@ def test_delayed_reproject__output( src_nodata=src_nodata, dst_nodata=dst_nodata, resampling=resampling, - dst_chunksizes=dst_chunksizes, ) + dst_chunksizes=dst_chunksizes, + ) # 3/ Outputs check: load in memory and compare with a direct Rasterio reproject reproj_arr = np.array(reproj_arr) @@ -461,7 +463,7 @@ def test_delayed_reproject__output( # plt.colorbar() # plt.savefig("/home/atom/ongoing/dst.png", dpi=500) - # Due to (what appears to be) Rasterio errors, we have to remain unprecise for the checks here: + # Due to (what appears to be) Rasterio errors, we have to remain imprecise for the checks here: # even though some reprojections are pretty good, some can get a bit nasty # Check that little data (less than 10% of pixels) are significantly different @@ -478,8 +480,9 @@ def test_delayed_reproject__output( @pytest.mark.parametrize("fn", [fn_large]) # type: ignore @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore @pytest.mark.parametrize("subsample_size", [100, 100000]) # type: ignore - def test_delayed_subsample__memusage(self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, - cluster: Any): + def test_delayed_subsample__memusage( + self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, cluster: Any + ): """ Checks for delayed subsampling function for memory usage on big file. (and also runs output checks as not long or too memory intensive in this case) @@ -523,8 +526,9 @@ def test_delayed_subsample__memusage(self, fn: str, chunksizes_in_mem: tuple[int @pytest.mark.parametrize("fn", [fn_large]) # type: ignore @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore @pytest.mark.parametrize("ninterp", [100, 1000000]) # type: ignore - def test_delayed_interp_points__memusage(self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, - cluster: Any): + def test_delayed_interp_points__memusage( + self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, cluster: Any + ): """ Checks for delayed interpolate points function for memory usage on a big file. Variables that influence memory usage are: @@ -558,12 +562,12 @@ def test_delayed_interp_points__memusage(self, fn: str, chunksizes_in_mem: tuple @pytest.mark.parametrize("dst_chunksizes", [(1000, 1000), (2500, 2500)]) # type: ignore @pytest.mark.parametrize("dst_bounds_rel_shift", [(1, 1), (2, 2)]) # type: ignore def test_delayed_reproject__memusage( - self, - fn: str, - chunksizes_in_mem: tuple[int, int], - dst_chunksizes: tuple[int, int], - dst_bounds_rel_shift: tuple[float, float], - cluster: Any, + self, + fn: str, + chunksizes_in_mem: tuple[int, int], + dst_chunksizes: tuple[int, int], + dst_bounds_rel_shift: tuple[float, float], + cluster: Any, ): """ Checks for the delayed reproject function for memory usage on a big file. 
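To fix orders of magnitude for these memory checks, a quick back-of-the-envelope computation using the base-overhead term of the estimate helpers above (numbers illustrative, matching the large test file and float64 data):

# A 10000 x 10000 float64 array split into (1000, 1000) chunks
num_chunks = (10000 // 1000) * (10000 // 1000)      # 100 chunks
chunk_size_mb = 8 * 1000 * 1000 / 2**20             # ~7.6 MB of raw data per chunk
base_memusage_mb = 50 + 10 * (num_chunks / 1000)    # ~51 MB of Dask background usage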
@@ -648,4 +652,4 @@ def reproject_and_write(*args: Any, **kwargs: Any) -> None: ) # Check the measured memory usage is smaller than the maximum estimated one - assert measured_op_memusage < max_op_memusage \ No newline at end of file + assert measured_op_memusage < max_op_memusage From a912734872455db1fbd852f5eed20e754672a397 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 19:27:56 -0800 Subject: [PATCH 13/22] Fix interp test for varying resolution by defining the coordinates --- geoutils/raster/delayed.py | 4 +- tests/test_raster/test_delayed.py | 246 ++++++++++++++++-------------- 2 files changed, 131 insertions(+), 119 deletions(-) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index 07da19d1..eed813c9 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -357,8 +357,8 @@ def delayed_interp_points( indexes_xi, indexes_yi = np.unravel_index(np.arange(len(blocks)), shape=(num_chunks[0], num_chunks[1])) block_ids = [ { - "xstart": starts[0][indexes_xi[i]] - map_depth[method], - "ystart": starts[1][indexes_yi[i]] - map_depth[method], + "xstart": (starts[0][indexes_xi[i]] - map_depth[method]) * resolution[0], + "ystart": (starts[1][indexes_yi[i]] - map_depth[method]) * resolution[1], "xres": resolution[0], "yres": resolution[1], } diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 42b16dad..9efaac37 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -2,6 +2,7 @@ from __future__ import annotations import os +import sys from tempfile import NamedTemporaryFile from typing import Any, Callable @@ -347,7 +348,8 @@ def test_delayed_interp_points__output( # Interpolate directly with Xarray (loads a lot in memory) and check results are exactly the same xx = xr.DataArray(interp_x, dims="z", name="x") yy = xr.DataArray(interp_y, dims="z", name="y") - ds = xr.DataArray(data=darr, dims=["x", "y"]) + ds = xr.DataArray(data=darr, dims=["x", "y"], coords={"x": np.arange(0, darr.shape[0] * res[0], res[0]), + "y": np.arange(0, darr.shape[1] * res[1], res[1])}) interp2 = ds.interp(x=xx, y=yy) interp2.compute() interp2 = np.array(interp2.values) @@ -393,6 +395,7 @@ def test_delayed_reproject__output( # 0/ Define input parameters # Get input and output shape + darr = darr.rechunk(chunksizes_in_mem) src_shape = darr.shape dst_shape = (src_shape[0] + dst_shape_diff[0], src_shape[1] + dst_shape_diff[1]) @@ -491,37 +494,40 @@ def test_delayed_subsample__memusage( - Chunksizes in memory. 
""" - # 0/ Open dataset with chunks - ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) - darr = ds["test"].data + # Only check on linux + if sys.platform == "linux": - # 1/ Estimation of theoretical memory usage of the subsampling script + # 0/ Open dataset with chunks + ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) + darr = ds["test"].data - max_op_memusage = _estimate_subsample_memusage( - darr=darr, chunksizes_in_mem=chunksizes_in_mem, subsample_size=subsample_size - ) + # 1/ Estimation of theoretical memory usage of the subsampling script - # 2/ Run delayed subsample with dask memory usage monitoring + max_op_memusage = _estimate_subsample_memusage( + darr=darr, chunksizes_in_mem=chunksizes_in_mem, subsample_size=subsample_size + ) - # Derive subsample from delayed function - # (passed to wrapper function to measure memory usage during execution) - sub, measured_op_memusage = _run_dask_measuring_memusage( - cluster, delayed_subsample, darr, subsample=subsample_size, random_state=42 - ) + # 2/ Run delayed subsample with dask memory usage monitoring - # Check the measured memory usage is smaller than the maximum estimated one - assert measured_op_memusage < max_op_memusage + # Derive subsample from delayed function + # (passed to wrapper function to measure memory usage during execution) + sub, measured_op_memusage = _run_dask_measuring_memusage( + cluster, delayed_subsample, darr, subsample=subsample_size, random_state=42 + ) - # 3/ Output checks - # The subsample should have exactly the prescribed length, with only valid values - assert len(sub) == subsample_size - assert all(np.isfinite(sub)) + # Check the measured memory usage is smaller than the maximum estimated one + assert measured_op_memusage < max_op_memusage - # To verify the sampling works correctly, we can get its subsample indices with the argument return_indices - # And compare to the same subsample with vindex (now that we know the coordinates of valid values sampled) - indices = delayed_subsample(darr, subsample=subsample_size, random_state=42, return_indices=True) - sub2 = np.array(darr.vindex[indices[0], indices[1]]) - assert np.array_equal(sub, sub2) + # 3/ Output checks + # The subsample should have exactly the prescribed length, with only valid values + assert len(sub) == subsample_size + assert all(np.isfinite(sub)) + + # To verify the sampling works correctly, we can get its subsample indices with the argument return_indices + # And compare to the same subsample with vindex (now that we know the coordinates of valid values sampled) + indices = delayed_subsample(darr, subsample=subsample_size, random_state=42, return_indices=True) + sub2 = np.array(darr.vindex[indices[0], indices[1]]) + assert np.array_equal(sub, sub2) @pytest.mark.parametrize("fn", [fn_large]) # type: ignore @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore @@ -536,26 +542,29 @@ def test_delayed_interp_points__memusage( - Chunksizes in memory. 
""" - # 0/ Open dataset with chunks and create random point locations to interpolate - ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) - darr = ds["test"].data + # Only check on linux + if sys.platform == "linux": - rng = np.random.default_rng(seed=42) - interp_x = rng.choice(ds.x.size, ninterp) + rng.random(ninterp) - interp_y = rng.choice(ds.y.size, ninterp) + rng.random(ninterp) + # 0/ Open dataset with chunks and create random point locations to interpolate + ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) + darr = ds["test"].data - # 1/ Estimation of theoretical memory usage of the subsampling script - max_op_memusage = _estimate_interp_points_memusage( - darr=darr, chunksizes_in_mem=chunksizes_in_mem, ninterp=ninterp - ) + rng = np.random.default_rng(seed=42) + interp_x = rng.choice(ds.x.size, ninterp) + rng.random(ninterp) + interp_y = rng.choice(ds.y.size, ninterp) + rng.random(ninterp) - # 2/ Run interpolation of random point coordinates with memory monitoring + # 1/ Estimation of theoretical memory usage of the subsampling script + max_op_memusage = _estimate_interp_points_memusage( + darr=darr, chunksizes_in_mem=chunksizes_in_mem, ninterp=ninterp + ) - interp1, measured_op_memusage = _run_dask_measuring_memusage( - cluster, delayed_interp_points, darr, points=(interp_x, interp_y), resolution=(1, 1) - ) - # Check the measured memory usage is smaller than the maximum estimated one - assert measured_op_memusage < max_op_memusage + # 2/ Run interpolation of random point coordinates with memory monitoring + + interp1, measured_op_memusage = _run_dask_measuring_memusage( + cluster, delayed_interp_points, darr, points=(interp_x, interp_y), resolution=(1, 1) + ) + # Check the measured memory usage is smaller than the maximum estimated one + assert measured_op_memusage < max_op_memusage @pytest.mark.parametrize("fn", [fn_large]) # type: ignore @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore @@ -577,79 +586,82 @@ def test_delayed_reproject__memusage( - Relative difference in resolution (potentially more/less source chunks to load for a destination chunk). 
""" - # We fix arbitrary changes to the destination shape/resolution/bounds - # (already checked in details in the output tests) - dst_shape_diff = (25, -25) - dst_res_rel_fac = (1.5, 0.5) - - # 0/ Open dataset with chunks and define variables - ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) - darr = ds["test"].data - - # Get input and output shape - src_shape = darr.shape - dst_shape = (src_shape[0], src_shape[1] + dst_shape_diff[1]) - - # Define arbitrary input/output CRS, they don't have a direct influence on the delayed method - # (as long as the input/output transforms intersect if projected in the same CRS) - src_crs = CRS(4326) - dst_crs = CRS(32630) - - # Define arbitrary input transform, as we only care about the relative difference with the output transform - src_transform = rio.transform.from_bounds(10, 10, 15, 15, src_shape[0], src_shape[1]) - - # Other arguments having no influence - src_nodata = -9999 - dst_nodata = 99999 - resampling = rio.enums.Resampling.bilinear - - # Get shifted dst_transform with new resolution - dst_transform = _build_dst_transform_shifted_newres( - src_transform=src_transform, - src_crs=src_crs, - dst_crs=dst_crs, - src_shape=src_shape, - bounds_rel_shift=dst_bounds_rel_shift, - res_rel_fac=dst_res_rel_fac, - ) - - # 1/ Estimation of theoretical memory usage of the subsampling script - - max_op_memusage = _estimate_reproject_memusage( - darr, chunksizes_in_mem=chunksizes_in_mem, dst_chunksizes=dst_chunksizes, rel_res_fac=dst_res_rel_fac - ) - - # 2/ Run delayed reproject with memory monitoring - - # We define a function where computes happens during writing to be able to measure memory usage - # (delayed_reproject returns a delayed array that might not fit in memory, unlike subsampling/interpolation) - fn_tmp_out = os.path.join(_EXAMPLES_DIRECTORY, os.path.splitext(os.path.basename(fn))[0] + "_reproj.nc") - - def reproject_and_write(*args: Any, **kwargs: Any) -> None: - # Run delayed reprojection - reproj_arr_tmp = delayed_reproject(*args, **kwargs) - - # Save file out-of-memory and compute - data_arr = xr.DataArray(data=reproj_arr_tmp, dims=["x", "y"]) - ds_out = xr.Dataset(data_vars={"test_reproj": data_arr}) - write_delayed = ds_out.to_netcdf(fn_tmp_out, compute=False) - write_delayed.compute() - - # And call this function with memory usage monitoring - _, measured_op_memusage = _run_dask_measuring_memusage( - cluster, - reproject_and_write, - darr, - src_transform=src_transform, - src_crs=src_crs, - dst_transform=dst_transform, - dst_crs=dst_crs, - dst_shape=dst_shape, - src_nodata=src_nodata, - dst_nodata=dst_nodata, - resampling=resampling, - dst_chunksizes=dst_chunksizes, - ) - - # Check the measured memory usage is smaller than the maximum estimated one - assert measured_op_memusage < max_op_memusage + # Only check on linux + if sys.platform == "linux": + + # We fix arbitrary changes to the destination shape/resolution/bounds + # (already checked in details in the output tests) + dst_shape_diff = (25, -25) + dst_res_rel_fac = (1.5, 0.5) + + # 0/ Open dataset with chunks and define variables + ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]}) + darr = ds["test"].data + + # Get input and output shape + src_shape = darr.shape + dst_shape = (src_shape[0], src_shape[1] + dst_shape_diff[1]) + + # Define arbitrary input/output CRS, they don't have a direct influence on the delayed method + # (as long as the input/output transforms intersect if projected in the same CRS) + 
src_crs = CRS(4326) + dst_crs = CRS(32630) + + # Define arbitrary input transform, as we only care about the relative difference with the output transform + src_transform = rio.transform.from_bounds(10, 10, 15, 15, src_shape[0], src_shape[1]) + + # Other arguments having no influence + src_nodata = -9999 + dst_nodata = 99999 + resampling = rio.enums.Resampling.bilinear + + # Get shifted dst_transform with new resolution + dst_transform = _build_dst_transform_shifted_newres( + src_transform=src_transform, + src_crs=src_crs, + dst_crs=dst_crs, + src_shape=src_shape, + bounds_rel_shift=dst_bounds_rel_shift, + res_rel_fac=dst_res_rel_fac, + ) + + # 1/ Estimation of theoretical memory usage of the subsampling script + + max_op_memusage = _estimate_reproject_memusage( + darr, chunksizes_in_mem=chunksizes_in_mem, dst_chunksizes=dst_chunksizes, rel_res_fac=dst_res_rel_fac + ) + + # 2/ Run delayed reproject with memory monitoring + + # We define a function where computes happens during writing to be able to measure memory usage + # (delayed_reproject returns a delayed array that might not fit in memory, unlike subsampling/interpolation) + fn_tmp_out = os.path.join(_EXAMPLES_DIRECTORY, os.path.splitext(os.path.basename(fn))[0] + "_reproj.nc") + + def reproject_and_write(*args: Any, **kwargs: Any) -> None: + # Run delayed reprojection + reproj_arr_tmp = delayed_reproject(*args, **kwargs) + + # Save file out-of-memory and compute + data_arr = xr.DataArray(data=reproj_arr_tmp, dims=["x", "y"]) + ds_out = xr.Dataset(data_vars={"test_reproj": data_arr}) + write_delayed = ds_out.to_netcdf(fn_tmp_out, compute=False) + write_delayed.compute() + + # And call this function with memory usage monitoring + _, measured_op_memusage = _run_dask_measuring_memusage( + cluster, + reproject_and_write, + darr, + src_transform=src_transform, + src_crs=src_crs, + dst_transform=dst_transform, + dst_crs=dst_crs, + dst_shape=dst_shape, + src_nodata=src_nodata, + dst_nodata=dst_nodata, + resampling=resampling, + dst_chunksizes=dst_chunksizes, + ) + + # Check the measured memory usage is smaller than the maximum estimated one + assert measured_op_memusage < max_op_memusage From a50b9de4a9603dc87ed6f938c9cbda2eae8660d1 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 19:28:16 -0800 Subject: [PATCH 14/22] Linting --- tests/test_raster/test_delayed.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 9efaac37..ea12bf5b 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -348,8 +348,14 @@ def test_delayed_interp_points__output( # Interpolate directly with Xarray (loads a lot in memory) and check results are exactly the same xx = xr.DataArray(interp_x, dims="z", name="x") yy = xr.DataArray(interp_y, dims="z", name="y") - ds = xr.DataArray(data=darr, dims=["x", "y"], coords={"x": np.arange(0, darr.shape[0] * res[0], res[0]), - "y": np.arange(0, darr.shape[1] * res[1], res[1])}) + ds = xr.DataArray( + data=darr, + dims=["x", "y"], + coords={ + "x": np.arange(0, darr.shape[0] * res[0], res[0]), + "y": np.arange(0, darr.shape[1] * res[1], res[1]), + }, + ) interp2 = ds.interp(x=xx, y=yy) interp2.compute() interp2 = np.array(interp2.values) From bd3469b66e2457031445982cf05ae47d62ed2a8c Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 26 Apr 2024 20:32:48 -0800 Subject: [PATCH 15/22] Remove small chunk memory usage check of interp_points --- 
tests/test_raster/test_delayed.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index ea12bf5b..cbfb63aa 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -113,13 +113,13 @@ def _estimate_interp_points_memusage(darr: da.Array, chunksizes_in_mem: tuple[in fac_dask_margin = 2.5 num_chunks = np.prod(darr.numblocks) - # Single chunk operation = (data type bytes + boolean from np.isfinite) * chunksize **2 - chunk_memusage = (darr.dtype.itemsize + np.dtype("bool").itemsize) * np.prod(chunksizes_in_mem) + # Single chunk operation = (data type bytes + boolean from np.isfinite + its subset) * overlapping chunksize **2 + chunk_memusage = (darr.dtype.itemsize + 2 * np.dtype("bool").itemsize) * np.prod(chunksizes_in_mem) # For interpolation, chunks have to overlap and temporarily load each neighbouring chunk, - # we add 8 neighbouring chunks - chunk_memusage += darr.dtype.itemsize * np.prod(chunksizes_in_mem) * 8 + # we add 8 neighbouring chunks, and double the size due to the memory used during interpolation + chunk_memusage *= 9 - # Outputs: interpolate coordinates + # Outputs: pair of interpolated coordinates out_memusage = np.dtype(darr.dtype).itemsize * ninterp * 2 # Size of metadata passed to dask: number of blocks times its content @@ -536,8 +536,8 @@ def test_delayed_subsample__memusage( assert np.array_equal(sub, sub2) @pytest.mark.parametrize("fn", [fn_large]) # type: ignore - @pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore - @pytest.mark.parametrize("ninterp", [100, 1000000]) # type: ignore + @pytest.mark.parametrize("chunksizes_in_mem", [(2000, 2000)]) # type: ignore + @pytest.mark.parametrize("ninterp", [100, 100000]) # type: ignore def test_delayed_interp_points__memusage( self, fn: str, chunksizes_in_mem: tuple[int, int], ninterp: int, cluster: Any ): @@ -565,7 +565,6 @@ def test_delayed_interp_points__memusage( ) # 2/ Run interpolation of random point coordinates with memory monitoring - interp1, measured_op_memusage = _run_dask_measuring_memusage( cluster, delayed_interp_points, darr, points=(interp_x, interp_y), resolution=(1, 1) ) From 40b423697d6d813dc5018f41d181840941005553 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Tue, 30 Apr 2024 14:08:31 -0800 Subject: [PATCH 16/22] Account for comments from amelie and amaury --- geoutils/raster/delayed.py | 110 ++++++++++++++---------------- tests/test_raster/test_delayed.py | 11 +-- 2 files changed, 59 insertions(+), 62 deletions(-) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index eed813c9..7c9e93f3 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -8,6 +8,7 @@ import dask.array as da import dask.delayed +from dask.utils import cached_cumsum import geopandas as gpd import numpy as np import pandas as pd @@ -18,6 +19,7 @@ from geoutils.projtools import _get_bounds_projected, _get_footprint_projected # 1/ SUBSAMPLING +# At the date of April 2024: # Getting an exact subsample size out-of-memory only for valid values is not supported directly by Dask/Xarray # It is not trivial because we don't know where valid values will be in advance, and because of ragged output (varying @@ -26,24 +28,7 @@ # usage by having to drop an axis and re-chunk along 1D of the 2D array, so we use the dask.delayed solution instead) -def _random_state_from_user_input( - random_state: np.random.RandomState | int | None = 
None, -) -> np.random.RandomState | np.random.Generator: - """Define random state based on varied user input.""" - - # Define state for random sampling (to fix results during testing) - # Not using the legacy random call is crucial for RAM usage: https://github.com/numpy/numpy/issues/14169 - if random_state is None: - rnd: np.random.RandomState | np.random.Generator = np.random.default_rng() - elif isinstance(random_state, np.random.RandomState): - rnd = random_state - else: - rnd = np.random.default_rng(seed=42) - - return rnd - - -def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: int) -> int: +def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: int, silence_max_subsample: bool) -> int: """Get subsample size based on a user input of either integer size or fraction of the number of valid points.""" # If value is between 0 and 1, use a fraction @@ -54,11 +39,12 @@ def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: # Use the number of valid points if larger than subsample asked by user npoints = min(int(subsample), total_nb_valids) if subsample > total_nb_valids: - warnings.warn( - f"Subsample value of {subsample} is larger than the number of valid pixels of {total_nb_valids}," - f"using all valid pixels as a subsample.", - category=UserWarning, - ) + if not silence_max_subsample: + warnings.warn( + f"Subsample value of {subsample} is larger than the number of valid pixels of {total_nb_valids}," + f"using all valid pixels as a subsample.", + category=UserWarning, + ) else: raise ValueError("Subsample must be > 0.") @@ -66,7 +52,7 @@ def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: def _get_indices_block_per_subsample( - xxs: NDArrayNum, num_chunks: tuple[int, int], nb_valids_per_block: list[int] + indices_1d: NDArrayNum, num_chunks: tuple[int, int], nb_valids_per_block: list[int] ) -> list[list[int]]: """ Get list of 1D valid subsample indices relative to the block for each block. @@ -76,7 +62,7 @@ def _get_indices_block_per_subsample( valid values in that block (while the input indices go from zero to the total number of valid values in the full array). - :param xxs: Subsample 1D indexes among a total number of valid values. + :param indices_1d: Subsample 1D indexes among a total number of valid values. :param num_chunks: Number of chunks in X and Y. :param nb_valids_per_block: Number of valid pixels per block. @@ -87,24 +73,24 @@ def _get_indices_block_per_subsample( valids_cumsum = np.cumsum(nb_valids_per_block) # We can write a faster algorithm by sorting - xxs = np.sort(xxs) + indices_1d = np.sort(indices_1d) # TODO: Write nested lists into array format to further save RAM? 
# We define a list of indices per block - relative_ind_per_block = [[] for _ in range(num_chunks[0] * num_chunks[1])] + relative_index_per_block = [[] for _ in range(num_chunks[0] * num_chunks[1])] k = 0 # K is the block number - for x in xxs: + for i in indices_1d: # Move to the next block K where current 1D subsample index is, if not in this one - while x >= valids_cumsum[k]: + while i >= valids_cumsum[k]: k += 1 # Add 1D subsample index relative to first subsample index of this block - first_xindex_block = valids_cumsum[k - 1] if k >= 1 else 0 # The first 1D valid subsample index of the block - relative_xindex = x - first_xindex_block - relative_ind_per_block[k].append(relative_xindex) + first_index_block = valids_cumsum[k - 1] if k >= 1 else 0 # The first 1D valid subsample index of the block + relative_index = i - first_index_block + relative_index_per_block[k].append(relative_index) - return relative_ind_per_block + return relative_index_per_block @dask.delayed # type: ignore @@ -142,7 +128,8 @@ def delayed_subsample( darr: da.Array, subsample: int | float = 1, return_indices: bool = False, - random_state: np.random.RandomState | int | None = None, + random_state: int | np.random.Generator | None = None, + silence_max_subsample: bool = False, ) -> NDArrayNum | tuple[NDArrayNum, NDArrayNum]: """ Subsample a raster at valid values on out-of-memory chunks. @@ -167,12 +154,14 @@ def delayed_subsample( If > 1 will be considered the number of valid pixels to extract. :param return_indices: If set to True, will return the extracted indices only. :param random_state: Random state, or seed number to use for random calculations. + :param silence_max_subsample: Whether to silence the warning for the subsample size being larger than the total + number of valid points (warns by default). :return: Subsample of values from the array (optionally, their indexes). 
""" # Get random state - rnd = _random_state_from_user_input(random_state=random_state) + rng = np.random.default_rng(random_state) # Compute number of valid points for each block out-of-memory blocks = darr.to_delayed().ravel() @@ -185,10 +174,11 @@ def delayed_subsample( total_nb_valids = np.sum(nb_valids_per_block) # Get subsample size (depending on user input) - subsample_size = _get_subsample_size_from_user_input(subsample=subsample, total_nb_valids=total_nb_valids) + subsample_size = _get_subsample_size_from_user_input(subsample=subsample, total_nb_valids=total_nb_valids, + silence_max_subsample=silence_max_subsample) # Get random 1D indexes for the subsample size - indices_1d = rnd.choice(total_nb_valids, subsample_size, replace=False) + indices_1d = rng.choice(total_nb_valids, subsample_size, replace=False) # Sort which indexes belong to which chunk ind_per_block = _get_indices_block_per_subsample( @@ -213,10 +203,10 @@ def delayed_subsample( # To return indices else: - # Get robust list of starts (using what is done in block_id of dask.array.map_blocks) + # Get starting 2D index for each chunk of the full array + # (mirroring what is done in block_id of dask.array.map_blocks) # https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 - from dask.utils import cached_cumsum - + # This list also includes the last index as well (not used here) starts = [cached_cumsum(c, initial_zero=True) for c in darr.chunks] num_chunks = darr.numblocks # Get the starts per 1D block ID by unravelling starting indexes for each block @@ -242,6 +232,7 @@ def delayed_subsample( # 2/ POINT INTERPOLATION ON REGULAR OR EQUAL GRID +# At the date of April 2024: # This functionality is not covered efficiently by Dask/Xarray, because they need to support rectilinear grids, which # is difficult when interpolating in the chunked dimensions, and loads nearly all array memory when using .interp(). @@ -268,6 +259,8 @@ def _get_interp_indices_per_block( # per block is not more computationally efficient? 
# (as it uses array instead of nested lists, and nested lists grow in RAM very fast) + # The argument "starts" contains the list of chunk first X/Y index for the full array, plus the last index + # We use one bucket per block, assuming a flattened blocks shape ind_per_block = [[] for _ in range(num_chunks[0] * num_chunks[1])] for i, (x, y) in enumerate(zip(interp_x, interp_y)): @@ -338,10 +331,8 @@ def delayed_interp_points( chunksize = darr.chunksize expanded = da.overlap.overlap(darr, depth=map_depth[method], boundary=np.nan) - # Get robust list of starts (using what is done in block_id of dask.array.map_blocks) - # https://github.com/dask/dask/blob/24493f58660cb933855ba7629848881a6e2458c1/dask/array/core.py#L908 - from dask.utils import cached_cumsum - + # Get starting 2D index for each chunk of the full array + # (mirroring what is done in block_id of dask.array.map_blocks) starts = [cached_cumsum(c, initial_zero=True) for c in darr.chunks] num_chunks = expanded.numblocks @@ -367,7 +358,7 @@ def delayed_interp_points( # Compute values delayed list_interp = [ - _delayed_interp_points_block(blocks[i], block_ids[i], points_arr[:, ind_per_block[i]]) + _delayed_interp_points_block(data_chunk, block_ids[i], points_arr[:, ind_per_block[i]]) for i, data_chunk in enumerate(blocks) if len(ind_per_block[i]) > 0 ] @@ -387,6 +378,7 @@ def delayed_interp_points( # 3/ REPROJECT +# At the date of April 2024: not supported by Rioxarray # Part of the code (defining a GeoGrid and GeoTiling classes) is inspired by # https://github.com/opendatacube/odc-geo/pull/88, modified to be concise, stand-alone and rely only on # Rasterio/GeoPandas @@ -434,24 +426,25 @@ def width(self) -> int: def res(self) -> tuple[int, int]: return self.transform[0], abs(self.transform[4]) - @property - def bounds(self) -> rio.coords.BoundingBox: - return rio.coords.BoundingBox(*rio.transform.array_bounds(self.height, self.width, self.transform)) - def bounds_projected(self, crs: rio.crs.CRS = None) -> rio.coords.BoundingBox: if crs is None: crs = self.crs - return _get_bounds_projected(bounds=self.bounds, in_crs=self.crs, out_crs=crs) + bounds = rio.coords.BoundingBox(*rio.transform.array_bounds(self.height, self.width, self.transform)) + return _get_bounds_projected(bounds=bounds, in_crs=self.crs, out_crs=crs) @property - def footprint(self) -> gpd.GeoDataFrame: - return _get_footprint_projected(self.bounds, in_crs=self.crs, out_crs=self.crs, densify_points=100) + def bounds(self) -> rio.coords.BoundingBox: + return self.bounds_projected() def footprint_projected(self, crs: rio.crs.CRS = None) -> gpd.GeoDataFrame: if crs is None: crs = self.crs return _get_footprint_projected(self.bounds, in_crs=self.crs, out_crs=crs, densify_points=100) + @property + def footprint(self) -> gpd.GeoDataFrame: + return self.footprint_projected() + @classmethod def from_dict(cls: type[GeoGridType], dict_meta: dict[str, Any]) -> GeoGridType: """Create a GeoGrid from a dictionary containing transform, shape and CRS.""" @@ -473,11 +466,11 @@ def shift( # Convert pixel offsets to georeferenced units if distance_unit == "pixel": - # Multiplying the offset by the resolution might amplify floating precision issues + # Can either multiply the offset by the resolution # xoff *= self.res[0] # yoff *= self.res[1] - # Using the boundaries instead! + # Or use the boundaries instead! (maybe less floating point issues? 
doesn't seem to matter in tests) xoff = xoff / self.shape[1] * (self.bounds.right - self.bounds.left) yoff = yoff / self.shape[0] * (self.bounds.top - self.bounds.bottom) @@ -499,6 +492,7 @@ def _get_block_ids_per_chunk(chunks: tuple[tuple[int, ...], tuple[int, ...]]) -> starts = [cached_cumsum(c, initial_zero=True) for c in chunks] nb_blocks = num_chunks[0] * num_chunks[1] ixi, iyi = np.unravel_index(np.arange(nb_blocks), shape=(num_chunks[0], num_chunks[1])) + # Starting and ending indexes "s" and "e" for both X/Y, to place the chunk in the full array block_ids = [ { "num_block": i, @@ -546,7 +540,7 @@ def get_blocks_as_geogrids(self) -> list[GeoGrid]: for bid in block_ids: # We get the block size block_shape = (bid["ye"] - bid["ys"], bid["xe"] - bid["xs"]) - # Build a temporary geogrid with the same transform as the full grid + # Build a temporary geogrid with the same transform as the full grid, but with the chunk shape geogrid_tmp = GeoGrid(transform=self.grid.transform, crs=self.grid.crs, shape=block_shape) # And shift it to the right location (X is positive in index direction, Y is negative) geogrid_block = geogrid_tmp.shift(xoff=bid["xs"], yoff=-bid["ys"]) @@ -622,7 +616,8 @@ def _delayed_reproject_per_block( # If no source chunk intersects, we return a chunk of destination nodata values if len(src_arrs) == 0: - dst_arr = np.zeros(combined_meta["dst_shape"], dtype=np.dtype("float64")) + # We can use float32 to return NaN, will be cast to other floating type later if that's not source array dtype + dst_arr = np.zeros(combined_meta["dst_shape"], dtype=np.dtype("float32")) dst_arr[:] = kwargs["dst_nodata"] return dst_arr @@ -654,6 +649,7 @@ def _delayed_reproject_per_block( resampling=kwargs["resampling"], src_nodata=kwargs["src_nodata"], dst_nodata=kwargs["dst_nodata"], + num_threads=1 # Force the number of threads to 1 to avoid Dask/Rasterio conflicting on multi-threading ) return dst_arr @@ -717,7 +713,7 @@ def delayed_reproject( # overlap, then map indexes of source blocks that intersect a given destination block src_footprints = src_geotiling.get_block_footprints(crs=dst_crs) dst_footprints = dst_geotiling.get_block_footprints().buffer(2 * max(dst_geogrid.res)) - dest2source = [np.where(dst.intersects(src_footprints).geometry.values)[0] for dst in dst_footprints] + dest2source = [np.where(dst.intersects(src_footprints).values)[0] for dst in dst_footprints] # 3/ To reconstruct a square source array during chunked reprojection, we need to derive the combined shape and # transform of each tuples of source blocks diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index cbfb63aa..ea257984 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -5,6 +5,7 @@ import sys from tempfile import NamedTemporaryFile from typing import Any, Callable +import time import dask.array as da import numpy as np @@ -247,7 +248,8 @@ class TestDelayed: algorithm and might lead to new errors (for example: array shape to get subsample/points locations for subsample and interp_points, or destination chunksizes to map output of reproject). - We start with output checks which run faster when ordered before (maybe due to the cluster memory monitoring after). + We start with set 2: output checks which run faster when ordered before + (maybe due to the cluster memory monitoring after). 
""" # Define random seed for generating test data @@ -306,7 +308,6 @@ def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tupl # 1/ We run the delayed function after re-chunking darr = darr.rechunk(chunksizes_in_mem) sub = delayed_subsample(darr, subsample=subsample_size, random_state=42) - # 2/ Output checks # # The subsample should have exactly the prescribed length, with only valid values @@ -390,9 +391,9 @@ def test_delayed_reproject__output( """ # Keeping this commented here if we need to redo local tests due to Rasterio errors - # fn = list_fn[0] - # chunksizes_in_mem = (2000, 2000) - # dst_chunksizes = (1398, 2983) # (2000, 2000) + # darr = list_small_darr[0] + # chunksizes_in_mem = list_small_chunksizes_in_mem[0] + # dst_chunksizes = list_small_chunksizes_in_mem[0] # (2000, 2000) # dst_bounds_rel_shift = (0, 0) # dst_res_rel_fac = (0.45, 0.45) # (1, 1) # dst_shape_diff = (0, 0) From f50083da431a0720223872ffa2e093bc556aa0dc Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Tue, 30 Apr 2024 14:09:12 -0800 Subject: [PATCH 17/22] Linting --- geoutils/raster/delayed.py | 13 ++++++++----- tests/test_raster/test_delayed.py | 1 - 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index 7c9e93f3..c22ef197 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -8,11 +8,11 @@ import dask.array as da import dask.delayed -from dask.utils import cached_cumsum import geopandas as gpd import numpy as np import pandas as pd import rasterio as rio +from dask.utils import cached_cumsum from scipy.interpolate import interpn from geoutils._typing import NDArrayNum @@ -28,7 +28,9 @@ # usage by having to drop an axis and re-chunk along 1D of the 2D array, so we use the dask.delayed solution instead) -def _get_subsample_size_from_user_input(subsample: int | float, total_nb_valids: int, silence_max_subsample: bool) -> int: +def _get_subsample_size_from_user_input( + subsample: int | float, total_nb_valids: int, silence_max_subsample: bool +) -> int: """Get subsample size based on a user input of either integer size or fraction of the number of valid points.""" # If value is between 0 and 1, use a fraction @@ -174,8 +176,9 @@ def delayed_subsample( total_nb_valids = np.sum(nb_valids_per_block) # Get subsample size (depending on user input) - subsample_size = _get_subsample_size_from_user_input(subsample=subsample, total_nb_valids=total_nb_valids, - silence_max_subsample=silence_max_subsample) + subsample_size = _get_subsample_size_from_user_input( + subsample=subsample, total_nb_valids=total_nb_valids, silence_max_subsample=silence_max_subsample + ) # Get random 1D indexes for the subsample size indices_1d = rng.choice(total_nb_valids, subsample_size, replace=False) @@ -649,7 +652,7 @@ def _delayed_reproject_per_block( resampling=kwargs["resampling"], src_nodata=kwargs["src_nodata"], dst_nodata=kwargs["dst_nodata"], - num_threads=1 # Force the number of threads to 1 to avoid Dask/Rasterio conflicting on multi-threading + num_threads=1, # Force the number of threads to 1 to avoid Dask/Rasterio conflicting on multi-threading ) return dst_arr diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index ea257984..c32455e1 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -5,7 +5,6 @@ import sys from tempfile import NamedTemporaryFile from typing import Any, Callable -import time import dask.array as da import numpy as np From 
a9fe770a0926811ded78509916347aee48998126 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 17 May 2024 17:05:33 -0800 Subject: [PATCH 18/22] Increase base memory usage to avoid random test failures --- tests/test_raster/test_delayed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index c32455e1..266ffba5 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -95,8 +95,8 @@ def _estimate_subsample_memusage(darr: da.Array, chunksizes_in_mem: tuple[int, i # Final estimate of memory usage of operation in MB max_op_memusage = fac_dask_margin * (chunk_memusage + sample_memusage + out_memusage + meta_memusage) / (2**20) - # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) - max_op_memusage += 50 + 10 * (num_chunks / 1000) + # We add a base memory usage of ~80 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) + max_op_memusage += 80 + 10 * (num_chunks / 1000) return max_op_memusage @@ -134,8 +134,8 @@ def _estimate_interp_points_memusage(darr: da.Array, chunksizes_in_mem: tuple[in # Final estimate of memory usage of operation in MB max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2**20) - # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) - max_op_memusage += 50 + 10 * (num_chunks / 1000) + # We add a base memory usage of ~80 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) + max_op_memusage += 80 + 10 * (num_chunks / 1000) return max_op_memusage @@ -184,8 +184,8 @@ def _estimate_reproject_memusage( # Final estimate of memory usage of operation in MB max_op_memusage = fac_dask_margin * (chunk_memusage + out_memusage + meta_memusage) / (2**20) - # We add a base memory usage of ~50 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) - max_op_memusage += 50 + 10 * (num_chunks / 1000) + # We add a base memory usage of ~80 MB + 10MB per 1000 chunks (loaded in background by Dask even on tiny data) + max_op_memusage += 80 + 10 * (num_chunks / 1000) return max_op_memusage From 7c8e1db75dfec9c0285cf962c86ddd4f4efc83a8 Mon Sep 17 00:00:00 2001 From: Amelie Date: Tue, 21 May 2024 14:19:34 +0200 Subject: [PATCH 19/22] feat: allowing boolean array in delayed_subsample --- geoutils/raster/delayed.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index c22ef197..9522cfc5 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -1,6 +1,7 @@ """ Module for dask-delayed functions for out-of-memory raster operations. 
""" + from __future__ import annotations import warnings @@ -15,7 +16,7 @@ from dask.utils import cached_cumsum from scipy.interpolate import interpn -from geoutils._typing import NDArrayNum +from geoutils._typing import NDArrayBool, NDArrayNum from geoutils.projtools import _get_bounds_projected, _get_footprint_projected # 1/ SUBSAMPLING @@ -96,18 +97,22 @@ def _get_indices_block_per_subsample( @dask.delayed # type: ignore -def _delayed_nb_valids(arr_chunk: NDArrayNum) -> NDArrayNum: +def _delayed_nb_valids(arr_chunk: NDArrayNum | NDArrayBool) -> NDArrayNum: """Count number of valid values per block.""" + if arr_chunk.dtype == "bool": + return np.array([np.count_nonzero(arr_chunk)]).reshape((1, 1)) return np.array([np.count_nonzero(np.isfinite(arr_chunk))]).reshape((1, 1)) @dask.delayed # type: ignore -def _delayed_subsample_block(arr_chunk: NDArrayNum, subsample_indices: NDArrayNum) -> NDArrayNum: +def _delayed_subsample_block( + arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum +) -> NDArrayNum | NDArrayBool: """Subsample the valid values at the corresponding 1D valid indices per block.""" - s_chunk = arr_chunk[np.isfinite(arr_chunk)][subsample_indices] - - return s_chunk + if arr_chunk.dtype == "bool": + return arr_chunk[arr_chunk][subsample_indices] # type: ignore + return arr_chunk[np.isfinite(arr_chunk)][subsample_indices] @dask.delayed # type: ignore @@ -116,8 +121,13 @@ def _delayed_subsample_indices_block( ) -> NDArrayNum: """Return 2D indices from the subsampled 1D valid indices per block.""" - # Unravel indices of valid data to the shape of the block - ix, iy = np.unravel_index(np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape) + if arr_chunk.dtype == "bool": + ix, iy = np.unravel_index(np.argwhere(arr_chunk.flatten())[subsample_indices], shape=arr_chunk.shape) + else: + # Unravel indices of valid data to the shape of the block + ix, iy = np.unravel_index( + np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape + ) # Convert to full-array indexes by adding the row and column starting indexes for this block ix += block_id["xstart"] @@ -722,9 +732,11 @@ def delayed_reproject( # transform of each tuples of source blocks src_block_ids = np.array(src_geotiling.get_block_locations()) meta_params = [ - _combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid) - if len(sbid) > 0 - else ({}, []) + ( + _combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid) + if len(sbid) > 0 + else ({}, []) + ) for sbid in dest2source ] # We also add the output transform/shape for this destination chunk in the combined meta From 893f46b21e914d108e3cf761497bc107ff5915b8 Mon Sep 17 00:00:00 2001 From: Amelie Date: Wed, 22 May 2024 10:23:55 +0200 Subject: [PATCH 20/22] docs: add boolean note to delayed_subsample --- geoutils/raster/delayed.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py index 9522cfc5..7435a222 100644 --- a/geoutils/raster/delayed.py +++ b/geoutils/raster/delayed.py @@ -111,13 +111,13 @@ def _delayed_subsample_block( """Subsample the valid values at the corresponding 1D valid indices per block.""" if arr_chunk.dtype == "bool": - return arr_chunk[arr_chunk][subsample_indices] # type: ignore + return arr_chunk[arr_chunk][subsample_indices] return arr_chunk[np.isfinite(arr_chunk)][subsample_indices] @dask.delayed # type: ignore def 
_delayed_subsample_indices_block( - arr_chunk: NDArrayNum, subsample_indices: NDArrayNum, block_id: dict[str, Any] + arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum, block_id: dict[str, Any] ) -> NDArrayNum: """Return 2D indices from the subsampled 1D valid indices per block.""" @@ -161,7 +161,10 @@ def delayed_subsample( return_indices=True, then sample your arrays out-of-memory with .vindex[indices[0], indices[1]] (this assumes that these arrays have valid values at the same locations). - :param darr: Input dask array. + Only valid values are sampled. If passing a numerical array, then only finite values are considered valid values. + If passing a boolean array, then only True values are considered valid values. + + :param darr: Input dask array. This can be a boolean or a numerical array. :param subsample: Subsample size. If <= 1, will be considered a fraction of valid pixels to extract. If > 1 will be considered the number of valid pixels to extract. :param return_indices: If set to True, will return the extracted indices only. From 31c7f390f8b18da2f393e95c43efd5d31d5f30ff Mon Sep 17 00:00:00 2001 From: Amelie Date: Wed, 22 May 2024 10:24:19 +0200 Subject: [PATCH 21/22] tests: add tests for boolean input array for delayed_subsample --- tests/test_raster/test_delayed.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index c32455e1..ac207070 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -1,4 +1,5 @@ """Tests for dask-delayed functions.""" + from __future__ import annotations import os @@ -292,10 +293,18 @@ class TestDelayed: # List of in-memory chunksize for small tests list_small_chunksizes_in_mem = [(10, 10), (7, 19)] - @pytest.mark.parametrize("darr", list_small_darr) # type: ignore + # create a corresponding boolean array for each numerical dask array + # every finite numerical value (valid numerical value) corresponds to True (valid boolean value). + darr_bool = [] + for small_darr in list_small_darr: + darr_bool.append(da.where(da.isfinite(small_darr), True, False)) + + @pytest.mark.parametrize("darr, darr_bool", list(zip(list_small_darr, darr_bool))) # type: ignore @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore @pytest.mark.parametrize("subsample_size", [2, 100, 100000]) # type: ignore - def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int): + def test_delayed_subsample__output( + self, darr: da.Array, darr_bool: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int + ): """ Checks for delayed subsampling function for output accuracy. Variables that influence specifically the delayed function and might lead to new errors are: @@ -319,6 +328,15 @@ def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tupl sub2 = np.array(darr.vindex[indices[0], indices[1]]) assert np.array_equal(sub, sub2) + # Finally, to verify that a boolean array, with valid values at the same locations as the numerical array, + # leads to the same results, we compare the samples values and the samples indices. 
+ darr_bool = darr_bool.rechunk(chunksizes_in_mem) + indices_bool = delayed_subsample(darr_bool, subsample=subsample_size, random_state=42, return_indices=True) + sub_bool = np.array(darr.vindex[indices_bool]) + assert np.array_equal(sub, sub_bool) + assert np.array_equal(indices, indices_bool) + + @pytest.mark.parametrize("darr", list_small_darr) # type: ignore @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore @pytest.mark.parametrize("ninterp", [2, 100]) # type: ignore From 431e8a349882b486c305bdf0f39e48862b8d56cb Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Wed, 22 May 2024 10:55:21 -0800 Subject: [PATCH 22/22] Linting --- tests/test_raster/test_delayed.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py index 65c2d6eb..e87ec91a 100644 --- a/tests/test_raster/test_delayed.py +++ b/tests/test_raster/test_delayed.py @@ -293,8 +293,8 @@ class TestDelayed: # List of in-memory chunksize for small tests list_small_chunksizes_in_mem = [(10, 10), (7, 19)] - # create a corresponding boolean array for each numerical dask array - # every finite numerical value (valid numerical value) corresponds to True (valid boolean value). + # Create a corresponding boolean array for each numerical dask array + # Every finite numerical value (valid numerical value) corresponds to True (valid boolean value). darr_bool = [] for small_darr in list_small_darr: darr_bool.append(da.where(da.isfinite(small_darr), True, False)) @@ -336,7 +336,6 @@ def test_delayed_subsample__output( assert np.array_equal(sub, sub_bool) assert np.array_equal(indices, indices_bool) - @pytest.mark.parametrize("darr", list_small_darr) # type: ignore @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore @pytest.mark.parametrize("ninterp", [2, 100]) # type: ignore
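For quick reference, below is a minimal usage sketch of the subsampling entry point exercised by the tests above, including the boolean-mask path added in the last patches. The array contents, sizes and chunk sizes are illustrative assumptions only, not values taken from the patch series.

# Minimal usage sketch of delayed_subsample (illustrative values, not part of the patches above)
import dask.array as da
import numpy as np

from geoutils.raster.delayed import delayed_subsample

# Build a small chunked array with some NaNs standing in for invalid values
rng = np.random.default_rng(42)
arr = rng.normal(size=(100, 100))
arr[arr > 2] = np.nan
darr = da.from_array(arr, chunks=(25, 25))

# Draw 500 values from the finite (valid) pixels, out-of-memory, with a fixed random state
sub = delayed_subsample(darr, subsample=500, random_state=42)

# Or get the 2D indices of the same subsample and index the array lazily afterwards
indices = delayed_subsample(darr, subsample=500, random_state=42, return_indices=True)
sub2 = np.array(darr.vindex[indices[0], indices[1]])

# A boolean mask that is True at the same valid locations (and has the same chunks)
# yields the same indices, as checked in test_delayed_subsample__output
mask = da.isfinite(darr)
indices_bool = delayed_subsample(mask, subsample=500, random_state=42, return_indices=True)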