From f298c4601593c8aff7ddeeff0525872ad2341894 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 13 Oct 2022 17:18:38 -0400 Subject: [PATCH] Make internal Zarr-checksumming tree types support empty Zarrs --- dandi/cli/tests/test_digest.py | 10 ++++++++++ dandi/support/digests.py | 30 ++++++++++++++++++++++++----- dandi/support/tests/test_digests.py | 5 +---- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/dandi/cli/tests/test_digest.py b/dandi/cli/tests/test_digest.py index 3ead01109..c00ff0687 100644 --- a/dandi/cli/tests/test_digest.py +++ b/dandi/cli/tests/test_digest.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from click.testing import CliRunner @@ -52,3 +53,12 @@ def test_digest_zarr(): r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"]) assert r.exit_code == 0 assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n" + + +def test_digest_empty_zarr(tmp_path: Path) -> None: + runner = CliRunner() + with runner.isolated_filesystem(): + os.mkdir("empty.zarr") + r = runner.invoke(digest, ["--digest", "zarr-checksum", "empty.zarr"]) + assert r.exit_code == 0 + assert r.output == "empty.zarr: 481a2f77ab786a0f45aafd5db0971caa-0--0\n" diff --git a/dandi/support/digests.py b/dandi/support/digests.py index 7644a0044..d374bf701 100644 --- a/dandi/support/digests.py +++ b/dandi/support/digests.py @@ -15,7 +15,7 @@ import hashlib import logging import os.path -from pathlib import Path +from pathlib import Path, PurePath from typing import Dict, List, Optional, Tuple, Union, cast from dandischema.digests.dandietag import DandiETag @@ -123,10 +123,10 @@ def digest_file(f: Path) -> Tuple[Path, str, int]: dgst = md5file_nocache(f) return (f, dgst, os.path.getsize(f)) - zcc = ZCDirectory() + zcc = ZCTree() for p, digest, size in threaded_walk(path, digest_file): zcc.add(p.relative_to(path), digest, size) - return zcc.get_digest_size()[0] + return zcc.get_digest() @dataclass @@ -169,10 +169,22 @@ def get_digest_size(self) -> Tuple[str, int]: size += sz return (cast(str, get_checksum(files, dirs)), size) - def add(self, path: Path, digest: str, size: int) -> None: + +@dataclass +class ZCTree: + """ + Tree root node used for building an in-memory tree of Zarr entries and + their digests when calculating a complete Zarr checksum + + :meta private: + """ + + tree: ZCDirectory = field(init=False, default_factory=ZCDirectory) + + def add(self, path: PurePath, digest: str, size: int) -> None: *dirs, name = path.parts parts = [] - d = self + d = self.tree for dirname in dirs: parts.append(dirname) e = d.children.setdefault(dirname, ZCDirectory()) @@ -185,6 +197,14 @@ def add(self, path: Path, digest: str, size: int) -> None: assert name not in d.children, f"File {pstr} encountered twice" d.children[name] = ZCFile(digest=digest, size=size) + def get_digest(self) -> str: + if self.tree.children: + return self.tree.get_digest_size()[0] + else: + # get_checksum() refuses to operate on empty directories, so we + # return the checksum for an empty Zarr ourselves: + return "481a2f77ab786a0f45aafd5db0971caa-0--0" + def md5file_nocache(filepath: Union[str, Path]) -> str: """ diff --git a/dandi/support/tests/test_digests.py b/dandi/support/tests/test_digests.py index e19a9532b..d382b6cb7 100644 --- a/dandi/support/tests/test_digests.py +++ b/dandi/support/tests/test_digests.py @@ -9,7 +9,6 @@ from pathlib import Path -import pytest from pytest_mock import MockerFixture from .. import digests @@ -79,9 +78,7 @@ def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None: assert get_zarr_checksum(tmp_path) == "25627e0fc7c609d10100d020f7782a25-8--197" assert get_zarr_checksum(sub1) == "64af93ad7f8d471c00044d1ddbd4c0ba-4--97" - with pytest.raises(ValueError) as excinfo: - get_zarr_checksum(empty) - assert str(excinfo.value) == "Cannot compute a Zarr checksum for an empty directory" + assert get_zarr_checksum(empty) == "481a2f77ab786a0f45aafd5db0971caa-0--0" spy = mocker.spy(digests, "md5file_nocache") assert (