Skip to content

Commit

Permalink
Merge pull request #1138 from dandi/digest-empty-zarr
Browse files Browse the repository at this point in the history
Make internal Zarr-checksumming tree types support empty Zarrs
  • Loading branch information
yarikoptic authored Oct 14, 2022
2 parents 4f50102 + f298c46 commit 1b971d6
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
10 changes: 10 additions & 0 deletions dandi/cli/tests/test_digest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from pathlib import Path

from click.testing import CliRunner
Expand Down Expand Up @@ -52,3 +53,12 @@ def test_digest_zarr():
r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
assert r.exit_code == 0
assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"


def test_digest_empty_zarr(tmp_path: Path) -> None:
    """
    Check that ``digest --digest zarr-checksum`` succeeds on a Zarr directory
    with no entries and prints the fixed checksum used for empty Zarrs.
    """
    expected_output = "empty.zarr: 481a2f77ab786a0f45aafd5db0971caa-0--0\n"
    cli = CliRunner()
    # The empty Zarr is created inside the runner's isolated filesystem, so
    # the command can be given the bare relative name.
    with cli.isolated_filesystem():
        os.mkdir("empty.zarr")
        result = cli.invoke(digest, ["--digest", "zarr-checksum", "empty.zarr"])
        assert result.exit_code == 0
        assert result.output == expected_output
30 changes: 25 additions & 5 deletions dandi/support/digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import hashlib
import logging
import os.path
from pathlib import Path
from pathlib import Path, PurePath
from typing import Dict, List, Optional, Tuple, Union, cast

from dandischema.digests.dandietag import DandiETag
Expand Down Expand Up @@ -123,10 +123,10 @@ def digest_file(f: Path) -> Tuple[Path, str, int]:
dgst = md5file_nocache(f)
return (f, dgst, os.path.getsize(f))

zcc = ZCDirectory()
zcc = ZCTree()
for p, digest, size in threaded_walk(path, digest_file):
zcc.add(p.relative_to(path), digest, size)
return zcc.get_digest_size()[0]
return zcc.get_digest()


@dataclass
Expand Down Expand Up @@ -169,10 +169,22 @@ def get_digest_size(self) -> Tuple[str, int]:
size += sz
return (cast(str, get_checksum(files, dirs)), size)

def add(self, path: Path, digest: str, size: int) -> None:

@dataclass
class ZCTree:
"""
Tree root node used for building an in-memory tree of Zarr entries and
their digests when calculating a complete Zarr checksum.

:meta private:
"""

tree: ZCDirectory = field(init=False, default_factory=ZCDirectory)

def add(self, path: PurePath, digest: str, size: int) -> None:
*dirs, name = path.parts
parts = []
d = self
d = self.tree
for dirname in dirs:
parts.append(dirname)
e = d.children.setdefault(dirname, ZCDirectory())
Expand All @@ -185,6 +197,14 @@ def add(self, path: Path, digest: str, size: int) -> None:
assert name not in d.children, f"File {pstr} encountered twice"
d.children[name] = ZCFile(digest=digest, size=size)

def get_digest(self) -> str:
    """
    Return the Zarr checksum for the entries added to this tree.

    A tree whose root has no children yields the fixed checksum for an
    empty Zarr; otherwise the digest half of the root directory's
    ``get_digest_size()`` result is returned.
    """
    root = self.tree
    if not root.children:
        # get_checksum() refuses to operate on empty directories, so the
        # checksum for an empty Zarr is supplied here directly.
        return "481a2f77ab786a0f45aafd5db0971caa-0--0"
    checksum, _size = root.get_digest_size()
    return checksum


def md5file_nocache(filepath: Union[str, Path]) -> str:
"""
Expand Down
5 changes: 1 addition & 4 deletions dandi/support/tests/test_digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from pathlib import Path

import pytest
from pytest_mock import MockerFixture

from .. import digests
Expand Down Expand Up @@ -79,9 +78,7 @@ def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
assert get_zarr_checksum(tmp_path) == "25627e0fc7c609d10100d020f7782a25-8--197"
assert get_zarr_checksum(sub1) == "64af93ad7f8d471c00044d1ddbd4c0ba-4--97"

with pytest.raises(ValueError) as excinfo:
get_zarr_checksum(empty)
assert str(excinfo.value) == "Cannot compute a Zarr checksum for an empty directory"
assert get_zarr_checksum(empty) == "481a2f77ab786a0f45aafd5db0971caa-0--0"

spy = mocker.spy(digests, "md5file_nocache")
assert (
Expand Down

0 comments on commit 1b971d6

Please sign in to comment.