diff --git a/dandi/cli/tests/test_digest.py b/dandi/cli/tests/test_digest.py
index c00ff0687..550788116 100644
--- a/dandi/cli/tests/test_digest.py
+++ b/dandi/cli/tests/test_digest.py
@@ -1,5 +1,6 @@
 import os
 from pathlib import Path
+import subprocess
 
 from click.testing import CliRunner
 import numpy as np
@@ -62,3 +63,23 @@ def test_digest_empty_zarr(tmp_path: Path) -> None:
     r = runner.invoke(digest, ["--digest", "zarr-checksum", "empty.zarr"])
     assert r.exit_code == 0
     assert r.output == "empty.zarr: 481a2f77ab786a0f45aafd5db0971caa-0--0\n"
+
+
+def test_digest_zarr_with_excluded_dotfiles():
+    # This test assumes that the Zarr serialization format never changes
+    runner = CliRunner()
+    with runner.isolated_filesystem():
+        dt = np.dtype("
 None:
         d = dirs.popleft()
         is_empty = True
         for p in list(d.iterdir()):
-            if (
+            if exclude_from_zarr(p):
+                is_empty = False
+            elif (
                 p.is_file()
                 and p.relative_to(zarr_basepath).as_posix() not in remote_paths
             ):
diff --git a/dandi/files/_private.py b/dandi/files/_private.py
index 3d0681ba2..c7db67d0e 100644
--- a/dandi/files/_private.py
+++ b/dandi/files/_private.py
@@ -34,9 +34,9 @@ class DandiFileType(Enum):
     @staticmethod
     def classify(path: Path) -> DandiFileType:
         if path.is_dir():
-            if not any(path.iterdir()):
-                raise UnknownAssetError("Empty directories cannot be assets")
             if path.suffix in ZARR_EXTENSIONS:
+                if is_empty_zarr(path):
+                    raise UnknownAssetError("Empty directories cannot be Zarr assets")
                 return DandiFileType.ZARR
             raise UnknownAssetError(
                 f"Directory has unrecognized suffix {path.suffix!r}"
             )
@@ -95,3 +95,9 @@ def __call__(self, filepath: Path, path: str) -> DandiFile:
             )
             self.bids_dataset_description.dataset_files.append(df)
         return df
+
+
+def is_empty_zarr(path: Path) -> bool:
+    """:meta private:"""
+    zf = ZarrAsset(filepath=path, path=path.name)
+    return not any(zf.iterfiles())
diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py
index 088bdab6e..212877bdb 100644
--- a/dandi/files/zarr.py
+++ b/dandi/files/zarr.py
@@ -33,7 +33,7 @@
 from dandi.metadata import get_default_metadata
 from dandi.misctypes import BasePath, Digest
 from dandi.support.digests import get_digest, get_zarr_checksum, md5file_nocache
-from dandi.utils import chunked, pluralize
+from dandi.utils import chunked, exclude_from_zarr, pluralize
 
 from .bases import LocalDirectoryAsset
 from ..validate_types import Scope, Severity, ValidationOrigin, ValidationResult
@@ -80,6 +80,8 @@ def is_dir(self) -> bool:
 
     def iterdir(self) -> Iterator[LocalZarrEntry]:
         for p in self.filepath.iterdir():
+            if exclude_from_zarr(p):
+                continue
             if p.is_dir() and not any(p.iterdir()):
                 # Ignore empty directories
                 continue
@@ -218,11 +220,7 @@ def get_validation_errors(
                     message="Zarr group is empty.",
                 )
             ]
-        try:
-            next(self.filepath.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH)))
-        except StopIteration:
-            pass
-        else:
+        if self._is_too_deep():
             msg = f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep"
             if devel_debug:
                 raise ValueError(msg)
@@ -244,6 +242,12 @@ def get_validation_errors(
             schema_version=schema_version, devel_debug=devel_debug
         )
 
+    def _is_too_deep(self) -> bool:
+        for e in self.iterfiles():
+            if len(e.parts) >= MAX_ZARR_DEPTH + 1:
+                return True
+        return False
+
     def iter_upload(
         self,
         dandiset: RemoteDandiset,
diff --git a/dandi/support/digests.py b/dandi/support/digests.py
index d374bf701..c0295be22 100644
--- a/dandi/support/digests.py
+++ b/dandi/support/digests.py
@@ -23,7 +23,7 @@
 from fscacher import PersistentCache
 
 from .threaded_walk import threaded_walk
-from ..utils import auto_repr
+from ..utils import auto_repr, exclude_from_zarr
 
 lgr = logging.getLogger("dandi.support.digests")
 
@@ -124,7 +124,7 @@ def digest_file(f: Path) -> Tuple[Path, str, int]:
         return (f, dgst, os.path.getsize(f))
 
     zcc = ZCTree()
-    for p, digest, size in threaded_walk(path, digest_file):
+    for p, digest, size in threaded_walk(path, digest_file, exclude=exclude_from_zarr):
         zcc.add(p.relative_to(path), digest, size)
     return zcc.get_digest()
diff --git a/dandi/support/threaded_walk.py b/dandi/support/threaded_walk.py
index bab81338b..bf4fd3327 100644
--- a/dandi/support/threaded_walk.py
+++ b/dandi/support/threaded_walk.py
@@ -28,6 +28,7 @@ def threaded_walk(
     dirpath: Union[str, Path],
     func: Optional[Callable[[Path], Any]] = None,
     threads: int = 60,
+    exclude: Optional[Callable[[Path], Any]] = None,
 ) -> Iterable[Any]:
     if not os.path.isdir(dirpath):
         return
@@ -54,7 +55,9 @@ def worker() -> None:
                 break
             try:
                 for p in path.iterdir():
-                    if p.is_dir():
+                    if exclude is not None and exclude(p):
+                        log.debug("Excluding %s from traversal", p)
+                    elif p.is_dir():
                         with lock:
                             tasks += 1
                         paths.append(p)
diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py
index 4734bac62..7570b4ecd 100644
--- a/dandi/tests/test_download.py
+++ b/dandi/tests/test_download.py
@@ -344,6 +344,40 @@ def test_download_different_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset)
     )
 
+
+def test_download_different_zarr_onto_excluded_dotfiles(
+    tmp_path: Path, zarr_dandiset: SampleDandiset
+) -> None:
+    dd = tmp_path / zarr_dandiset.dandiset_id
+    dd.mkdir()
+    zarr_path = dd / "sample.zarr"
+    zarr.save(zarr_path, np.eye(5))
+    (zarr_path / ".git").touch()
+    (zarr_path / ".dandi").mkdir()
+    (zarr_path / ".dandi" / "somefile.txt").touch()
+    (zarr_path / ".datalad").mkdir()
+    (zarr_path / ".gitattributes").touch()
+    (zarr_path / "arr_0").mkdir()
+    (zarr_path / "arr_0" / ".gitmodules").touch()
+    download(
+        zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different"
+    )
+    assert list_paths(zarr_path, dirs=True, exclude_vcs=False) == [
+        zarr_path / ".dandi",
+        zarr_path / ".dandi" / "somefile.txt",
+        zarr_path / ".datalad",
+        zarr_path / ".git",
+        zarr_path / ".gitattributes",
+        zarr_path / ".zgroup",
+        zarr_path / "arr_0",
+        zarr_path / "arr_0" / ".gitmodules",
+        zarr_path / "arr_0" / ".zarray",
+        zarr_path / "arr_0" / "0",
+        zarr_path / "arr_1",
+        zarr_path / "arr_1" / ".zarray",
+        zarr_path / "arr_1" / "0",
+    ]
+
 
 def test_download_different_zarr_delete_dir(
     new_dandiset: SampleDandiset, tmp_path: Path
 ) -> None:
diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py
index 3d9859f83..4c95043ce 100644
--- a/dandi/tests/test_files.py
+++ b/dandi/tests/test_files.py
@@ -1,15 +1,19 @@
 from operator import attrgetter
 from pathlib import Path
+import subprocess
 from typing import cast
 from unittest.mock import ANY
 
 from dandischema.models import get_schema_version
 import numpy as np
+import pytest
 import zarr
 
+from .fixtures import SampleDandiset
 from .. import get_logger
 from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
 from ..dandiapi import AssetType, RemoteZarrAsset
+from ..exceptions import UnknownAssetError
 from ..files import (
     BIDSDatasetDescriptionAsset,
     DandisetMetadataFile,
@@ -222,6 +226,23 @@ def test_find_dandi_files_with_bids(tmp_path: Path) -> None:
     assert asset.bids_dataset_description is bidsdd
 
+
+def test_dandi_file_zarr_with_excluded_dotfiles(tmp_path: Path) -> None:
+    zarr_path = tmp_path / "foo.zarr"
+    mkpaths(
+        zarr_path,
+        ".git/data",
+        ".gitattributes",
+        ".dandi/somefile.txt",
+        ".datalad/",
+        "arr_0/.gitmodules",
+    )
+    with pytest.raises(UnknownAssetError):
+        dandi_file(zarr_path)
+    (zarr_path / "arr_0" / "foo").touch()
+    zf = dandi_file(zarr_path)
+    assert isinstance(zf, ZarrAsset)
+
 
 def test_validate_simple1(simple1_nwb):
     # this file should be ok as long as schema_version is specified
     errors = dandi_file(simple1_nwb).get_validation_errors(
@@ -341,3 +362,59 @@ def test_zarr_properties(tmp_path: Path) -> None:
     assert sorted(stat.files, key=attrgetter("parts")) == [
         e for e in entries if e.is_file()
     ]
+
+
+def test_upload_zarr_with_excluded_dotfiles(
+    new_dandiset: SampleDandiset, tmp_path: Path
+) -> None:
+    filepath = tmp_path / "example.zarr"
+    zarr.save(filepath, np.arange(1000), np.arange(1000, 0, -1))
+    subprocess.run(["git", "init"], cwd=str(filepath), check=True)
+    (filepath / ".dandi").mkdir()
+    (filepath / ".dandi" / "somefile.txt").write_text("Hello world!\n")
+    (filepath / ".gitattributes").write_text("* eol=lf\n")
+    (filepath / "arr_0" / ".gitmodules").write_text("# Empty\n")
+    (filepath / "arr_1" / ".datalad").mkdir()
+    (filepath / "arr_1" / ".datalad" / "config").write_text("# Empty\n")
+    zf = dandi_file(filepath)
+    assert isinstance(zf, ZarrAsset)
+    asset = zf.upload(new_dandiset.dandiset, {})
+    assert isinstance(asset, RemoteZarrAsset)
+    local_entries = sorted(zf.iterfiles(include_dirs=True), key=attrgetter("parts"))
+    assert [str(e) for e in local_entries] == [
+        ".zgroup",
+        "arr_0",
+        "arr_0/.zarray",
+        "arr_0/0",
+        "arr_1",
+        "arr_1/.zarray",
+        "arr_1/0",
+    ]
+    remote_entries = sorted(asset.iterfiles(include_dirs=True), key=attrgetter("parts"))
+    assert [str(e) for e in remote_entries] == [
+        ".zgroup",
+        "arr_0",
+        "arr_0/.zarray",
+        "arr_0/0",
+        "arr_1",
+        "arr_1/.zarray",
+        "arr_1/0",
+    ]
+
+
+def test_validate_deep_zarr(tmp_path: Path) -> None:
+    zarr_path = tmp_path / "foo.zarr"
+    zarr.save(zarr_path, np.arange(1000), np.arange(1000, 0, -1))
+    mkpaths(zarr_path, "a/b/c/d/e/f/g.txt")
+    zf = dandi_file(zarr_path)
+    assert zf.get_validation_errors() == []
+    mkpaths(zarr_path, "a/b/c/d/e/f/g/h.txt")
+    assert [e.id for e in zf.get_validation_errors()] == ["zarr.tree_depth_exceeded"]
+
+
+def test_validate_zarr_deep_via_excluded_dotfiles(tmp_path: Path) -> None:
+    zarr_path = tmp_path / "foo.zarr"
+    zarr.save(zarr_path, np.arange(1000), np.arange(1000, 0, -1))
+    mkpaths(zarr_path, ".git/a/b/c/d/e/f/g.txt", "a/b/c/.git/d/e/f/g.txt")
+    zf = dandi_file(zarr_path)
+    assert zf.get_validation_errors() == []
diff --git a/dandi/utils.py b/dandi/utils.py
index 616d54373..23ec7e4c1 100644
--- a/dandi/utils.py
+++ b/dandi/utils.py
@@ -356,7 +356,9 @@ def good_file(path: str) -> bool:
             yield p
 
 
-def list_paths(dirpath: Union[str, Path], dirs: bool = False) -> List[Path]:
+def list_paths(
+    dirpath: Union[str, Path], dirs: bool = False, exclude_vcs: bool = True
+) -> List[Path]:
     return sorted(
         map(
             Path,
@@ -366,6 +368,7 @@ def list_paths(dirpath: Union[str, Path], dirs: bool = False) -> List[Path]:
                 dirs=dirs,
                 exclude_dotfiles=False,
                 exclude_dotdirs=False,
+                exclude_vcs=exclude_vcs,
             ),
         )
     )
@@ -788,3 +791,11 @@ def is_page2_url(page1: str, page2: str) -> bool:
     bits2 = urlparse(page2)
     params2 = parse_qs(bits2.query)
     return (bits1[:3], params1, bits1.fragment) == (bits2[:3], params2, bits2.fragment)
+
+
+def exclude_from_zarr(path: Path) -> bool:
+    """
+    Returns `True` if the ``path`` is a file or directory that should be
+    excluded from consideration when located in a Zarr
+    """
+    return path.name in (".dandi", ".datalad", ".git", ".gitattributes", ".gitmodules")
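A minimal usage sketch (not part of the patch) of the new exclude_from_zarr()
helper together with the exclude= hook added to threaded_walk(); the local
"sample.zarr" path below is only a hypothetical example:

    from pathlib import Path

    from dandi.support.threaded_walk import threaded_walk
    from dandi.utils import exclude_from_zarr

    # exclude_from_zarr() is a pure name check on the final path component.
    assert exclude_from_zarr(Path("sample.zarr/.git"))
    assert exclude_from_zarr(Path("sample.zarr/arr_0/.gitmodules"))
    assert not exclude_from_zarr(Path("sample.zarr/arr_0/.zarray"))

    # With exclude= set, threaded_walk() skips matching files and never
    # descends into matching directories, so get_zarr_checksum() digests
    # only genuine Zarr entries.
    walked = threaded_walk(
        "sample.zarr", func=lambda p: p, exclude=exclude_from_zarr
    )
    for entry in walked:
        print(entry)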