Skip to content

Commit

Permalink
Merge pull request #1147 from dandi/gh-1146
Browse files Browse the repository at this point in the history
Exclude special dotfiles from Zarrs
  • Loading branch information
yarikoptic authored Oct 31, 2022
2 parents 48a222f + 0064071 commit b23fee4
Show file tree
Hide file tree
Showing 9 changed files with 172 additions and 13 deletions.
21 changes: 21 additions & 0 deletions dandi/cli/tests/test_digest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from pathlib import Path
import subprocess

from click.testing import CliRunner
import numpy as np
Expand Down Expand Up @@ -62,3 +63,23 @@ def test_digest_empty_zarr(tmp_path: Path) -> None:
r = runner.invoke(digest, ["--digest", "zarr-checksum", "empty.zarr"])
assert r.exit_code == 0
assert r.output == "empty.zarr: 481a2f77ab786a0f45aafd5db0971caa-0--0\n"


def test_digest_zarr_with_excluded_dotfiles():
# This test assumes that the Zarr serialization format never changes
runner = CliRunner()
with runner.isolated_filesystem():
dt = np.dtype("<i8")
zarr.save(
"sample.zarr", np.arange(1000, dtype=dt), np.arange(1000, 0, -1, dtype=dt)
)
subprocess.run(["git", "init"], cwd="sample.zarr", check=True)
os.mkdir("sample.zarr/.dandi")
Path("sample.zarr", ".dandi", "somefile.txt").touch()
Path("sample.zarr", ".gitattributes").touch()
Path("sample.zarr", "arr_0", ".gitmodules").touch()
Path("sample.zarr", "arr_1", ".datalad").mkdir()
Path("sample.zarr", "arr_1", ".datalad", "config").touch()
r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
assert r.exit_code == 0
assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"
5 changes: 4 additions & 1 deletion dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from .utils import (
abbrev_prompt,
ensure_datetime,
exclude_from_zarr,
flattened,
is_same_time,
on_windows,
Expand Down Expand Up @@ -879,7 +880,9 @@ def digest_callback(path: str, algoname: str, d: str) -> None:
d = dirs.popleft()
is_empty = True
for p in list(d.iterdir()):
if (
if exclude_from_zarr(p):
is_empty = False
elif (
p.is_file()
and p.relative_to(zarr_basepath).as_posix() not in remote_paths
):
Expand Down
10 changes: 8 additions & 2 deletions dandi/files/_private.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ class DandiFileType(Enum):
@staticmethod
def classify(path: Path) -> DandiFileType:
if path.is_dir():
if not any(path.iterdir()):
raise UnknownAssetError("Empty directories cannot be assets")
if path.suffix in ZARR_EXTENSIONS:
if is_empty_zarr(path):
raise UnknownAssetError("Empty directories cannot be Zarr assets")
return DandiFileType.ZARR
raise UnknownAssetError(
f"Directory has unrecognized suffix {path.suffix!r}"
Expand Down Expand Up @@ -95,3 +95,9 @@ def __call__(self, filepath: Path, path: str) -> DandiFile:
)
self.bids_dataset_description.dataset_files.append(df)
return df


def is_empty_zarr(path: Path) -> bool:
""":meta private:"""
zf = ZarrAsset(filepath=path, path=path.name)
return not any(zf.iterfiles())
16 changes: 10 additions & 6 deletions dandi/files/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from dandi.metadata import get_default_metadata
from dandi.misctypes import BasePath, Digest
from dandi.support.digests import get_digest, get_zarr_checksum, md5file_nocache
from dandi.utils import chunked, pluralize
from dandi.utils import chunked, exclude_from_zarr, pluralize

from .bases import LocalDirectoryAsset
from ..validate_types import Scope, Severity, ValidationOrigin, ValidationResult
Expand Down Expand Up @@ -80,6 +80,8 @@ def is_dir(self) -> bool:

def iterdir(self) -> Iterator[LocalZarrEntry]:
for p in self.filepath.iterdir():
if exclude_from_zarr(p):
continue
if p.is_dir() and not any(p.iterdir()):
# Ignore empty directories
continue
Expand Down Expand Up @@ -218,11 +220,7 @@ def get_validation_errors(
message="Zarr group is empty.",
)
]
try:
next(self.filepath.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH)))
except StopIteration:
pass
else:
if self._is_too_deep():
msg = f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep"
if devel_debug:
raise ValueError(msg)
Expand All @@ -244,6 +242,12 @@ def get_validation_errors(
schema_version=schema_version, devel_debug=devel_debug
)

def _is_too_deep(self) -> bool:
for e in self.iterfiles():
if len(e.parts) >= MAX_ZARR_DEPTH + 1:
return True
return False

def iter_upload(
self,
dandiset: RemoteDandiset,
Expand Down
4 changes: 2 additions & 2 deletions dandi/support/digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from fscacher import PersistentCache

from .threaded_walk import threaded_walk
from ..utils import auto_repr
from ..utils import auto_repr, exclude_from_zarr

lgr = logging.getLogger("dandi.support.digests")

Expand Down Expand Up @@ -124,7 +124,7 @@ def digest_file(f: Path) -> Tuple[Path, str, int]:
return (f, dgst, os.path.getsize(f))

zcc = ZCTree()
for p, digest, size in threaded_walk(path, digest_file):
for p, digest, size in threaded_walk(path, digest_file, exclude=exclude_from_zarr):
zcc.add(p.relative_to(path), digest, size)
return zcc.get_digest()

Expand Down
5 changes: 4 additions & 1 deletion dandi/support/threaded_walk.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def threaded_walk(
dirpath: Union[str, Path],
func: Optional[Callable[[Path], Any]] = None,
threads: int = 60,
exclude: Optional[Callable[[Path], Any]] = None,
) -> Iterable[Any]:
if not os.path.isdir(dirpath):
return
Expand All @@ -54,7 +55,9 @@ def worker() -> None:
break
try:
for p in path.iterdir():
if p.is_dir():
if exclude is not None and exclude(p):
log.debug("Excluding %s from traversal", p)
elif p.is_dir():
with lock:
tasks += 1
paths.append(p)
Expand Down
34 changes: 34 additions & 0 deletions dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,40 @@ def test_download_different_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset)
)


def test_download_different_zarr_onto_excluded_dotfiles(
tmp_path: Path, zarr_dandiset: SampleDandiset
) -> None:
dd = tmp_path / zarr_dandiset.dandiset_id
dd.mkdir()
zarr_path = dd / "sample.zarr"
zarr.save(zarr_path, np.eye(5))
(zarr_path / ".git").touch()
(zarr_path / ".dandi").mkdir()
(zarr_path / ".dandi" / "somefile.txt").touch()
(zarr_path / ".datalad").mkdir()
(zarr_path / ".gitattributes").touch()
(zarr_path / "arr_0").mkdir()
(zarr_path / "arr_0" / ".gitmodules").touch()
download(
zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different"
)
assert list_paths(zarr_path, dirs=True, exclude_vcs=False) == [
zarr_path / ".dandi",
zarr_path / ".dandi" / "somefile.txt",
zarr_path / ".datalad",
zarr_path / ".git",
zarr_path / ".gitattributes",
zarr_path / ".zgroup",
zarr_path / "arr_0",
zarr_path / "arr_0" / ".gitmodules",
zarr_path / "arr_0" / ".zarray",
zarr_path / "arr_0" / "0",
zarr_path / "arr_1",
zarr_path / "arr_1" / ".zarray",
zarr_path / "arr_1" / "0",
]


def test_download_different_zarr_delete_dir(
new_dandiset: SampleDandiset, tmp_path: Path
) -> None:
Expand Down
77 changes: 77 additions & 0 deletions dandi/tests/test_files.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from operator import attrgetter
from pathlib import Path
import subprocess
from typing import cast
from unittest.mock import ANY

from dandischema.models import get_schema_version
import numpy as np
import pytest
import zarr

from .fixtures import SampleDandiset
from .. import get_logger
from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
from ..dandiapi import AssetType, RemoteZarrAsset
from ..exceptions import UnknownAssetError
from ..files import (
BIDSDatasetDescriptionAsset,
DandisetMetadataFile,
Expand Down Expand Up @@ -222,6 +226,23 @@ def test_find_dandi_files_with_bids(tmp_path: Path) -> None:
assert asset.bids_dataset_description is bidsdd


def test_dandi_file_zarr_with_excluded_dotfiles(tmp_path: Path) -> None:
zarr_path = tmp_path / "foo.zarr"
mkpaths(
zarr_path,
".git/data",
".gitattributes",
".dandi/somefile.txt",
".datalad/",
"arr_0/.gitmodules",
)
with pytest.raises(UnknownAssetError):
dandi_file(zarr_path)
(zarr_path / "arr_0" / "foo").touch()
zf = dandi_file(zarr_path)
assert isinstance(zf, ZarrAsset)


def test_validate_simple1(simple1_nwb):
# this file should be ok as long as schema_version is specified
errors = dandi_file(simple1_nwb).get_validation_errors(
Expand Down Expand Up @@ -341,3 +362,59 @@ def test_zarr_properties(tmp_path: Path) -> None:
assert sorted(stat.files, key=attrgetter("parts")) == [
e for e in entries if e.is_file()
]


def test_upload_zarr_with_excluded_dotfiles(
new_dandiset: SampleDandiset, tmp_path: Path
) -> None:
filepath = tmp_path / "example.zarr"
zarr.save(filepath, np.arange(1000), np.arange(1000, 0, -1))
subprocess.run(["git", "init"], cwd=str(filepath), check=True)
(filepath / ".dandi").mkdir()
(filepath / ".dandi" / "somefile.txt").write_text("Hello world!\n")
(filepath / ".gitattributes").write_text("* eol=lf\n")
(filepath / "arr_0" / ".gitmodules").write_text("# Empty\n")
(filepath / "arr_1" / ".datalad").mkdir()
(filepath / "arr_1" / ".datalad" / "config").write_text("# Empty\n")
zf = dandi_file(filepath)
assert isinstance(zf, ZarrAsset)
asset = zf.upload(new_dandiset.dandiset, {})
assert isinstance(asset, RemoteZarrAsset)
local_entries = sorted(zf.iterfiles(include_dirs=True), key=attrgetter("parts"))
assert [str(e) for e in local_entries] == [
".zgroup",
"arr_0",
"arr_0/.zarray",
"arr_0/0",
"arr_1",
"arr_1/.zarray",
"arr_1/0",
]
remote_entries = sorted(asset.iterfiles(include_dirs=True), key=attrgetter("parts"))
assert [str(e) for e in remote_entries] == [
".zgroup",
"arr_0",
"arr_0/.zarray",
"arr_0/0",
"arr_1",
"arr_1/.zarray",
"arr_1/0",
]


def test_validate_deep_zarr(tmp_path: Path) -> None:
zarr_path = tmp_path / "foo.zarr"
zarr.save(zarr_path, np.arange(1000), np.arange(1000, 0, -1))
mkpaths(zarr_path, "a/b/c/d/e/f/g.txt")
zf = dandi_file(zarr_path)
assert zf.get_validation_errors() == []
mkpaths(zarr_path, "a/b/c/d/e/f/g/h.txt")
assert [e.id for e in zf.get_validation_errors()] == ["zarr.tree_depth_exceeded"]


def test_validate_zarr_deep_via_excluded_dotfiles(tmp_path: Path) -> None:
zarr_path = tmp_path / "foo.zarr"
zarr.save(zarr_path, np.arange(1000), np.arange(1000, 0, -1))
mkpaths(zarr_path, ".git/a/b/c/d/e/f/g.txt", "a/b/c/.git/d/e/f/g.txt")
zf = dandi_file(zarr_path)
assert zf.get_validation_errors() == []
13 changes: 12 additions & 1 deletion dandi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,9 @@ def good_file(path: str) -> bool:
yield p


def list_paths(dirpath: Union[str, Path], dirs: bool = False) -> List[Path]:
def list_paths(
dirpath: Union[str, Path], dirs: bool = False, exclude_vcs: bool = True
) -> List[Path]:
return sorted(
map(
Path,
Expand All @@ -366,6 +368,7 @@ def list_paths(dirpath: Union[str, Path], dirs: bool = False) -> List[Path]:
dirs=dirs,
exclude_dotfiles=False,
exclude_dotdirs=False,
exclude_vcs=exclude_vcs,
),
)
)
Expand Down Expand Up @@ -788,3 +791,11 @@ def is_page2_url(page1: str, page2: str) -> bool:
bits2 = urlparse(page2)
params2 = parse_qs(bits2.query)
return (bits1[:3], params1, bits1.fragment) == (bits2[:3], params2, bits2.fragment)


def exclude_from_zarr(path: Path) -> bool:
"""
Returns `True` if the ``path`` is a file or directory that should be
excluded from consideration when located in a Zarr
"""
return path.name in (".dandi", ".datalad", ".git", ".gitattributes", ".gitmodules")

0 comments on commit b23fee4

Please sign in to comment.