Skip to content

Commit

Permalink
Exclude special dotfiles from dandi digest -d zarr-checksum
Browse files Browse the repository at this point in the history
  • Loading branch information
jwodder committed Oct 28, 2022
1 parent 1e43339 commit cd341a9
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 3 deletions.
18 changes: 18 additions & 0 deletions dandi/cli/tests/test_digest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from pathlib import Path
import subprocess

from click.testing import CliRunner
import numpy as np
Expand Down Expand Up @@ -62,3 +63,20 @@ def test_digest_empty_zarr(tmp_path: Path) -> None:
r = runner.invoke(digest, ["--digest", "zarr-checksum", "empty.zarr"])
assert r.exit_code == 0
assert r.output == "empty.zarr: 481a2f77ab786a0f45aafd5db0971caa-0--0\n"


def test_digest_zarr_with_excluded_dotfiles():
    """
    Run ``digest --digest zarr-checksum`` on a Zarr whose root also holds a
    Git repository, a ``.dandi`` directory, and a ``.gitattributes`` file,
    and compare the output against a fixed expected checksum.
    """
    # NOTE: the hard-coded checksum is only valid as long as the Zarr
    # serialization format never changes.
    cli = CliRunner()
    with cli.isolated_filesystem():
        int64_le = np.dtype("<i8")
        ascending = np.arange(1000, dtype=int64_le)
        descending = np.arange(1000, 0, -1, dtype=int64_le)
        zarr.save("sample.zarr", ascending, descending)
        # Add special dotfiles/dot-directories at the root of the Zarr
        subprocess.run(["git", "init"], cwd="sample.zarr", check=True)
        os.mkdir("sample.zarr/.dandi")
        Path("sample.zarr", ".dandi", "somefile.txt").touch()
        Path("sample.zarr", ".gitattributes").touch()
        result = cli.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
        assert result.exit_code == 0
        assert result.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"
6 changes: 4 additions & 2 deletions dandi/support/digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from fscacher import PersistentCache

from .threaded_walk import threaded_walk
from ..utils import auto_repr
from ..utils import auto_repr, exclude_from_zarr

lgr = logging.getLogger("dandi.support.digests")

Expand Down Expand Up @@ -124,7 +124,9 @@ def digest_file(f: Path) -> Tuple[Path, str, int]:
return (f, dgst, os.path.getsize(f))

zcc = ZCTree()
for p, digest, size in threaded_walk(path, digest_file):
for p, digest, size in threaded_walk(
path, digest_file, exclude=lambda p: p.parent == path and exclude_from_zarr(p)
):
zcc.add(p.relative_to(path), digest, size)
return zcc.get_digest()

Expand Down
5 changes: 4 additions & 1 deletion dandi/support/threaded_walk.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def threaded_walk(
dirpath: Union[str, Path],
func: Optional[Callable[[Path], Any]] = None,
threads: int = 60,
exclude: Optional[Callable[[Path], Any]] = None,
) -> Iterable[Any]:
if not os.path.isdir(dirpath):
return
Expand All @@ -54,7 +55,9 @@ def worker() -> None:
break
try:
for p in path.iterdir():
if p.is_dir():
if exclude is not None and exclude(p):
log.debug("Excluding %s from traversal", p)
elif p.is_dir():
with lock:
tasks += 1
paths.append(p)
Expand Down
8 changes: 8 additions & 0 deletions dandi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,3 +788,11 @@ def is_page2_url(page1: str, page2: str) -> bool:
bits2 = urlparse(page2)
params2 = parse_qs(bits2.query)
return (bits1[:3], params1, bits1.fragment) == (bits2[:3], params2, bits2.fragment)


def exclude_from_zarr(path: Path) -> bool:
    """
    Report whether the final component of ``path`` is one of the special
    dotfile/dot-directory names (``.dandi``, ``.datalad``, ``.git``,
    ``.gitattributes``, ``.gitmodules``) that are to be left out of
    consideration when found at the root of a Zarr.

    Only the name is inspected; checking that ``path`` actually sits at the
    Zarr root is left to the caller.
    """
    special_names = {".dandi", ".datalad", ".git", ".gitattributes", ".gitmodules"}
    return path.name in special_names

0 comments on commit cd341a9

Please sign in to comment.