diff --git a/dandi/cli/cmd_download.py b/dandi/cli/cmd_download.py
index 7a9ebeff7..32c1ff66d 100644
--- a/dandi/cli/cmd_download.py
+++ b/dandi/cli/cmd_download.py
@@ -100,6 +100,15 @@
     default="all",
     show_default=True,
 )
+@click.option(
+    "--preserve-tree",
+    is_flag=True,
+    help=(
+        "When downloading only part of a Dandiset, also download"
+        " `dandiset.yaml` and do not strip leading directories from asset"
+        " paths. Implies `--download all`."
+    ),
+)
 @click.option(
     "--sync", is_flag=True, help="Delete local assets that do not exist on the server"
 )
@@ -138,6 +147,7 @@ def download(
     sync: bool,
     dandi_instance: str,
     path_type: PathType,
+    preserve_tree: bool,
 ) -> None:
     # We need to import the download module rather than the download function
     # so that the tests can properly patch the function with a mock.
@@ -171,8 +181,9 @@ def download(
         format=format,
         jobs=jobs[0],
         jobs_per_zarr=jobs[1],
-        get_metadata="dandiset.yaml" in download_types,
-        get_assets="assets" in download_types,
+        get_metadata="dandiset.yaml" in download_types or preserve_tree,
+        get_assets="assets" in download_types or preserve_tree,
+        preserve_tree=preserve_tree,
         sync=sync,
         path_type=path_type,
         # develop_debug=develop_debug
diff --git a/dandi/cli/tests/test_download.py b/dandi/cli/tests/test_download.py
index 0a8131cff..4f10227c5 100644
--- a/dandi/cli/tests/test_download.py
+++ b/dandi/cli/tests/test_download.py
@@ -23,6 +23,7 @@ def test_download_defaults(mocker):
         jobs_per_zarr=None,
         get_metadata=True,
         get_assets=True,
+        preserve_tree=False,
         sync=False,
         path_type=PathType.EXACT,
     )
@@ -41,6 +42,7 @@ def test_download_all_types(mocker):
         jobs_per_zarr=None,
         get_metadata=True,
         get_assets=True,
+        preserve_tree=False,
         sync=False,
         path_type=PathType.EXACT,
     )
@@ -59,6 +61,7 @@ def test_download_metadata_only(mocker):
         jobs_per_zarr=None,
         get_metadata=True,
         get_assets=False,
+        preserve_tree=False,
         sync=False,
         path_type=PathType.EXACT,
     )
@@ -77,6 +80,7 @@ def test_download_assets_only(mocker):
         jobs_per_zarr=None,
         get_metadata=False,
         get_assets=True,
+        preserve_tree=False,
         sync=False,
         path_type=PathType.EXACT,
     )
@@ -110,6 +114,7 @@ def test_download_gui_instance_in_dandiset(mocker):
         jobs_per_zarr=None,
         get_metadata=True,
         get_assets=True,
+        preserve_tree=False,
         sync=False,
         path_type=PathType.EXACT,
     )
@@ -135,6 +140,7 @@ def test_download_api_instance_in_dandiset(mocker):
         jobs_per_zarr=None,
         get_metadata=True,
         get_assets=True,
+        preserve_tree=False,
         sync=False,
         path_type=PathType.EXACT,
     )
@@ -160,6 +166,7 @@ def test_download_url_instance_match(mocker):
         jobs_per_zarr=None,
         get_metadata=True,
         get_assets=True,
+        preserve_tree=False,
         sync=False,
         path_type=PathType.EXACT,
     )
diff --git a/dandi/dandiarchive.py b/dandi/dandiarchive.py
index 9c0efd23a..40eb5fb3f 100644
--- a/dandi/dandiarchive.py
+++ b/dandi/dandiarchive.py
@@ -183,11 +183,17 @@ def navigate(
             yield (client, dandiset, assets)
 
     @abstractmethod
-    def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
+    def get_asset_download_path(
+        self, asset: BaseRemoteAsset, preserve_tree: bool
+    ) -> str:
         """
         Returns the path (relative to the base download directory) at which
         the asset ``asset`` (assumed to have been returned by this object's
-        `get_assets()` method) should be downloaded
+        `get_assets()` method) should be downloaded.
+
+        If ``preserve_tree`` is `True`, then the download is being performed
+        with the ``--preserve-tree`` option, and the returned path should
+        not have leading directories stripped from the asset path.
 
         :meta private:
         """
@@ -231,7 +237,9 @@ def get_assets(
             assert d is not None
             yield from d.get_assets(order=order)
 
-    def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
+    def get_asset_download_path(
+        self, asset: BaseRemoteAsset, preserve_tree: bool
+    ) -> str:
         return asset.path.lstrip("/")
 
     def is_under_download_path(self, path: str) -> bool:
@@ -242,13 +250,17 @@ def is_under_download_path(self, path: str) -> bool:
 class SingleAssetURL(ParsedDandiURL):
     """Superclass for parsed URLs that refer to a single asset"""
 
-    def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
-        return posixpath.basename(asset.path.lstrip("/"))
+    def get_asset_download_path(
+        self, asset: BaseRemoteAsset, preserve_tree: bool
+    ) -> str:
+        path = asset.path.lstrip("/")
+        if preserve_tree:
+            return path
+        else:
+            return posixpath.basename(path)
 
     def is_under_download_path(self, path: str) -> bool:
-        raise TypeError(
-            f"{type(self).__name__}.is_under_download_path() should not be called"
-        )
+        return False
 
 
 @dataclass
@@ -257,8 +269,14 @@ class MultiAssetURL(ParsedDandiURL):
 
     path: str
 
-    def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
-        return multiasset_target(self.path, asset.path.lstrip("/"))
+    def get_asset_download_path(
+        self, asset: BaseRemoteAsset, preserve_tree: bool
+    ) -> str:
+        path = asset.path.lstrip("/")
+        if preserve_tree:
+            return path
+        else:
+            return multiasset_target(self.path, path)
 
     def is_under_download_path(self, path: str) -> bool:
         prefix = posixpath.dirname(self.path.strip("/"))
@@ -487,7 +505,9 @@ def get_assets(
         if strict and not any_assets:
             raise NotFoundError(f"No assets found matching glob {self.path!r}")
 
-    def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
+    def get_asset_download_path(
+        self, asset: BaseRemoteAsset, preserve_tree: bool
+    ) -> str:
         return asset.path.lstrip("/")
 
     def is_under_download_path(self, path: str) -> bool:
diff --git a/dandi/download.py b/dandi/download.py
index d1627af08..2df711c78 100644
--- a/dandi/download.py
+++ b/dandi/download.py
@@ -93,6 +93,7 @@ def download(
     jobs_per_zarr: int | None = None,
     get_metadata: bool = True,
     get_assets: bool = True,
+    preserve_tree: bool = False,
     sync: bool = False,
     path_type: PathType = PathType.EXACT,
 ) -> None:
@@ -141,6 +142,7 @@ def download(
             existing=existing,
             get_metadata=get_metadata,
             get_assets=get_assets,
+            preserve_tree=preserve_tree,
             jobs_per_zarr=jobs_per_zarr,
             on_error="yield" if format is DownloadFormat.PYOUT else "raise",
             **kw,
@@ -201,6 +203,7 @@ class Downloader:
     existing: DownloadExisting
     get_metadata: bool
     get_assets: bool
+    preserve_tree: bool
     jobs_per_zarr: int | None
     on_error: Literal["raise", "yield"]
     #: which will be set .gen to assets. Purpose is to make it possible to get
@@ -214,19 +217,24 @@ def __post_init__(self, output_dir: str | Path) -> None:
         # TODO: if we are ALREADY in a dandiset - we can validate that it is
         # the same dandiset and use that dandiset path as the one to download
         # under
-        if isinstance(self.url, DandisetURL):
+        if isinstance(self.url, DandisetURL) or (
+            self.preserve_tree and self.url.dandiset_id is not None
+        ):
             assert self.url.dandiset_id is not None
             self.output_prefix = Path(self.url.dandiset_id)
         else:
             self.output_prefix = Path()
         self.output_path = Path(output_dir, self.output_prefix)
 
+    def is_dandiset_yaml(self) -> bool:
+        return isinstance(self.url, AssetItemURL) and self.url.path == "dandiset.yaml"
+
     def download_generator(self) -> Iterator[dict]:
         """
         A generator for downloads of files, folders, or entire dandiset from
         DANDI (as identified by URL)
 
-        This function is a generator which would yield records on ongoing
+        This function is a generator which yields records on ongoing
         activities. Activities include traversal of the remote resource (DANDI
         archive), download of individual assets while yielding records (TODO:
         schema) while validating their checksums "on the fly", etc.
@@ -235,10 +243,8 @@ def download_generator(self) -> Iterator[dict]:
         with self.url.navigate(strict=True) as (client, dandiset, assets):
             if (
                 isinstance(self.url, DandisetURL)
-                or (
-                    isinstance(self.url, AssetItemURL)
-                    and self.url.path == "dandiset.yaml"
-                )
+                or self.is_dandiset_yaml()
+                or self.preserve_tree
             ) and self.get_metadata:
                 assert dandiset is not None
                 for resp in _populate_dandiset_yaml(
@@ -248,7 +254,7 @@ def download_generator(self) -> Iterator[dict]:
                         "path": str(self.output_prefix / dandiset_metadata_file),
                         **resp,
                     }
-            if isinstance(self.url, AssetItemURL):
+            if self.is_dandiset_yaml():
                 return
 
         # TODO: do analysis of assets for early detection of needed renames
@@ -262,7 +268,9 @@ def download_generator(self) -> Iterator[dict]:
             assets = self.assets_it.feed(assets)
         lock = Lock()
         for asset in assets:
-            path = self.url.get_asset_download_path(asset)
+            path = self.url.get_asset_download_path(
+                asset, preserve_tree=self.preserve_tree
+            )
             self.asset_download_paths.add(path)
             download_path = Path(self.output_path, path)
             path = str(self.output_prefix / path)
@@ -995,7 +1003,9 @@ class DownloadProgress:
 @dataclass
 class ProgressCombiner:
     zarr_size: int
-    file_qty: int | None = None  # set to specific known value whenever full sweep is complete
+    file_qty: int | None = (
+        None  # set to specific known value whenever full sweep is complete
+    )
     files: dict[str, DownloadProgress] = field(default_factory=dict)
     #: Total size of all files that were not skipped and did not error out
     #: during download
diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py
index 57f0c572a..2a8c044ce 100644
--- a/dandi/tests/test_download.py
+++ b/dandi/tests/test_download.py
@@ -172,6 +172,28 @@ def test_download_folder(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
     assert (tmp_path / "subdir2" / "coconut.txt").read_text() == "Coconut\n"
 
 
+def test_download_folder_preserve_tree(
+    text_dandiset: SampleDandiset, tmp_path: Path
+) -> None:
+    dandiset_id = text_dandiset.dandiset_id
+    download(
+        f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/",
+        tmp_path,
+        preserve_tree=True,
+    )
+    assert list_paths(tmp_path, dirs=True) == [
+        tmp_path / dandiset_id,
+        tmp_path / dandiset_id / "dandiset.yaml",
+        tmp_path / dandiset_id / "subdir2",
+        tmp_path / dandiset_id / "subdir2" / "banana.txt",
+        tmp_path / dandiset_id / "subdir2" / "coconut.txt",
+    ]
+    assert (tmp_path / dandiset_id / "subdir2" / "banana.txt").read_text() == "Banana\n"
+    assert (
+        tmp_path / dandiset_id / "subdir2" / "coconut.txt"
+    ).read_text() == "Coconut\n"
+
+
 def test_download_item(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
     dandiset_id = text_dandiset.dandiset_id
     download(
@@ -182,6 +204,26 @@ def test_download_item(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
     assert (tmp_path / "coconut.txt").read_text() == "Coconut\n"
 
 
+def test_download_item_preserve_tree(
+    text_dandiset: SampleDandiset, tmp_path: Path
+) -> None:
+    dandiset_id = text_dandiset.dandiset_id
+    download(
+        f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/coconut.txt",
+        tmp_path,
+        preserve_tree=True,
+    )
+    assert list_paths(tmp_path, dirs=True) == [
+        tmp_path / dandiset_id,
+        tmp_path / dandiset_id / "dandiset.yaml",
+        tmp_path / dandiset_id / "subdir2",
+        tmp_path / dandiset_id / "subdir2" / "coconut.txt",
+    ]
+    assert (
+        tmp_path / dandiset_id / "subdir2" / "coconut.txt"
+    ).read_text() == "Coconut\n"
+
+
 def test_download_dandiset_yaml(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
     dandiset_id = text_dandiset.dandiset_id
     download(
@@ -330,6 +372,7 @@ def test_download_metadata404(text_dandiset: SampleDandiset, tmp_path: Path) ->
         existing=DownloadExisting.ERROR,
         get_metadata=True,
         get_assets=True,
+        preserve_tree=False,
         jobs_per_zarr=None,
         on_error="raise",
     ).download_generator()
@@ -985,4 +1028,4 @@ def test_pyouthelper_time_remaining_1339():
             # once done, dont print ETA
             assert len(done) == 2
         else:
-            assert done[-1] == f"ETA: {10-i} seconds<"
+            assert done[-1] == f"ETA: {10 - i} seconds<"
diff --git a/docs/source/cmdline/download.rst b/docs/source/cmdline/download.rst
index e068194a4..b0c6a3760 100644
--- a/docs/source/cmdline/download.rst
+++ b/docs/source/cmdline/download.rst
@@ -46,6 +46,12 @@ Options
 
    Whether to interpret asset paths in URLs as exact matches or glob patterns
 
+.. option:: --preserve-tree
+
+   When downloading only part of a Dandiset, also download
+   :file:`dandiset.yaml` and do not strip leading directories from asset
+   paths. Implies ``--download all``.
+
 .. option:: --sync
 
    Delete local assets that do not exist on the server after downloading
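
Example usage (a minimal sketch, not part of the patch above): the call below mirrors the new tests, going through dandi.download.download() with the new preserve_tree flag. The instance name "dandi", Dandiset ID "000027", folder "sub-RAT123/", and output directory "out" are illustrative placeholders only.

    from dandi.download import download

    # Download a single folder of a Dandiset while keeping the Dandiset layout:
    # with preserve_tree=True (and metadata enabled, the default), files land
    # under out/000027/sub-RAT123/... plus out/000027/dandiset.yaml, instead of
    # being flattened directly into out/.
    download(
        "dandi://dandi/000027/sub-RAT123/",
        "out",
        preserve_tree=True,
    )

The same behaviour is available on the command line via the new flag, e.g.
"dandi download --preserve-tree dandi://dandi/000027/sub-RAT123/".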