Skip to content

Commit

Permalink
ENH: respect/log separately Retry-After + support DANDI_DEVEL_AGGRESS…
Browse files Browse the repository at this point in the history
…IVE_RETRY mode

This is all to address that odd case with 000026 where connection keeps interrupting.
Unclear why so adding more specific cases handling and allowing for such an aggressive
retrying where we would proceed as long as we are getting something (but sleep would also increase)
  • Loading branch information
yarikoptic committed Sep 27, 2024
1 parent 4d0dad3 commit bfb73c0
Showing 1 changed file with 73 additions and 15 deletions.
88 changes: 73 additions & 15 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,14 +693,18 @@ def _download_file(
# TODO: how do we discover the total size????
# TODO: do not do it in-place, but rather into some "hidden" file
resuming = False
for attempt in range(3):
attempt = 0
nattempts = 3 # number to do, could be incremented if we downloaded a little
while attempt <= nattempts:
attempt += 1
try:
if digester:
downloaded_digest = digester() # start empty
warned = False
# I wonder if we could make writing async with downloader
with DownloadDirectory(path, digests or {}) as dldir:
assert dldir.offset is not None
downloaded_in_attempt = 0
downloaded = dldir.offset
resuming = downloaded > 0
if size is not None and downloaded == size:
Expand All @@ -719,6 +723,7 @@ def _download_file(
assert downloaded_digest is not None
downloaded_digest.update(block)
downloaded += len(block)
downloaded_in_attempt += len(block)
# TODO: yield progress etc
out: dict[str, Any] = {"done": downloaded}
if size:
Expand All @@ -744,30 +749,83 @@ def _download_file(
# Catching RequestException lets us retry on timeout & connection
# errors (among others) in addition to HTTP status errors.
except requests.RequestException as exc:
sleep_amount = random.random() * 5 * attempt
if os.environ.get("DANDI_DEVEL_AGGRESSIVE_RETRY"):

Check warning on line 753 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L752-L753

Added lines #L752 - L753 were not covered by tests
# in such a case if we downloaded a little more --
# consider it a successful attempt
if downloaded_in_attempt > 0:
lgr.debug(

Check warning on line 757 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L756-L757

Added lines #L756 - L757 were not covered by tests
"%s - download failed on attempt #%d: %s, "
"but did download %d bytes, so considering "
"it a success and incrementing number of allowed attempts.",
path,
attempt,
exc,
downloaded_in_attempt,
)
nattempts += 1

Check warning on line 766 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L766

Added line #L766 was not covered by tests
# TODO: actually we should probably retry only on selected codes,
# and also respect Retry-After
if attempt >= 2 or (
exc.response is not None
and exc.response.status_code
not in (
if exc.response is not None:
if exc.response.status_code not in (

Check warning on line 769 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L768-L769

Added lines #L768 - L769 were not covered by tests
400, # Bad Request, but happened with gider:
# https://github.com/dandi/dandi-cli/issues/87
*RETRY_STATUSES,
):
lgr.debug(

Check warning on line 774 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L774

Added line #L774 was not covered by tests
"%s - download failed due to response %d: %s",
path,
exc.response.status_code,
exc,
)
yield {"status": "error", "message": str(exc)}
return
elif retry_after := exc.response.headers.get("Retry-After"):

Check warning on line 782 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L780-L782

Added lines #L780 - L782 were not covered by tests
# playing safe
if not str(retry_after).isdigit():

Check warning on line 784 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L784

Added line #L784 was not covered by tests
# our code is wrong, do not crash but issue warning so
# we might get report/fix it up
lgr.warning(

Check warning on line 787 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L787

Added line #L787 was not covered by tests
"%s - download failed due to response %d with non-integer"
" Retry-After=%r: %s",
path,
exc.response.status_code,
retry_after,
exc,
)
yield {"status": "error", "message": str(exc)}
return
sleep_amount = int(retry_after)
lgr.debug(

Check warning on line 798 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L795-L798

Added lines #L795 - L798 were not covered by tests
"%s - download failed due to response %d with "
"Retry-After=%d: %s, will sleep and retry",
path,
exc.response.status_code,
sleep_amount,
exc,
)
else:
lgr.debug("%s - download failed: %s", path, exc)
yield {"status": "error", "message": str(exc)}
return
elif attempt >= nattempts:
lgr.debug(

Check warning on line 811 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L807-L811

Added lines #L807 - L811 were not covered by tests
"%s - download failed after %d attempts: %s", path, attempt, exc
)
):
lgr.debug("%s - download failed: %s", path, exc)
yield {"status": "error", "message": str(exc)}
return
# if is_access_denied(exc) or attempt >= 2:
# raise
# sleep a little and retry
lgr.debug(
"%s - failed to download on attempt #%d: %s, will sleep a bit and retry",
path,
attempt,
exc,
)
time.sleep(random.random() * 5)
else:
lgr.debug(

Check warning on line 820 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L820

Added line #L820 was not covered by tests
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
path,
attempt,
exc,
)
time.sleep(sleep_amount)

Check warning on line 826 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L826

Added line #L826 was not covered by tests
else:
lgr.warning("downloader logic: We should not be here!")

Check warning on line 828 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L828

Added line #L828 was not covered by tests

if downloaded_digest and not resuming:
assert downloaded_digest is not None
Expand Down

0 comments on commit bfb73c0

Please sign in to comment.