Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve retrying downloads code and testing #1559

Merged
merged 20 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
4cc65c2
Simplify: a single log.debug invocation for any case which leads to s…
yarikoptic Dec 20, 2024
88f3901
Retry-After is provided by 429 (too many) so retry on that too
yarikoptic Dec 20, 2024
6bb3146
Handle Retry-After as a date, and add tests for all logic of checking…
yarikoptic Dec 20, 2024
8b3f546
Replace UTC with use of timezone.utc for compatibility
yarikoptic Jan 29, 2025
47cc585
RF: centralize retry-after parsing logic, add to other places where R…
yarikoptic Jan 29, 2025
d2d046e
Merge remote-tracking branch 'origin/master' into gh-1556
yarikoptic Feb 3, 2025
7d2a858
ENH: remove stale comment (we retry only on selected), fix typo, info…
yarikoptic Feb 3, 2025
948b7f7
RF: rename function to reflect that it would sleep as well
yarikoptic Feb 3, 2025
6c9fa51
Replace "elif" with "if" for clarity
yarikoptic Feb 3, 2025
b4b2ea1
Make code cleaner and more Pythonic (use `None`, not "-1")
yarikoptic Feb 3, 2025
3883afc
Another pythonic tuneup
yarikoptic Feb 3, 2025
48eaa2b
Use try/except instead of isdecimal test
yarikoptic Feb 3, 2025
6aeaa61
RF+BF: dedent checking of sleep_amount, do not use if_unparsable
yarikoptic Feb 3, 2025
fa60dfd
Fix type info -- only Path is allowed
yarikoptic Feb 3, 2025
0fe489a
RF: use "if" not "elif" after a statement with return in the body
yarikoptic Feb 3, 2025
b2736e4
Add tests for odd times encountered
yarikoptic Feb 3, 2025
2b43e05
Apply suggestions from code review: improved docstring, use of %s for…
yarikoptic Feb 11, 2025
9cd62a2
RF: Do not bother logging yet another debug msg on Retry-After in dow…
yarikoptic Feb 11, 2025
7d236cf
Dedent log msg to always inform on how long we would sleep for one re…
yarikoptic Feb 11, 2025
2482103
Merge branch 'master' into gh-1556
yarikoptic Feb 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dandi/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def urls(self) -> Iterator[str]:

#: HTTP response status codes that should always be retried (until we run out
#: of retries)
RETRY_STATUSES = (500, 502, 503, 504)
RETRY_STATUSES = (429, 500, 502, 503, 504)

VIDEO_FILE_EXTENSIONS = [".mp4", ".avi", ".wmv", ".mov", ".flv", ".mkv"]
VIDEO_FILE_MODULES = ["processing", "acquisition"]
Expand Down
8 changes: 8 additions & 0 deletions dandi/dandiapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
chunked,
ensure_datetime,
get_instance,
get_retry_after,
is_interactive,
is_page2_url,
joinurl,
Expand Down Expand Up @@ -236,6 +237,13 @@
)
if data is not None and hasattr(data, "seek"):
data.seek(0)
if retry_after := get_retry_after(result):
lgr.debug(

Check warning on line 241 in dandi/dandiapi.py

View check run for this annotation

Codecov / codecov/patch

dandi/dandiapi.py#L241

Added line #L241 was not covered by tests
"Sleeping for %d seconds as instructed in response "
"(in addition to tenacity imposed)",
retry_after,
)
sleep(retry_after)

Check warning on line 246 in dandi/dandiapi.py

View check run for this annotation

Codecov / codecov/patch

dandi/dandiapi.py#L246

Added line #L246 was not covered by tests
result.raise_for_status()
except Exception as e:
if isinstance(e, requests.HTTPError):
Expand Down
30 changes: 21 additions & 9 deletions dandi/dandiarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
)
from .dandiapi import BaseRemoteAsset, DandiAPIClient, RemoteDandiset
from .exceptions import FailedToConnectError, NotFoundError, UnknownURLError
from .utils import get_instance
from .utils import get_instance, get_retry_after

lgr = get_logger()

Expand Down Expand Up @@ -893,14 +893,26 @@
while True:
r = requests.head(url, allow_redirects=True)
if r.status_code in RETRY_STATUSES and i < 4:
delay = 0.1 * 10**i
lgr.warning(
"HEAD request to %s returned %d; sleeping for %f seconds and then retrying...",
url,
r.status_code,
delay,
)
sleep(delay)
retry_after = get_retry_after(r)
if retry_after is not None:
delay = retry_after

Check warning on line 898 in dandi/dandiarchive.py

View check run for this annotation

Codecov / codecov/patch

dandi/dandiarchive.py#L896-L898

Added lines #L896 - L898 were not covered by tests
else:
delay = 0.1 * 10**i
if delay:

Check warning on line 901 in dandi/dandiarchive.py

View check run for this annotation

Codecov / codecov/patch

dandi/dandiarchive.py#L900-L901

Added lines #L900 - L901 were not covered by tests
lgr.warning(
"HEAD request to %s returned %d; "
"sleeping for %f seconds and then retrying...",
url,
r.status_code,
delay,
)
sleep(delay)

Check warning on line 909 in dandi/dandiarchive.py

View check run for this annotation

Codecov / codecov/patch

dandi/dandiarchive.py#L909

Added line #L909 was not covered by tests
else:
lgr.warning(
"HEAD request to %s returned %d; retrying...",
url,
r.status_code,
)
i += 1
continue
elif r.status_code == 404:
Expand Down
83 changes: 34 additions & 49 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
ensure_datetime,
exclude_from_zarr,
flattened,
get_retry_after,
is_same_time,
path_is_subpath,
pluralize,
Expand Down Expand Up @@ -796,7 +797,7 @@
# Catching RequestException lets us retry on timeout & connection
# errors (among others) in addition to HTTP status errors.
except requests.RequestException as exc:
attempts_allowed_or_not = _check_if_more_attempts_allowed(
attempts_allowed_or_not = _check_attempts_and_sleep(

Check warning on line 800 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L800

Added line #L800 was not covered by tests
path=path,
exc=exc,
attempt=attempt,
Expand Down Expand Up @@ -1080,15 +1081,29 @@
yield {"status": "done"}


def _check_if_more_attempts_allowed(
def _check_attempts_and_sleep(
path: Path,
exc: requests.RequestException,
attempt: int,
attempts_allowed: int,
downloaded_in_attempt: int,
downloaded_in_attempt: int = 0,
) -> int | None:
"""Check if we should retry the download, return potentially adjusted 'attempts_allowed'"""
sleep_amount = random.random() * 5 * attempt
"""
Check if we should retry the download, sleep if still allowed,
and return potentially adjusted 'attempts_allowed'

:param path: Destination of the download
:param exc: Exception raised during the last download attempt
:param attempt: The index of the last download attempt
:param attempts_allowed: The number of download attempts currently allowed
:param downloaded_in_attempt: The number of bytes downloaded in the last attempt

:returns: The number of download attempts allowed, potentially adjusted, if download
should be retried. None if download should not be retried.
Note: If download should be retried, this function sleeps before returning.
otherwise, it returns immediately.
"""
sleep_amount: float | None = None
if os.environ.get("DANDI_DOWNLOAD_AGGRESSIVE_RETRY"):
# in such a case if we downloaded a little more --
# consider it a successful attempt
Expand All @@ -1104,12 +1119,11 @@
)
attempts_allowed += 1
if attempt >= attempts_allowed:
lgr.debug("%s - download failed after %d attempts: %s", path, attempt, exc)
lgr.debug("%s - download failed after %d attempts: %s", path, attempt + 1, exc)
return None
# TODO: actually we should probably retry only on selected codes,
elif exc.response is not None:
if exc.response is not None:
if exc.response.status_code not in (
400, # Bad Request, but happened with gider:
400, # Bad Request, but happened with girder:
# https://github.com/dandi/dandi-cli/issues/87
*RETRY_STATUSES,
):
Expand All @@ -1120,46 +1134,17 @@
exc,
)
return None
elif retry_after := exc.response.headers.get("Retry-After"):
# playing safe
if not str(retry_after).isdigit():
# our code is wrong, do not crash but issue warning so
# we might get report/fix it up
lgr.warning(
"%s - download failed due to response %d with non-integer"
" Retry-After=%r: %s",
path,
exc.response.status_code,
retry_after,
exc,
)
return None
sleep_amount = int(retry_after)
lgr.debug(
"%s - download failed due to response %d with "
"Retry-After=%d: %s, will sleep and retry",
path,
exc.response.status_code,
sleep_amount,
exc,
)
else:
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
path,
attempt,
exc,
)
# if is_access_denied(exc) or attempt >= 2:
# raise
# sleep a little and retry
else:
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
path,
attempt,
exc,
)
sleep_amount = get_retry_after(exc.response)
if sleep_amount is None:
# it was not Retry-after set, so we come up with random duration to sleep
sleep_amount = random.random() * 5 * attempt
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep %f and retry",
path,
attempt,
exc,
sleep_amount,
)
time.sleep(sleep_amount)
return attempts_allowed

Expand Down
123 changes: 123 additions & 0 deletions dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from collections.abc import Callable
from contextlib import nullcontext
from email.utils import parsedate_to_datetime
from functools import partial
from glob import glob
import json
import logging
Expand All @@ -12,10 +14,13 @@
import re
from shutil import rmtree
import time
from unittest import mock

import numpy as np
import pytest
from pytest_mock import MockerFixture
import requests
from requests.exceptions import HTTPError
import responses
import zarr

Expand All @@ -32,6 +37,7 @@
PathType,
ProgressCombiner,
PYOUTHelper,
_check_attempts_and_sleep,
download,
)
from ..exceptions import NotFoundError
Expand Down Expand Up @@ -1169,3 +1175,120 @@ def test_DownloadDirectory_exc(
assert dl.dirpath.exists()
assert dl.fp is None
assert dl.writefile.read_bytes() == b"456"


def test__check_attempts_and_sleep() -> None:
f = partial(_check_attempts_and_sleep, Path("some/path"))

response403 = requests.Response()
response403.status_code = 403 # no retry

response500 = requests.Response()
response500.status_code = 500

# we do retry if cause is unknown (no response)
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0

# or if some 500
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response500), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0

# do not bother if already at limit
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(), attempt=2, attempts_allowed=2) is None
mock_sleep.assert_not_called()

# do not bother if 403
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response403), attempt=1, attempts_allowed=2) is None
mock_sleep.assert_not_called()

# And in case of "Aggressive setting" when DANDI_DOWNLOAD_AGGRESSIVE_RETRY
# env var is set to 1, we retry if there was extra content downloaded
# patch env var DANDI_DOWNLOAD_AGGRESSIVE_RETRY
with mock.patch.dict(os.environ, {"DANDI_DOWNLOAD_AGGRESSIVE_RETRY": "1"}):
with mock.patch("time.sleep") as mock_sleep:
assert (
f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=0)
is None
)
mock_sleep.assert_not_called()

assert (
f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=1)
== 3
)
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0


@pytest.mark.parametrize("status_code", [429, 503])
def test__check_attempts_and_sleep_retries(status_code: int) -> None:
f = partial(_check_attempts_and_sleep, Path("some/path"))

response = requests.Response()
response.status_code = status_code

response.headers["Retry-After"] = "10"
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] == 10

response.headers["Retry-After"] = "Wed, 21 Oct 2015 07:28:00 GMT"
with mock.patch("time.sleep") as mock_sleep, mock.patch(
"dandi.utils.datetime"
) as mock_datetime:
# shifted by 2 minutes
mock_datetime.datetime.now.return_value = parsedate_to_datetime(
"Wed, 21 Oct 2015 07:26:00 GMT"
)
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] == 120

# we would still sleep some time if Retry-After is not decypherable
response.headers["Retry-After"] = "indecipherable"
with mock.patch("time.sleep") as mock_sleep:
assert (
_check_attempts_and_sleep(
Path("some/path"),
HTTPError(response=response),
attempt=1,
attempts_allowed=2,
)
== 2
)
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] > 0

# shifted by 1 year! (too long)
response.headers["Retry-After"] = "Wed, 21 Oct 2016 07:28:00 GMT"
with mock.patch("time.sleep") as mock_sleep, mock.patch(
"dandi.utils.datetime"
) as mock_datetime:
mock_datetime.datetime.now.return_value = parsedate_to_datetime(
"Wed, 21 Oct 2015 07:28:00 GMT"
)
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
# and we do sleep some time
assert mock_sleep.call_args.args[0] > 0

# in the past second (too quick)
response.headers["Retry-After"] = "Wed, 21 Oct 2015 07:27:59 GMT"
with mock.patch("time.sleep") as mock_sleep, mock.patch(
"dandi.utils.datetime"
) as mock_datetime:
mock_datetime.datetime.now.return_value = parsedate_to_datetime(
"Wed, 21 Oct 2015 07:28:00 GMT"
)
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
# and we do not sleep really
assert not mock_sleep.call_args.args[0]
Loading