Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add request wait #57

Merged
merged 2 commits into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Upcoming

* Added support for :ref:`alternate localised pages <sitemap-extra-localisation>` with ``hreflang``.
* If an HTTP error is encountered, the contents of the error page is logged at ``INFO`` level.
* Added optional configurable wait time to HTTP request client.

v1.0.0 (2025-01-13)
-------------------
Expand Down
424 changes: 276 additions & 148 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ requests-mock = ">=1.6.0,<2.0"
pytest = "^8.3.0"
ruff = "^0.6.1"
vcrpy = "6.0.1"
pytest-mock = "^3.14.0"

[tool.poetry.group.perf]
optional = true
Expand Down
25 changes: 25 additions & 0 deletions tests/web_client/test_requests_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,28 @@ def test_error_page_log(self, client, requests_mock, caplog):
client.get(test_url)

assert "Response content: This page is broken." in caplog.text

@pytest.fixture
def mocked_sleep(self, mocker):
    """Patch out ``time.sleep`` in the abstract-client module so tests never actually sleep."""
    sleep_patch = mocker.patch("usp.web_client.abstract_client.time.sleep")
    return sleep_patch

def test_no_request_wait(self, mocked_sleep):
    """With no wait configured, consecutive requests never sleep."""
    no_wait_client = RequestsWebClient()
    for page in ("/page1.html", "/page2.html"):
        no_wait_client.get(self.TEST_BASE_URL + page)
    mocked_sleep.assert_not_called()

def test_request_wait(self, mocked_sleep):
    """With a fixed wait of 1s, only the second request sleeps, for exactly 1s."""
    waiting_client = RequestsWebClient(wait=1)
    waiting_client.get(self.TEST_BASE_URL + "/page1.html")
    # The very first request must go out immediately.
    mocked_sleep.assert_not_called()
    waiting_client.get(self.TEST_BASE_URL + "/page2.html")
    mocked_sleep.assert_called_once_with(1)

def test_request_wait_random(self, mocked_sleep):
    """With random_wait enabled, the slept time is wait * uniform(0.5, 1.5)."""
    random_client = RequestsWebClient(wait=1, random_wait=True)
    random_client.get(self.TEST_BASE_URL + "/page1.html")
    random_client.get(self.TEST_BASE_URL + "/page2.html")
    mocked_sleep.assert_called_once()
    slept_seconds = mocked_sleep.call_args[0][0]
    # Randomised delay must stay within the documented 0.5x-1.5x band and
    # must actually have been randomised (not the bare configured value).
    assert 0.5 <= slept_seconds <= 1.5
    assert slept_seconds != 1
35 changes: 35 additions & 0 deletions usp/web_client/abstract_client.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Abstract web client class."""

import abc
import random
from http import HTTPStatus
import time
from typing import Optional

RETRYABLE_HTTP_STATUS_CODES = {
Expand Down Expand Up @@ -187,3 +189,36 @@ def set_max_response_data_length(

def get(self, url: str) -> AbstractWebClientResponse:
raise NoWebClientException


class RequestWaiter:
    """
    Manages waiting between requests.

    The first call to :meth:`wait` never sleeps, so the initial request is
    issued immediately; every subsequent call sleeps for the configured time.
    """

    def __init__(self, wait: Optional[float] = None, random_wait: bool = True):
        """
        :param wait: time to wait between requests, in seconds. ``None`` or a
            non-positive value disables waiting entirely.
        :param random_wait: if true, wait time is multiplied by a random number
            between 0.5 and 1.5.
        """
        # Normalise None and (defensively) negative values to 0 so that
        # time.sleep() is never called with a negative argument, which would
        # raise ValueError.
        self.wait_s = wait if wait is not None and wait > 0 else 0
        self.random_wait = random_wait
        self.is_first = True

    def wait(self) -> None:
        """Perform a wait if needed. Should be called before each request.

        Will skip wait if this is the first request.
        """
        if self.wait_s == 0:
            return

        if self.is_first:
            # Never delay the very first request.
            self.is_first = False
            return

        # Optionally jitter the delay by a factor in [0.5, 1.5].
        wait_f = random.uniform(0.5, 1.5) if self.random_wait else 1.0
        time.sleep(self.wait_s * wait_f)
17 changes: 15 additions & 2 deletions usp/web_client/requests_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
AbstractWebClient,
AbstractWebClientResponse,
AbstractWebClientSuccessResponse,
RequestWaiter,
WebClientErrorResponse,
RETRYABLE_HTTP_STATUS_CODES,
)
Expand Down Expand Up @@ -79,16 +80,27 @@ class RequestsWebClient(AbstractWebClient):
Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
"""

__slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"]
__slots__ = [
"__max_response_data_length",
"__timeout",
"__proxies",
"__verify",
"__waiter",
]

def __init__(self, verify=True):
def __init__(
self, verify=True, wait: Optional[float] = None, random_wait: bool = False
):
"""
:param verify: whether certificates should be verified for HTTPS requests.
:param wait: time to wait between requests, in seconds.
:param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
"""
self.__max_response_data_length = None
# Default request timeout; can be changed later via set_timeout().
self.__timeout = self.__HTTP_REQUEST_TIMEOUT
self.__proxies = {}
self.__verify = verify
# Delegate inter-request throttling to the shared RequestWaiter helper;
# its wait() is invoked at the top of get().
self.__waiter = RequestWaiter(wait, random_wait)

def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
"""Set HTTP request timeout.
Expand All @@ -115,6 +127,7 @@ def set_max_response_data_length(self, max_response_data_length: int) -> None:
self.__max_response_data_length = max_response_data_length

def get(self, url: str) -> AbstractWebClientResponse:
self.__waiter.wait()
try:
response = requests.get(
url,
Expand Down
Loading