-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathapi.py
104 lines (85 loc) · 2.76 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""External Resources API"""
import logging
from typing import Optional
import requests
from external_resources.constants import (
RESOURCE_BROKEN_STATUS_END,
RESOURCE_BROKEN_STATUS_START,
USER_AGENT_STRING,
USER_AGENT_TIMEOUT,
WAYBACK_API_URL,
WAYBACK_CHECK_STATUS_URL,
WAYBACK_HEADERS,
)
from external_resources.exceptions import CheckFailedError
from websites.models import WebsiteContent
# Logger shared by every function in this module. getLogger() called without
# a name returns the root logger.
log = logging.getLogger()
def is_url_broken(url: Optional[str]) -> tuple[bool, Optional[int]]:
    """
    Check if provided url is broken

    A HEAD request (following redirects) is sent to the url and the status
    code of the final response is compared against the configured range of
    "broken" status codes.

    Args:
        url: The url to check. A missing (None), empty, or whitespace-only
            value is not requested at all.

    Returns:
        An (is_broken, status_code) tuple. is_broken is True when
        RESOURCE_BROKEN_STATUS_START <= status_code < RESOURCE_BROKEN_STATUS_END.
        status_code is None when no request was made.

    Raises:
        CheckFailedError: If the HEAD request raises any exception. The
            original exception is attached as the cause.
    """
    # Nothing to check. The None test also protects callers that pass a
    # value read from stored metadata, where the key may exist with a null
    # value; None has no .strip() and would otherwise raise AttributeError.
    if url is None or url.strip() == "":
        return False, None

    log.debug("Making a HEAD request for url: %s", url)
    try:
        response = requests.head(
            url,
            allow_redirects=True,
            timeout=USER_AGENT_TIMEOUT,
            headers={
                "Accept": "*/*",
                "User-Agent": USER_AGENT_STRING,
            },
        )
    except Exception as ex:
        # Deliberately broad: whatever goes wrong while making the request
        # is reported to callers as a single CheckFailedError.
        log.debug(ex)
        raise CheckFailedError from ex

    status_code = response.status_code
    is_broken = (
        RESOURCE_BROKEN_STATUS_START <= status_code < RESOURCE_BROKEN_STATUS_END
    )
    return is_broken, status_code
def is_external_url_broken(
    external_resource: WebsiteContent,
) -> tuple[bool, Optional[int]]:
    """
    Run the broken-url check against a WebsiteContent's external url.

    The url is read from the "external_url" key of the resource's metadata;
    when the key is absent an empty string is checked instead.

    Returns:
        Whatever is_url_broken returns for that url: an
        (is_broken, status_code) tuple.
    """
    return is_url_broken(external_resource.metadata.get("external_url", ""))
def make_wayback_request(url: str, params: dict, headers: dict) -> dict:
    """
    Make an API request to the Wayback Machine and return the response data.

    Args:
        url: Endpoint the POST request is sent to.
        params: Form fields sent as the request body.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body of the response. If the body is a JSON object
        with a "message" key, that message is logged as a warning.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            response has an error status. The error is logged with its
            traceback and re-raised unchanged.
    """
    try:
        response = requests.post(url, headers=headers, data=params, timeout=30)
        try:
            # Decode before checking the status so that a "message" sent
            # along with an error response still gets logged below.
            response_data = response.json()
        except ValueError:
            # The body is not JSON. If the response is an HTTP error (for
            # example an HTML error page), report the HTTP error rather than
            # the decode failure it caused; otherwise re-raise the decode
            # error as before.
            response.raise_for_status()
            raise
        # Only a JSON object can carry a "message" key; other JSON values
        # (list, string, null) are passed through untouched.
        if isinstance(response_data, dict) and "message" in response_data:
            log.warning(
                "Wayback Machine response message: %s", response_data["message"]
            )
        response.raise_for_status()
    except requests.exceptions.RequestException:
        log.exception("Error during Wayback Machine request to %s", url)
        raise
    return response_data
def submit_url_to_wayback(
    url: str,
) -> dict:
    """
    Submit the external resource URL to the Wayback Machine and
    return the response

    Args:
        url: The URL to submit.

    Returns:
        The value returned by make_wayback_request for the submission,
        i.e. the decoded JSON body of the Wayback Machine response.

    Raises:
        requests.exceptions.RequestException: Propagated from
            make_wayback_request when the request fails.
    """
    # Form fields posted to WAYBACK_API_URL.
    params = {
        "url": url,
        "skip_first_archive": "1",
    }
    return make_wayback_request(WAYBACK_API_URL, params, WAYBACK_HEADERS)
def check_wayback_jobs_status_batch(job_ids: list[str]) -> list[dict]:
    """
    Look up the status of several Wayback Machine jobs with one request.

    The job ids are joined with commas and posted as the "job_ids" form
    field to WAYBACK_CHECK_STATUS_URL. When no job ids are given, no request
    is made and an empty list is returned.
    """
    if job_ids:
        payload = {"job_ids": ",".join(job_ids)}
        return make_wayback_request(
            WAYBACK_CHECK_STATUS_URL, payload, WAYBACK_HEADERS
        )
    return []