
Commit

refactor: caching logic and improve error handling
Added robust error handling and logging for cache building processes. Refactored title fetching for related legislation into a reusable utility. Updated responses to provide more detailed timing and error information during cache rebuild operations.
hareshkainthdbt committed Feb 7, 2025
1 parent 826b8f3 commit 3fa8baf
Showing 5 changed files with 185 additions and 80 deletions.
78 changes: 63 additions & 15 deletions app/cache/legislation.py
@@ -70,7 +70,30 @@ def _get_text_from_element(element: Optional[ET.Element]) -> Optional[str]:
Optional[str]:
The text content of the element if it exists, otherwise None.
"""
return element.text if element is not None else None
try:
return element.text if element is not None else None
except Exception as e:
logger.error(f"error extracting text from element: {e}")
return None


def _update_result_dict(result_dict: dict, url, key: str, value):
"""
Update the result dictionary with the given key and value.
Parameters:
- result_dict: Dictionary to update.
- key: Key to update in the dictionary.
- value: Value to assign to the key in the dictionary.
Returns:
- None
"""

datum = {"key": key, "value": value}
if url not in result_dict:
result_dict[url] = []
result_dict[url].append(datum)


class Legislation:
@@ -123,7 +146,7 @@ def build_cache(self, config: SearchDocumentConfig):
logger.info("building legislation cache...")
dataset = construction_legislation_dataframe()

failed_url_fetches = []
results_dict = {} # type: ignore

# For each row, get the URL from the column named
# 'URI to Extract XML Data'
@@ -139,40 +162,57 @@ def build_cache(self, config: SearchDocumentConfig):
data = _get_url_data(config, url)

if data is None:
logger.error(
f"error fetching data from {url}. no data returned"
)
raise Exception(
f"error fetching data from {url}. no data returned"
)
_update_result_dict(results_dict, url, "FAILED", "no data returned")
if index == len(dataset) - 1:
logger.error("no more URLs to fetch. exiting the process...")
break
logger.warning("trying to fetch data from the next URL...")
continue

if data:
logger.info(f"parsing data from {url}...")
logger.debug(f"parsing data from {url}...")
root = ET.fromstring(data) # nosec BXXX

identifier = _get_text_from_element(
root.find(".//dc:identifier", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dc:identifier", "key not found" if identifier is None else identifier)


title = _get_text_from_element(
root.find(".//dc:title", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dc:title", "key not found" if title is None else title)

description = _get_text_from_element(
root.find(".//dc:description", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dc:description", "key not found" if description is None else description)

format = _get_text_from_element(
root.find(".//dc:format", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dc:format", "key not found" if format is None else format)

language = _get_text_from_element(
root.find(".//dc:language", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dc:language", "key not found" if language is None else language)

publisher = _get_text_from_element(
root.find(".//dc:publisher", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dc:publisher", "key not found" if publisher is None else publisher)

modified = _get_text_from_element(
root.find(".//dc:modified", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dc:modified", "key not found" if modified is None else modified)

valid = _get_text_from_element(
root.find(".//dct:valid", self._namespaces)
) # nosec BXXX
_update_result_dict(results_dict, url, ".//dct:valid", "key not found" if valid is None else valid)

document_json = self._to_json(
description,
@@ -187,15 +227,23 @@ def build_cache(self, config: SearchDocumentConfig):

# Insert or update the document
insert_or_update_document(document_json)

# # Sleep for a short time to avoid rate limiting
# time.sleep(0.5)
except Exception as e:
_update_result_dict(results_dict, url, "FAILED-EXCEPTION", e)

logger.error(f"error fetching data from {url}: {e}")
failed_url_fetches.append(url)
if index == len(dataset) - 1:
logger.error("no more URLs to fetch. exiting the process...")
break
logger.warning("trying to fetch data from the next URL...")
continue

total_urls = len(results_dict)
failed_urls = len([url for url in results_dict if results_dict[url] is not None])
failed_exception_urls = len([url for url in results_dict if results_dict[url] is not None and "FAILED-EXCEPTION" in results_dict[url]])
logger.info(f"total URLs: {total_urls}")
logger.warning(f"failed URLs: {failed_urls}")
logger.warning(f"failed URLs with exception: {failed_exception_urls}")

if failed_url_fetches:
logger.warning(f"failed to fetch data {len(failed_url_fetches)} legislation sources: {failed_url_fetches}")
def _to_json(
self,
description,
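For context (this sketch is not part of the diff), here is roughly how the new _update_result_dict helper accumulates per-URL entries during build_cache; the URLs and title below are hypothetical placeholders:

# Illustrative sketch only; mirrors _update_result_dict as added in this commit.
def _update_result_dict(result_dict: dict, url, key: str, value):
    datum = {"key": key, "value": value}
    if url not in result_dict:
        result_dict[url] = []
    result_dict[url].append(datum)

results_dict = {}

# A fetch that returned no data (hypothetical URL):
_update_result_dict(results_dict, "https://example.legislation.test/a/data.xml",
                    "FAILED", "no data returned")

# A successfully parsed metadata field (hypothetical URL and title):
_update_result_dict(results_dict, "https://example.legislation.test/b/data.xml",
                    ".//dc:title", "Example Regulations 2015")

# results_dict is now:
# {
#     "https://example.legislation.test/a/data.xml": [
#         {"key": "FAILED", "value": "no data returned"},
#     ],
#     "https://example.legislation.test/b/data.xml": [
#         {"key": ".//dc:title", "value": "Example Regulations 2015"},
#     ],
# }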
22 changes: 19 additions & 3 deletions app/cache/manage_cache.py
@@ -20,10 +20,26 @@ def rebuild_cache():
try:
start = time.time()
clear_all_documents()
config = SearchDocumentConfig(search_query="", timeout=1)
config = SearchDocumentConfig(search_query="", timeout=10)

legislation_start = time.time()
Legislation().build_cache(config)
legislation_end = time.time()
legislation_total = legislation_end - legislation_start

public_gateway_start = time.time()
PublicGateway().build_cache(config)
public_gateway_end = time.time()
public_gateway_total = public_gateway_end - public_gateway_start

end = time.time()
return {"message": "rebuilt cache", "duration": round(end - start, 2)}
return {
"message": "rebuilt cache",
"total duration": round(end - start, 2),
"details": {
"legislation": round(legislation_total, 2),
"public_gateway": round(public_gateway_total, 2),
},
}
except Exception as e:
return {"message": f"error clearing documents: {e}"}
return {"message": f"cache rebuild failed: {e}"}
96 changes: 36 additions & 60 deletions app/cache/public_gateway.py
@@ -5,12 +5,11 @@

import requests # type: ignore

from bs4 import BeautifulSoup

from app.search.utils.date import convert_date_string_to_obj
from app.search.utils.documents import ( # noqa: E501
generate_short_uuid,
insert_or_update_document,
update_related_legislation_titles,
)

logger = logging.getLogger(__name__)
@@ -39,43 +38,6 @@ def _build_like_conditions(field, and_terms, or_terms):
return " OR ".join([f"{field} LIKE LOWER('%{term}%')" for term in terms])


def _fetch_title_from_url(url):
"""
Fetches the title from the given URL.
Args:
url (str): The URL to fetch the title from.
Returns:
str: The title extracted from the meta tag or the page title.
"""
try:
# Ensure the URL has a schema
if not url.startswith(("http://", "https://")):
url = "https://" + url

response = requests.get(url, timeout=2)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Try to find the DC.title meta tag
title_tag = soup.find("meta", {"name": "DC.title"})
if title_tag and title_tag.get("content"):
return title_tag["content"]

# If DC.title is not found, search for pageTitle in the body
page_title = soup.select_one("#layout1 #layout2 #pageTitle")
if page_title:
return page_title.get_text(strip=True)

logger.warning(f"title not found in {url}")
except Exception as e:
logger.error(f"error fetching title from {url}: {e}")

# No title found therefore return empty string
return ""


class PublicGateway:
def __init__(self):
"""
Expand All @@ -85,8 +47,8 @@ def __init__(self):
base_url (str): The base URL of the Trade Data API.
"""
self._base_url = (
"https://data.api.trade.gov.uk/v1/datasets/fbr-regulations"
"/versions/v1.0.1/data"
"https://data.api.trade.gov.uk/v1/datasets/orp-regulations"
"/versions/latest/data"
)

def build_cache(self, config):
@@ -105,6 +67,8 @@ def build_cache(self, config):
timeout=config.timeout, # nosec BXXX
)

inserted_document_count = 1

# End time
end_time = time.time()
initial_request_system_time = end_time - start_time
@@ -121,7 +85,7 @@ def build_cache(self, config):
# Now you can use `data` as a usual Python dictionary
# Convert each row into DataResponseModel object
total_documents = len(data.get("uk_regulatory_documents"))
inserted_document_count = 1

for row in data.get("uk_regulatory_documents"):
# Start time
start_time = time.time()
@@ -159,23 +123,14 @@ def build_cache(self, config):
"related_legislation"
].split("\n")

related_legislation = []
for url in related_legislation_urls:
try:
title = _fetch_title_from_url(url)
except Exception as e:
logger.error(
f"error fetching title from {url}: {e}"
)
title = ""

related_legislation.append(
{
"url": url,
"title": title if title != "" else url,
}
)
row["related_legislation"] = related_legislation
row["related_legislation"] = [
{
"url": url.strip(),
"title": "",
}
for url in related_legislation_urls
if isinstance(url, str) and url.strip()
]

# End time
end_time = time.time()
@@ -186,8 +141,29 @@ def build_cache(self, config):
)
insert_or_update_document(row)
inserted_document_count += 1
return response.status_code, inserted_document_count
else:
logger.error(
f"error fetching data from orpd: {response.status_code}"
)
return 500, inserted_document_count

# Update titles
process_code = response.status_code
try:
logger.debug("updating related legislation titles...")
update_titles_start_time = time.time()
update_related_legislation_titles(config)
update_titles_end_time = time.time()
update_titles_system_time = (
update_titles_end_time - update_titles_start_time
)
logger.info(
f"updating related legislation titles took "
f"{update_titles_system_time} seconds"
)
except Exception as e:
logger.error(f"error updating related legislation titles: {e}")
process_code = 500

# return process_code, inserted_document_count
return process_code, inserted_document_count
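To illustrate the new related_legislation handling (again, a sketch rather than part of the diff): a newline-separated field is reshaped into url/title pairs with blank titles, which update_related_legislation_titles fills in later. The input row below is invented:

# Illustrative sketch only; the input row is invented.
row = {
    "related_legislation": (
        "www.legislation.gov.uk/uksi/2015/51\n"
        "www.legislation.gov.uk/ukpga/1974/37\n"
    )
}

related_legislation_urls = row["related_legislation"].split("\n")
row["related_legislation"] = [
    {"url": url.strip(), "title": ""}
    for url in related_legislation_urls
    if isinstance(url, str) and url.strip()
]

# row["related_legislation"] is now:
# [
#     {"url": "www.legislation.gov.uk/uksi/2015/51", "title": ""},
#     {"url": "www.legislation.gov.uk/ukpga/1974/37", "title": ""},
# ]
# The empty titles are populated afterwards by update_related_legislation_titles(config).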
68 changes: 67 additions & 1 deletion app/search/utils/documents.py
@@ -1,6 +1,10 @@
import base64
import json
import uuid

import requests # type: ignore

from bs4 import BeautifulSoup
from numpy.f2py.auxfuncs import throw_error

from app.search.models import DataResponseModel, logger
@@ -23,7 +27,69 @@ def clear_all_documents():
logger.debug("documents cleared from table")
except Exception as e:
logger.error(f"error clearing documents: {e}")
throw_error(f"error clearing documents: {e}")


def _fetch_title_from_url(config, url):
"""
Fetches the title from the given URL.
Args:
url (str): The URL to fetch the title from.
Returns:
str: The title extracted from the meta tag or the page title.
"""
try:
# Ensure the URL has a schema
if not url.startswith(("http://", "https://")):
url = "https://" + url

logger.debug(f"fetching title from {url}")
response = requests.get(url, timeout=config.timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Try to find the DC.title meta tag
title_tag = soup.find("meta", {"name": "DC.title"})
if title_tag and title_tag.get("content"):
return title_tag["content"]

# If DC.title is not found, search for pageTitle in the body
page_title = soup.select_one("#layout1 #layout2 #pageTitle")
if page_title:
return page_title.get_text(strip=True)

logger.warning(f"title not found in {url}")
except Exception as e:
logger.error(f"error fetching title from {url}: {e}")

# No title found therefore return empty string
return ""


def update_related_legislation_titles(config):
try:
documents = DataResponseModel.objects.all()

for document in documents:
related = document.related_legislation
json_compatible_string = related.replace("'", '"')
related_legislation_list = json.loads(json_compatible_string)

logger.debug(
f"related_legislation_list: {related_legislation_list}"
)

for related in related_legislation_list:
found_title = _fetch_title_from_url(config, related["url"])
logger.debug(f"found title: {found_title}")
related["title"] = found_title

document.related_legislation = json.dumps(related_legislation_list)
document.save()
except Exception as e:
logger.error(f"error updating related legislation titles: {e}")
throw_error(f"error updating related legislation titles: {e}")


def insert_or_update_document(document_json):
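Finally, a standalone sketch of the title-resolution order used by _fetch_title_from_url, exercised against an in-memory HTML snippet instead of a live request; the HTML is invented and the selectors mirror those in the diff above:

# Illustrative sketch only; exercises the same lookup order without a network call.
from bs4 import BeautifulSoup

html = (
    '<html><head><meta name="DC.title" content="Example Regulations 2010"></head>'
    '<body><div id="layout1"><div id="layout2">'
    '<h1 id="pageTitle">Fallback page title</h1>'
    '</div></div></body></html>'
)

soup = BeautifulSoup(html, "html.parser")

# 1. Prefer the DC.title meta tag.
title_tag = soup.find("meta", {"name": "DC.title"})
if title_tag and title_tag.get("content"):
    title = title_tag["content"]
else:
    # 2. Fall back to the nested #pageTitle element.
    page_title = soup.select_one("#layout1 #layout2 #pageTitle")
    # 3. Otherwise an empty string, matching _fetch_title_from_url's default.
    title = page_title.get_text(strip=True) if page_title else ""

print(title)  # -> "Example Regulations 2010"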
