From 3fa8baf9ee1556f613ccf0290ef8a0a1c87e49f7 Mon Sep 17 00:00:00 2001
From: Haresh Kainth
Date: Fri, 7 Feb 2025 16:02:18 +0000
Subject: [PATCH] refactor: caching logic and improve error handling.

Added robust error handling and logging for cache building processes.
Refactored title fetching for related legislation into a reusable utility.
Updated responses to provide more detailed timing and error information
during cache rebuild operations.
---
 app/cache/legislation.py      | 78 ++++++++++++++++++++++------
 app/cache/manage_cache.py     | 22 ++++--
 app/cache/public_gateway.py   | 96 +++++++++++++----------------------
 app/search/utils/documents.py | 68 ++++++++++++++++++++++++-
 app/search/utils/search.py    |  1 -
 5 files changed, 185 insertions(+), 80 deletions(-)

diff --git a/app/cache/legislation.py b/app/cache/legislation.py
index e6f7527..c5c0254 100644
--- a/app/cache/legislation.py
+++ b/app/cache/legislation.py
@@ -70,7 +70,30 @@ def _get_text_from_element(element: Optional[ET.Element]) -> Optional[str]:
         Optional[str]: The text content of the element if it exists,
         otherwise None.
     """
-    return element.text if element is not None else None
+    try:
+        return element.text if element is not None else None
+    except Exception as e:
+        logger.error(f"error extracting text from element: {e}")
+        return None
+
+
+def _update_result_dict(result_dict: dict, url, key: str, value):
+    """
+    Update the result dictionary with the given key and value.
+
+    Parameters:
+    - result_dict: Dictionary to update.
+    - url: URL whose entry list receives the new key/value pair.
+    - key, value: Key and value to record against the given URL.
+
+    Returns:
+    - None
+    """
+
+    datum = {"key": key, "value": value}
+    if url not in result_dict:
+        result_dict[url] = []
+    result_dict[url].append(datum)
 
 
 class Legislation:
@@ -123,7 +146,7 @@ def build_cache(self, config: SearchDocumentConfig):
         logger.info("building legislation cache...")
         dataset = construction_legislation_dataframe()
 
-        failed_url_fetches = []
+        results_dict = {}  # type: ignore
 
         # For each row, get the URL from the column named
         # 'URI to Extract XML Data'
@@ -139,40 +162,57 @@ def build_cache(self, config: SearchDocumentConfig):
                 data = _get_url_data(config, url)
 
                 if data is None:
-                    logger.error(
-                        f"error fetching data from {url}. no data returned"
-                    )
-                    raise Exception(
-                        f"error fetching data from {url}. no data returned"
-                    )
+                    _update_result_dict(results_dict, url, "FAILED", "no data returned")
+                    if index == len(dataset) - 1:
+                        logger.error("no more URLs to fetch. exiting the process...")
+                        break
+                    logger.warning("trying to fetch data from the next URL...")
+                    continue
 
                 if data:
-                    logger.info(f"parsing data from {url}...")
+                    logger.debug(f"parsing data from {url}...")
                     root = ET.fromstring(data)  # nosec BXXX
+
                     identifier = _get_text_from_element(
                         root.find(".//dc:identifier", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:identifier", "key not found" if identifier is None else identifier)
+
+
                     title = _get_text_from_element(
                         root.find(".//dc:title", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:title", "key not found" if title is None else title)
+
                     description = _get_text_from_element(
                         root.find(".//dc:description", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:description", "key not found" if description is None else description)
+
                     format = _get_text_from_element(
                         root.find(".//dc:format", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:format", "key not found" if format is None else format)
+
                     language = _get_text_from_element(
                         root.find(".//dc:language", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:language", "key not found" if language is None else language)
+
                     publisher = _get_text_from_element(
                         root.find(".//dc:publisher", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:publisher", "key not found" if publisher is None else publisher)
+
                     modified = _get_text_from_element(
                         root.find(".//dc:modified", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:modified", "key not found" if modified is None else modified)
+
                     valid = _get_text_from_element(
                         root.find(".//dct:valid", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dct:valid", "key not found" if valid is None else valid)
 
                     document_json = self._to_json(
                         description,
@@ -187,15 +227,23 @@ def build_cache(self, config: SearchDocumentConfig):
                     # Insert or update the document
                     insert_or_update_document(document_json)
-
-                    # # Sleep for a short time to avoid rate limiting
-                    # time.sleep(0.5)
             except Exception as e:
+                _update_result_dict(results_dict, url, "FAILED-EXCEPTION", e)
+
                 logger.error(f"error fetching data from {url}: {e}")
-                failed_url_fetches.append(url)
+                if index == len(dataset) - 1:
+                    logger.error("no more URLs to fetch. exiting the process...")
+                    break
+                logger.warning("trying to fetch data from the next URL...")
+                continue
+
+        total_urls = len(results_dict)
+        failed_urls = len([url for url in results_dict if any(d["key"].startswith("FAILED") for d in results_dict[url])])
+        failed_exception_urls = len([url for url in results_dict if any(d["key"] == "FAILED-EXCEPTION" for d in results_dict[url])])
+        logger.info(f"total URLs: {total_urls}")
+        logger.warning(f"failed URLs: {failed_urls}")
+        logger.warning(f"failed URLs with exception: {failed_exception_urls}")
 
-        if failed_url_fetches:
-            logger.warning(f"failed to fetch data {len(failed_url_fetches)} legislation sources: {failed_url_fetches}")
 
     def _to_json(
         self,
         description,
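Note on the legislation hunks above: build_cache no longer raises on the first bad URL; each lookup is recorded per URL in results_dict and summarised at the end. A minimal sketch of that bookkeeping and of how the failure counts fall out of it (the URLs and values below are invented for illustration, not taken from the real dataset):

    # Illustrative only - URLs and values are made up.
    results_dict = {
        "https://legislation.example/act-1/data.xml": [
            {"key": ".//dc:title", "value": "Example Act 1"},
        ],
        "https://legislation.example/act-2/data.xml": [
            {"key": "FAILED", "value": "no data returned"},
        ],
    }
    failed = [
        url
        for url, entries in results_dict.items()
        if any(entry["key"].startswith("FAILED") for entry in entries)
    ]
    print(f"total URLs: {len(results_dict)}, failed URLs: {len(failed)}")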
diff --git a/app/cache/manage_cache.py b/app/cache/manage_cache.py
index 0d98cdf..965e542 100644
--- a/app/cache/manage_cache.py
+++ b/app/cache/manage_cache.py
@@ -20,10 +20,26 @@ def rebuild_cache():
     try:
         start = time.time()
         clear_all_documents()
-        config = SearchDocumentConfig(search_query="", timeout=1)
+        config = SearchDocumentConfig(search_query="", timeout=10)
+
+        legislation_start = time.time()
         Legislation().build_cache(config)
+        legislation_end = time.time()
+        legislation_total = legislation_end - legislation_start
+
+        public_gateway_start = time.time()
         PublicGateway().build_cache(config)
+        public_gateway_end = time.time()
+        public_gateway_total = public_gateway_end - public_gateway_start
+
         end = time.time()
-        return {"message": "rebuilt cache", "duration": round(end - start, 2)}
+        return {
+            "message": "rebuilt cache",
+            "total duration": round(end - start, 2),
+            "details": {
+                "legislation": round(legislation_total, 2),
+                "public_gateway": round(public_gateway_total, 2),
+            },
+        }
     except Exception as e:
-        return {"message": f"error clearing documents: {e}"}
+        return {"message": f"cache rebuild failed: {e}"}
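With the per-stage timers in rebuild_cache, a successful call now reports the overall wall-clock time plus a per-stage breakdown. Roughly the shape of the returned payload (the durations below are made up):

    example_response = {
        "message": "rebuilt cache",
        "total duration": 41.27,      # end - start
        "details": {
            "legislation": 28.53,      # Legislation().build_cache(config)
            "public_gateway": 12.74,   # PublicGateway().build_cache(config)
        },
    }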
diff --git a/app/cache/public_gateway.py b/app/cache/public_gateway.py
index 842dfd6..e11f118 100644
--- a/app/cache/public_gateway.py
+++ b/app/cache/public_gateway.py
@@ -5,12 +5,11 @@
 
 import requests  # type: ignore
 
-from bs4 import BeautifulSoup
-
 from app.search.utils.date import convert_date_string_to_obj
 from app.search.utils.documents import (  # noqa: E501
     generate_short_uuid,
     insert_or_update_document,
+    update_related_legislation_titles,
 )
 
 logger = logging.getLogger(__name__)
@@ -39,43 +38,6 @@ def _build_like_conditions(field, and_terms, or_terms):
     return " OR ".join([f"{field} LIKE LOWER('%{term}%')" for term in terms])
 
 
-def _fetch_title_from_url(url):
-    """
-    Fetches the title from the given URL.
-
-    Args:
-        url (str): The URL to fetch the title from.
-
-    Returns:
-        str: The title extracted from the meta tag or the page title.
-    """
-    try:
-        # Ensure the URL has a schema
-        if not url.startswith(("http://", "https://")):
-            url = "https://" + url
-
-        response = requests.get(url, timeout=2)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, "html.parser")
-
-        # Try to find the DC.title meta tag
-        title_tag = soup.find("meta", {"name": "DC.title"})
-        if title_tag and title_tag.get("content"):
-            return title_tag["content"]
-
-        # If DC.title is not found, search for pageTitle in the body
-        page_title = soup.select_one("#layout1 #layout2 #pageTitle")
-        if page_title:
-            return page_title.get_text(strip=True)
-
-        logger.warning(f"title not found in {url}")
-    except Exception as e:
-        logger.error(f"error fetching title from {url}: {e}")
-
-    # No title found therefore return empty string
-    return ""
-
-
 class PublicGateway:
     def __init__(self):
         """
@@ -85,8 +47,8 @@ def __init__(self):
         base_url (str): The base URL of the Trade Data API.
         """
         self._base_url = (
-            "https://data.api.trade.gov.uk/v1/datasets/fbr-regulations"
-            "/versions/v1.0.1/data"
+            "https://data.api.trade.gov.uk/v1/datasets/orp-regulations"
+            "/versions/latest/data"
         )
 
     def build_cache(self, config):
@@ -105,6 +67,8 @@ def build_cache(self, config):
             timeout=config.timeout,  # nosec BXXX
         )
 
+        inserted_document_count = 1
+
        # End time
         end_time = time.time()
         initial_request_system_time = end_time - start_time
@@ -121,7 +85,7 @@ def build_cache(self, config):
             # Now you can use `data` as a usual Python dictionary
             # Convert each row into DataResponseModel object
             total_documents = len(data.get("uk_regulatory_documents"))
-            inserted_document_count = 1
+
             for row in data.get("uk_regulatory_documents"):
                 # Start time
                 start_time = time.time()
@@ -159,23 +123,14 @@ def build_cache(self, config):
                         "related_legislation"
                     ].split("\n")
 
-                    related_legislation = []
-                    for url in related_legislation_urls:
-                        try:
-                            title = _fetch_title_from_url(url)
-                        except Exception as e:
-                            logger.error(
-                                f"error fetching title from {url}: {e}"
-                            )
-                            title = ""
-
-                        related_legislation.append(
-                            {
-                                "url": url,
-                                "title": title if title != "" else url,
-                            }
-                        )
-                    row["related_legislation"] = related_legislation
+                    row["related_legislation"] = [
+                        {
+                            "url": url.strip(),
+                            "title": "",
+                        }
+                        for url in related_legislation_urls
+                        if isinstance(url, str) and url.strip()
+                    ]
 
                 # End time
                 end_time = time.time()
@@ -186,8 +141,29 @@ def build_cache(self, config):
                 )
                 insert_or_update_document(row)
                 inserted_document_count += 1
-            return response.status_code, inserted_document_count
         else:
             logger.error(
                 f"error fetching data from orpd: {response.status_code}"
             )
+            return 500, inserted_document_count
+
+        # Update titles
+        process_code = response.status_code
+        try:
+            logger.debug("updating related legislation titles...")
+            update_titles_start_time = time.time()
+            update_related_legislation_titles(config)
+            update_titles_end_time = time.time()
+            update_titles_system_time = (
+                update_titles_end_time - update_titles_start_time
+            )
+            logger.info(
+                f"updating related legislation titles took "
+                f"{update_titles_system_time} seconds"
+            )
+        except Exception as e:
+            logger.error(f"error updating related legislation titles: {e}")
+            process_code = 500
+
+        return process_code, inserted_document_count
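In the public_gateway hunks, titles are no longer scraped while rows are ingested; the newline-separated related_legislation field is turned into placeholder entries whose titles are filled in afterwards by update_related_legislation_titles. A small sketch of that split-and-strip step (the input string is a made-up example):

    # Illustrative only - the raw field value is made up.
    raw = "www.legislation.gov.uk/id/ukpga/2010/15\n www.legislation.gov.uk/id/uksi/2015/51 \n"
    placeholders = [
        {"url": url.strip(), "title": ""}
        for url in raw.split("\n")
        if isinstance(url, str) and url.strip()
    ]
    # [{'url': 'www.legislation.gov.uk/id/ukpga/2010/15', 'title': ''},
    #  {'url': 'www.legislation.gov.uk/id/uksi/2015/51', 'title': ''}]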
diff --git a/app/search/utils/documents.py b/app/search/utils/documents.py
index c1e2b8d..fa1bea0 100644
--- a/app/search/utils/documents.py
+++ b/app/search/utils/documents.py
@@ -1,6 +1,10 @@
 import base64
+import json
 import uuid
 
+import requests  # type: ignore
+
+from bs4 import BeautifulSoup
 from numpy.f2py.auxfuncs import throw_error
 
 from app.search.models import DataResponseModel, logger
@@ -23,7 +27,69 @@ def clear_all_documents():
         logger.debug("documents cleared from table")
     except Exception as e:
         logger.error(f"error clearing documents: {e}")
-        throw_error(f"error clearing documents: {e}")
+
+
+def _fetch_title_from_url(config, url):
+    """
+    Fetches the title from the given URL.
+
+    Args:
+        url (str): The URL to fetch the title from.
+
+    Returns:
+        str: The title extracted from the meta tag or the page title.
+    """
+    try:
+        # Ensure the URL has a schema
+        if not url.startswith(("http://", "https://")):
+            url = "https://" + url
+
+        logger.debug(f"fetching title from {url}")
+        response = requests.get(url, timeout=config.timeout)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Try to find the DC.title meta tag
+        title_tag = soup.find("meta", {"name": "DC.title"})
+        if title_tag and title_tag.get("content"):
+            return title_tag["content"]
+
+        # If DC.title is not found, search for pageTitle in the body
+        page_title = soup.select_one("#layout1 #layout2 #pageTitle")
+        if page_title:
+            return page_title.get_text(strip=True)
+
+        logger.warning(f"title not found in {url}")
+    except Exception as e:
+        logger.error(f"error fetching title from {url}: {e}")
+
+    # No title found therefore return empty string
+    return ""
+
+
+def update_related_legislation_titles(config):
+    try:
+        documents = DataResponseModel.objects.all()
+
+        for document in documents:
+            related = document.related_legislation
+            json_compatible_string = related.replace("'", '"')
+            related_legislation_list = json.loads(json_compatible_string)
+
+            logger.debug(
+                f"related_legislation_list: {related_legislation_list}"
+            )
+
+            for related in related_legislation_list:
+                found_title = _fetch_title_from_url(config, related["url"])
+                logger.debug(f"found title: {found_title}")
+                related["title"] = found_title
+
+            document.related_legislation = json.dumps(related_legislation_list)
+            document.save()
+    except Exception as e:
+        logger.error(f"error updating related legislation titles: {e}")
+        throw_error(f"error updating related legislation titles: {e}")
 
 
 def insert_or_update_document(document_json):
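update_related_legislation_titles round-trips the stored related_legislation text through json.loads and json.dumps after swapping single quotes for double quotes. Something like the following, where the stored value is a made-up example; a title containing an apostrophe would defeat the naive quote replacement, so treat this as a sketch of the happy path only:

    import json

    # Illustrative only - the stored value is made up.
    stored = "[{'url': 'www.legislation.gov.uk/id/ukpga/2010/15', 'title': ''}]"
    entries = json.loads(stored.replace("'", '"'))
    for entry in entries:
        # the real code calls _fetch_title_from_url(config, entry["url"]) here
        entry["title"] = "Example Act 2010"
    stored = json.dumps(entries)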
diff --git a/app/search/utils/search.py b/app/search/utils/search.py
index 6322812..6ce0fcd 100644
--- a/app/search/utils/search.py
+++ b/app/search/utils/search.py
@@ -8,7 +8,6 @@
 
 from django.contrib.postgres.search import (
     SearchQuery,
-    SearchRank,
     SearchVector,
 )  # noqa
 from django.db.models import F, Func, Q, QuerySet