From 3fa8baf9ee1556f613ccf0290ef8a0a1c87e49f7 Mon Sep 17 00:00:00 2001
From: Haresh Kainth
Date: Fri, 7 Feb 2025 16:02:18 +0000
Subject: [PATCH] refactor: caching logic and improve error handling.

Added robust error handling and logging for cache building processes.
Refactored title fetching for related legislation into a reusable utility.
Updated responses to provide more detailed timing and error information
during cache rebuild operations.
---
 app/cache/legislation.py      | 78 ++++++++++++++++++++++------
 app/cache/manage_cache.py     | 22 ++++--
 app/cache/public_gateway.py   | 96 +++++++++++++----------------------
 app/search/utils/documents.py | 68 ++++++++++++++++++++++++-
 app/search/utils/search.py    |  1 -
 5 files changed, 185 insertions(+), 80 deletions(-)

diff --git a/app/cache/legislation.py b/app/cache/legislation.py
index e6f7527..c5c0254 100644
--- a/app/cache/legislation.py
+++ b/app/cache/legislation.py
@@ -70,7 +70,30 @@ def _get_text_from_element(element: Optional[ET.Element]) -> Optional[str]:
         Optional[str]: The text content of the element if it exists,
         otherwise None.
     """
-    return element.text if element is not None else None
+    try:
+        return element.text if element is not None else None
+    except Exception as e:
+        logger.error(f"error extracting text from element: {e}")
+        return None
+
+
+def _update_result_dict(result_dict: dict, url, key: str, value):
+    """
+    Update the result dictionary with the given key and value.
+
+    Parameters:
+    - result_dict: Dictionary to update.
+    - url: URL whose entry list receives the new key/value pair.
+    - key, value: Key and value to record against the given URL.
+
+    Returns:
+    - None
+    """
+
+    datum = {"key": key, "value": value}
+    if url not in result_dict:
+        result_dict[url] = []
+    result_dict[url].append(datum)
 
 
 class Legislation:
@@ -123,7 +146,7 @@ def build_cache(self, config: SearchDocumentConfig):
         logger.info("building legislation cache...")
         dataset = construction_legislation_dataframe()
 
-        failed_url_fetches = []
+        results_dict = {}  # type: ignore
 
         # For each row, get the URL from the column named
         # 'URI to Extract XML Data'
@@ -139,40 +162,57 @@ def build_cache(self, config: SearchDocumentConfig):
                 data = _get_url_data(config, url)
 
                 if data is None:
-                    logger.error(
-                        f"error fetching data from {url}. no data returned"
-                    )
-                    raise Exception(
-                        f"error fetching data from {url}. no data returned"
-                    )
+                    _update_result_dict(results_dict, url, "FAILED", "no data returned")
+                    if index == len(dataset) - 1:
+                        logger.error("no more URLs to fetch. exiting the process...")
+                        break
+                    logger.warning("trying to fetch data from the next URL...")
+                    continue
 
                 if data:
-                    logger.info(f"parsing data from {url}...")
+                    logger.debug(f"parsing data from {url}...")
                     root = ET.fromstring(data)  # nosec BXXX
+
                     identifier = _get_text_from_element(
                         root.find(".//dc:identifier", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:identifier", "key not found" if identifier is None else identifier)
+
+
                     title = _get_text_from_element(
                         root.find(".//dc:title", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:title", "key not found" if title is None else title)
+
                     description = _get_text_from_element(
                         root.find(".//dc:description", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:description", "key not found" if description is None else description)
+
                     format = _get_text_from_element(
                         root.find(".//dc:format", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:format", "key not found" if format is None else format)
+
                     language = _get_text_from_element(
                         root.find(".//dc:language", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:language", "key not found" if language is None else language)
+
                     publisher = _get_text_from_element(
                         root.find(".//dc:publisher", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:publisher", "key not found" if publisher is None else publisher)
+
                     modified = _get_text_from_element(
                         root.find(".//dc:modified", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dc:modified", "key not found" if modified is None else modified)
+
                     valid = _get_text_from_element(
                         root.find(".//dct:valid", self._namespaces)
                     )  # nosec BXXX
+                    _update_result_dict(results_dict, url, ".//dct:valid", "key not found" if valid is None else valid)
 
                     document_json = self._to_json(
                         description,
@@ -187,15 +227,23 @@ def build_cache(self, config: SearchDocumentConfig):
                     # Insert or update the document
                     insert_or_update_document(document_json)
-
-                    # # Sleep for a short time to avoid rate limiting
-                    # time.sleep(0.5)
             except Exception as e:
+                _update_result_dict(results_dict, url, "FAILED-EXCEPTION", e)
+
                 logger.error(f"error fetching data from {url}: {e}")
-                failed_url_fetches.append(url)
+                if index == len(dataset) - 1:
+                    logger.error("no more URLs to fetch. exiting the process...")
+                    break
+                logger.warning("trying to fetch data from the next URL...")
+                continue
+
+        total_urls = len(results_dict)
+        failed_urls = len([url for url in results_dict if any(d["key"].startswith("FAILED") for d in results_dict[url])])
+        failed_exception_urls = len([url for url in results_dict if any(d["key"] == "FAILED-EXCEPTION" for d in results_dict[url])])
+        logger.info(f"total URLs: {total_urls}")
+        logger.warning(f"failed URLs: {failed_urls}")
+        logger.warning(f"failed URLs with exception: {failed_exception_urls}")
 
-        if failed_url_fetches:
-            logger.warning(f"failed to fetch data {len(failed_url_fetches)} legislation sources: {failed_url_fetches}")
 
     def _to_json(
         self,
         description,
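Note on the legislation hunks above: build_cache no longer raises on the first bad URL; each lookup is recorded per URL in results_dict and summarised at the end. A minimal sketch of that bookkeeping and of how the failure counts fall out of it (the URLs and values below are invented for illustration, not taken from the real dataset):

    # Illustrative only - URLs and values are made up.
    results_dict = {
        "https://legislation.example/act-1/data.xml": [
            {"key": ".//dc:title", "value": "Example Act 1"},
        ],
        "https://legislation.example/act-2/data.xml": [
            {"key": "FAILED", "value": "no data returned"},
        ],
    }
    failed = [
        url
        for url, entries in results_dict.items()
        if any(entry["key"].startswith("FAILED") for entry in entries)
    ]
    print(f"total URLs: {len(results_dict)}, failed URLs: {len(failed)}")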
diff --git a/app/cache/manage_cache.py b/app/cache/manage_cache.py
index 0d98cdf..965e542 100644
--- a/app/cache/manage_cache.py
+++ b/app/cache/manage_cache.py
@@ -20,10 +20,26 @@ def rebuild_cache():
     try:
         start = time.time()
         clear_all_documents()
-        config = SearchDocumentConfig(search_query="", timeout=1)
+        config = SearchDocumentConfig(search_query="", timeout=10)
+
+        legislation_start = time.time()
         Legislation().build_cache(config)
+        legislation_end = time.time()
+        legislation_total = legislation_end - legislation_start
+
+        public_gateway_start = time.time()
         PublicGateway().build_cache(config)
+        public_gateway_end = time.time()
+        public_gateway_total = public_gateway_end - public_gateway_start
+
         end = time.time()
-        return {"message": "rebuilt cache", "duration": round(end - start, 2)}
+        return {
+            "message": "rebuilt cache",
+            "total duration": round(end - start, 2),
+            "details": {
+                "legislation": round(legislation_total, 2),
+                "public_gateway": round(public_gateway_total, 2),
+            },
+        }
     except Exception as e:
-        return {"message": f"error clearing documents: {e}"}
+        return {"message": f"cache rebuild failed: {e}"}
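With the per-stage timers in rebuild_cache, a successful call now reports the overall wall-clock time plus a per-stage breakdown. Roughly the shape of the returned payload (the durations below are made up):

    example_response = {
        "message": "rebuilt cache",
        "total duration": 41.27,      # end - start
        "details": {
            "legislation": 28.53,      # Legislation().build_cache(config)
            "public_gateway": 12.74,   # PublicGateway().build_cache(config)
        },
    }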
diff --git a/app/cache/public_gateway.py b/app/cache/public_gateway.py
index 842dfd6..e11f118 100644
--- a/app/cache/public_gateway.py
+++ b/app/cache/public_gateway.py
@@ -5,12 +5,11 @@
 
 import requests  # type: ignore
 
-from bs4 import BeautifulSoup
-
 from app.search.utils.date import convert_date_string_to_obj
 from app.search.utils.documents import (  # noqa: E501
     generate_short_uuid,
     insert_or_update_document,
+    update_related_legislation_titles,
 )
 
 logger = logging.getLogger(__name__)
@@ -39,43 +38,6 @@ def _build_like_conditions(field, and_terms, or_terms):
     return " OR ".join([f"{field} LIKE LOWER('%{term}%')" for term in terms])
 
 
-def _fetch_title_from_url(url):
-    """
-    Fetches the title from the given URL.
-
-    Args:
-        url (str): The URL to fetch the title from.
-
-    Returns:
-        str: The title extracted from the meta tag or the page title.
-    """
-    try:
-        # Ensure the URL has a schema
-        if not url.startswith(("http://", "https://")):
-            url = "https://" + url
-
-        response = requests.get(url, timeout=2)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, "html.parser")
-
-        # Try to find the DC.title meta tag
-        title_tag = soup.find("meta", {"name": "DC.title"})
-        if title_tag and title_tag.get("content"):
-            return title_tag["content"]
-
-        # If DC.title is not found, search for pageTitle in the body
-        page_title = soup.select_one("#layout1 #layout2 #pageTitle")
-        if page_title:
-            return page_title.get_text(strip=True)
-
-        logger.warning(f"title not found in {url}")
-    except Exception as e:
-        logger.error(f"error fetching title from {url}: {e}")
-
-    # No title found therefore return empty string
-    return ""
-
-
 class PublicGateway:
     def __init__(self):
         """
@@ -85,8 +47,8 @@ def __init__(self):
         base_url (str): The base URL of the Trade Data API.
         """
         self._base_url = (
-            "https://data.api.trade.gov.uk/v1/datasets/fbr-regulations"
-            "/versions/v1.0.1/data"
+            "https://data.api.trade.gov.uk/v1/datasets/orp-regulations"
+            "/versions/latest/data"
         )
 
     def build_cache(self, config):
@@ -105,6 +67,8 @@ def build_cache(self, config):
             timeout=config.timeout,  # nosec BXXX
         )
 
+        inserted_document_count = 1
+
        # End time
         end_time = time.time()
         initial_request_system_time = end_time - start_time
@@ -121,7 +85,7 @@ def build_cache(self, config):
             # Now you can use `data` as a usual Python dictionary
             # Convert each row into DataResponseModel object
             total_documents = len(data.get("uk_regulatory_documents"))
-            inserted_document_count = 1
+
             for row in data.get("uk_regulatory_documents"):
                 # Start time
                 start_time = time.time()
@@ -159,23 +123,14 @@ def build_cache(self, config):
                         "related_legislation"
                     ].split("\n")
 
-                    related_legislation = []
-                    for url in related_legislation_urls:
-                        try:
-                            title = _fetch_title_from_url(url)
-                        except Exception as e:
-                            logger.error(
-                                f"error fetching title from {url}: {e}"
-                            )
-                            title = ""
-
-                        related_legislation.append(
-                            {
-                                "url": url,
-                                "title": title if title != "" else url,
-                            }
-                        )
-                    row["related_legislation"] = related_legislation
+                    row["related_legislation"] = [
+                        {
+                            "url": url.strip(),
+                            "title": "",
+                        }
+                        for url in related_legislation_urls
+                        if isinstance(url, str) and url.strip()
+                    ]
 
                 # End time
                 end_time = time.time()
@@ -186,8 +141,29 @@ def build_cache(self, config):
                 )
                 insert_or_update_document(row)
                 inserted_document_count += 1
-            return response.status_code, inserted_document_count
         else:
             logger.error(
                 f"error fetching data from orpd: {response.status_code}"
             )
+            return 500, inserted_document_count
+
+        # Update titles
+        process_code = response.status_code
+        try:
+            logger.debug("updating related legislation titles...")
+            update_titles_start_time = time.time()
+            update_related_legislation_titles(config)
+            update_titles_end_time = time.time()
+            update_titles_system_time = (
+                update_titles_end_time - update_titles_start_time
+            )
+            logger.info(
+                f"updating related legislation titles took "
+                f"{update_titles_system_time} seconds"
+            )
+        except Exception as e:
+            logger.error(f"error updating related legislation titles: {e}")
+            process_code = 500
+
+        return process_code, inserted_document_count
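In the public_gateway hunks, titles are no longer scraped while rows are ingested; the newline-separated related_legislation field is turned into placeholder entries whose titles are filled in afterwards by update_related_legislation_titles. A small sketch of that split-and-strip step (the input string is a made-up example):

    # Illustrative only - the raw field value is made up.
    raw = "www.legislation.gov.uk/id/ukpga/2010/15\n www.legislation.gov.uk/id/uksi/2015/51 \n"
    placeholders = [
        {"url": url.strip(), "title": ""}
        for url in raw.split("\n")
        if isinstance(url, str) and url.strip()
    ]
    # [{'url': 'www.legislation.gov.uk/id/ukpga/2010/15', 'title': ''},
    #  {'url': 'www.legislation.gov.uk/id/uksi/2015/51', 'title': ''}]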
diff --git a/app/search/utils/documents.py b/app/search/utils/documents.py
index c1e2b8d..fa1bea0 100644
--- a/app/search/utils/documents.py
+++ b/app/search/utils/documents.py
@@ -1,6 +1,10 @@
 import base64
+import json
 import uuid
 
+import requests  # type: ignore
+
+from bs4 import BeautifulSoup
 from numpy.f2py.auxfuncs import throw_error
 
 from app.search.models import DataResponseModel, logger
@@ -23,7 +27,69 @@ def clear_all_documents():
         logger.debug("documents cleared from table")
     except Exception as e:
         logger.error(f"error clearing documents: {e}")
-        throw_error(f"error clearing documents: {e}")
+
+
+def _fetch_title_from_url(config, url):
+    """
+    Fetches the title from the given URL.
+
+    Args:
+        url (str): The URL to fetch the title from.
+
+    Returns:
+        str: The title extracted from the meta tag or the page title.
+    """
+    try:
+        # Ensure the URL has a schema
+        if not url.startswith(("http://", "https://")):
+            url = "https://" + url
+
+        logger.debug(f"fetching title from {url}")
+        response = requests.get(url, timeout=config.timeout)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Try to find the DC.title meta tag
+        title_tag = soup.find("meta", {"name": "DC.title"})
+        if title_tag and title_tag.get("content"):
+            return title_tag["content"]
+
+        # If DC.title is not found, search for pageTitle in the body
+        page_title = soup.select_one("#layout1 #layout2 #pageTitle")
+        if page_title:
+            return page_title.get_text(strip=True)
+
+        logger.warning(f"title not found in {url}")
+    except Exception as e:
+        logger.error(f"error fetching title from {url}: {e}")
+
+    # No title found therefore return empty string
+    return ""
+
+
+def update_related_legislation_titles(config):
+    try:
+        documents = DataResponseModel.objects.all()
+
+        for document in documents:
+            related = document.related_legislation
+            json_compatible_string = related.replace("'", '"')
+            related_legislation_list = json.loads(json_compatible_string)
+
+            logger.debug(
+                f"related_legislation_list: {related_legislation_list}"
+            )
+
+            for related in related_legislation_list:
+                found_title = _fetch_title_from_url(config, related["url"])
+                logger.debug(f"found title: {found_title}")
+                related["title"] = found_title
+
+            document.related_legislation = json.dumps(related_legislation_list)
+            document.save()
+    except Exception as e:
+        logger.error(f"error updating related legislation titles: {e}")
+        throw_error(f"error updating related legislation titles: {e}")
 
 
 def insert_or_update_document(document_json):
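update_related_legislation_titles round-trips the stored related_legislation text through json.loads and json.dumps after swapping single quotes for double quotes. Something like the following, where the stored value is a made-up example; a title containing an apostrophe would defeat the naive quote replacement, so treat this as a sketch of the happy path only:

    import json

    # Illustrative only - the stored value is made up.
    stored = "[{'url': 'www.legislation.gov.uk/id/ukpga/2010/15', 'title': ''}]"
    entries = json.loads(stored.replace("'", '"'))
    for entry in entries:
        # the real code calls _fetch_title_from_url(config, entry["url"]) here
        entry["title"] = "Example Act 2010"
    stored = json.dumps(entries)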
diff --git a/app/search/utils/search.py b/app/search/utils/search.py
index 6322812..6ce0fcd 100644
--- a/app/search/utils/search.py
+++ b/app/search/utils/search.py
@@ -8,7 +8,6 @@
 
 from django.contrib.postgres.search import (
     SearchQuery,
-    SearchRank,
     SearchVector,
 )  # noqa
 from django.db.models import F, Func, Q, QuerySet