Commit 16a1a9e

Prefer file extension over content headers (#74)
When determining content type, the current behaviour is to trust the
`Content-Type` header. However, we have received a number of documents
with the wrong value set for this header.

File extensions are arguably slightly more trustworthy than the content
type header, so we are opting to prefer the extension when it matches a
supported file type.
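
To illustrate the intended precedence, a minimal sketch (the names `choose_content_type` and `EXTENSION_TO_CONTENT_TYPE` are illustrative only; the committed implementation is `determine_content_type` in `src/navigator_data_ingest/base/utils.py` below, using `CONTENT_TYPE_MAPPING` from `types.py`):

# Minimal sketch of the precedence rule, assuming a simplified mapping:
# a recognised file extension wins, otherwise the Content-Type header is trusted.
EXTENSION_TO_CONTENT_TYPE = {".pdf": "application/pdf", ".html": "text/html"}

def choose_content_type(content_type_header: str, source_url: str) -> str:
    header_value = content_type_header.split(";")[0]
    extension = source_url[source_url.rindex("."):] if "." in source_url else ""
    return EXTENSION_TO_CONTENT_TYPE.get(extension, header_value)

# A mislabelled header is overridden by a supported extension:
assert choose_content_type("text/html", "https://example.com/doc.pdf") == "application/pdf"
# Without a recognised extension, the header is trusted as before:
assert choose_content_type("text/html", "https://example.com/doc") == "text/html"
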
olaughter authored Apr 23, 2024
1 parent a6bcbcb commit 16a1a9e
Showing 4 changed files with 48 additions and 2 deletions.
4 changes: 3 additions & 1 deletion src/navigator_data_ingest/base/api_client.py
@@ -11,6 +11,7 @@
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_random_exponential

from navigator_data_ingest.base.utils import determine_content_type
from navigator_data_ingest.base.types import (
MULTI_FILE_CONTENT_TYPES,
SUPPORTED_CONTENT_TYPES,
@@ -53,7 +54,8 @@ def upload_document(

    try:
        download_response = _download_from_source(session, source_url)
        content_type = download_response.headers["Content-Type"].split(";")[0]
        content_type = determine_content_type(download_response, source_url)

        # Update the result object with the detected content type
        upload_result.content_type = content_type

2 changes: 2 additions & 0 deletions src/navigator_data_ingest/base/types.py
@@ -47,6 +47,8 @@ class DocumentType(str, Enum):
    CONTENT_TYPE_HTML: ".html",
    CONTENT_TYPE_DOCX: ".docx",
}
# Reversed mapping to get content types from file extensions
CONTENT_TYPE_MAPPING = {v: k for k, v in FILE_EXTENSION_MAPPING.items()}


class Event(BaseModel): # noqa: D101
20 changes: 19 additions & 1 deletion src/navigator_data_ingest/base/utils.py
@@ -4,8 +4,9 @@
from typing import cast

from cloudpathlib import CloudPath, S3Path
from requests import Response

from navigator_data_ingest.base.types import DocumentGenerator
from navigator_data_ingest.base.types import DocumentGenerator, CONTENT_TYPE_MAPPING
from cpr_data_access.pipeline_general_models import (
Update,
PipelineUpdates,
@@ -76,3 +77,20 @@ def parser_input_already_exists(
)
return True
return False


def determine_content_type(response: Response, source_url: str) -> str:
    """Use the response headers and the file extension to determine the content type.

    Args:
        response (Response): the response object from the file download
        source_url (str): the document's source URL

    Returns:
        str: the chosen content type
    """
    content_type_header = response.headers["Content-Type"].split(";")[0]
    # Prefer a supported file extension from the source URL; fall back to the header
    file_extension_start_index = source_url.rindex(".")
    file_extension = source_url[file_extension_start_index:]
    return CONTENT_TYPE_MAPPING.get(file_extension, content_type_header)
24 changes: 24 additions & 0 deletions src/navigator_data_ingest/tests/utils.py
@@ -0,0 +1,24 @@
from requests import Response
import pytest

from navigator_data_ingest.base.types import CONTENT_TYPE_HTML, CONTENT_TYPE_PDF
from navigator_data_ingest.base.utils import determine_content_type


@pytest.mark.parametrize(
    ("content_type", "source_url", "want"),
    (
        ["text/html", "https://aweb.site/file", CONTENT_TYPE_HTML],
        ["text/html", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
        ["application/pdf", "https://aweb.site/file", CONTENT_TYPE_PDF],
        ["application/pdf", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
        ["", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
        ["", "https://aweb.site/file", ""],
    ),
)
def test_determine_content_type(content_type, source_url, want):
    test_response = Response()
    test_response.headers["Content-Type"] = content_type

    got = determine_content_type(test_response, source_url)
    assert got == want
