diff --git a/src/navigator_data_ingest/base/api_client.py b/src/navigator_data_ingest/base/api_client.py
index 42e6618..cb8f494 100644
--- a/src/navigator_data_ingest/base/api_client.py
+++ b/src/navigator_data_ingest/base/api_client.py
@@ -11,6 +11,7 @@
 from tenacity.stop import stop_after_attempt
 from tenacity.wait import wait_random_exponential
 
+from navigator_data_ingest.base.utils import determine_content_type
 from navigator_data_ingest.base.types import (
     MULTI_FILE_CONTENT_TYPES,
     SUPPORTED_CONTENT_TYPES,
@@ -53,7 +54,8 @@ def upload_document(
 
     try:
         download_response = _download_from_source(session, source_url)
-        content_type = download_response.headers["Content-Type"].split(";")[0]
+        content_type = determine_content_type(download_response, source_url)
+
         # Update the result object with the detected content type
         upload_result.content_type = content_type
 
diff --git a/src/navigator_data_ingest/base/types.py b/src/navigator_data_ingest/base/types.py
index 2903b53..34b331c 100644
--- a/src/navigator_data_ingest/base/types.py
+++ b/src/navigator_data_ingest/base/types.py
@@ -47,6 +47,8 @@ class DocumentType(str, Enum):
     CONTENT_TYPE_HTML: ".html",
     CONTENT_TYPE_DOCX: ".docx",
 }
+# Reversed mapping to get content types from file extensions
+CONTENT_TYPE_MAPPING = {v: k for k, v in FILE_EXTENSION_MAPPING.items()}
 
 
 class Event(BaseModel):  # noqa: D101
diff --git a/src/navigator_data_ingest/base/utils.py b/src/navigator_data_ingest/base/utils.py
index b5d8108..b484443 100644
--- a/src/navigator_data_ingest/base/utils.py
+++ b/src/navigator_data_ingest/base/utils.py
@@ -4,8 +4,10 @@
 from typing import cast
+from urllib.parse import urlsplit
 
 from cloudpathlib import CloudPath, S3Path
+from requests import Response
 
-from navigator_data_ingest.base.types import DocumentGenerator
+from navigator_data_ingest.base.types import DocumentGenerator, CONTENT_TYPE_MAPPING
 from cpr_data_access.pipeline_general_models import (
     Update,
     PipelineUpdates,
@@ -76,3 +78,35 @@ def parser_input_already_exists(
         )
         return True
     return False
+
+
+def determine_content_type(response: Response, source_url: str) -> str:
+    """Determine a document's content type from its URL and download response.
+
+    A recognised file extension on the URL path takes precedence over the
+    ``Content-Type`` response header; the header value (without parameters
+    such as ``charset``) is the fallback.
+
+    Args:
+        response (Response): the response object from the file download
+        source_url (str): The defined source url
+
+    Returns:
+        str: chosen content type; an empty string when the URL has no
+            recognised extension and the response carries no Content-Type
+    """
+
+    # A missing header should not prevent extension-based detection
+    content_type_header = (
+        response.headers.get("Content-Type", "").split(";")[0].strip()
+    )
+
+    # Inspect only the final path segment: dots in the host name, query string
+    # or fragment are not file extensions
+    file_name = urlsplit(source_url).path.rsplit("/", 1)[-1]
+    _, dot, suffix = file_name.rpartition(".")
+    if not dot:
+        return content_type_header
+
+    # Mapping keys are lower-case and include the leading dot
+    return CONTENT_TYPE_MAPPING.get(f".{suffix.lower()}", content_type_header)
diff --git a/src/navigator_data_ingest/tests/utils.py b/src/navigator_data_ingest/tests/utils.py
new file mode 100644
index 0000000..dfa21e4
--- /dev/null
+++ b/src/navigator_data_ingest/tests/utils.py
@@ -0,0 +1,38 @@
+from requests import Response
+import pytest
+
+from navigator_data_ingest.base.types import CONTENT_TYPE_HTML, CONTENT_TYPE_PDF
+from navigator_data_ingest.base.utils import determine_content_type
+
+
+@pytest.mark.parametrize(
+    ("content_type", "source_url", "want"),
+    (
+        ["text/html", "https://aweb.site/file", CONTENT_TYPE_HTML],
+        ["text/html", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
+        ["application/pdf", "https://aweb.site/file", CONTENT_TYPE_PDF],
+        ["application/pdf", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
+        ["", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
+        ["", "https://aweb.site/file", ""],
+        # Query strings and fragments are not part of the extension
+        ["text/html", "https://aweb.site/file.pdf?download=1", CONTENT_TYPE_PDF],
+        ["text/html", "https://aweb.site/file.pdf#page=2", CONTENT_TYPE_PDF],
+        # Extension matching is case-insensitive
+        ["text/html", "https://aweb.site/FILE.PDF", CONTENT_TYPE_PDF],
+        # A URL without any "." must fall back to the header, not raise
+        ["text/html", "http://localhost/file", CONTENT_TYPE_HTML],
+        # Header parameters such as charset are discarded
+        ["text/html; charset=utf-8", "https://aweb.site/file", CONTENT_TYPE_HTML],
+    ),
+)
+def test_determine_content_type(content_type, source_url, want):
+    test_response = Response()
+    test_response.headers["Content-Type"] = content_type
+
+    got = determine_content_type(test_response, source_url)
+    assert got == want
+
+
+def test_determine_content_type_without_header():
+    got = determine_content_type(Response(), "https://aweb.site/file.pdf")
+    assert got == CONTENT_TYPE_PDF