Commit 16a1a9e

Prefer file extension over content headers (#74)
When determining content type, the current behaviour is to trust the
`Content-Type` header. However, we have received a number of documents
with the wrong value set for this header.

File extensions are arguably slightly more trustworthy than the content
type header, so we are opting to prefer the extension when it matches a
supported file type.
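
To illustrate the intended precedence, a minimal sketch (the names `choose_content_type` and `EXTENSION_TO_CONTENT_TYPE` are illustrative only; the committed implementation is `determine_content_type` in `src/navigator_data_ingest/base/utils.py` below, using `CONTENT_TYPE_MAPPING` from `types.py`):

# Minimal sketch of the precedence rule, assuming a simplified mapping:
# a recognised file extension wins, otherwise the Content-Type header is trusted.
EXTENSION_TO_CONTENT_TYPE = {".pdf": "application/pdf", ".html": "text/html"}

def choose_content_type(content_type_header: str, source_url: str) -> str:
    header_value = content_type_header.split(";")[0]
    extension = source_url[source_url.rindex("."):] if "." in source_url else ""
    return EXTENSION_TO_CONTENT_TYPE.get(extension, header_value)

# A mislabelled header is overridden by a supported extension:
assert choose_content_type("text/html", "https://example.com/doc.pdf") == "application/pdf"
# Without a recognised extension, the header is trusted as before:
assert choose_content_type("text/html", "https://example.com/doc") == "text/html"
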
olaughter authored Apr 23, 2024
1 parent a6bcbcb commit 16a1a9e
Showing 4 changed files with 48 additions and 2 deletions.
4 changes: 3 additions & 1 deletion src/navigator_data_ingest/base/api_client.py
@@ -11,6 +11,7 @@
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_random_exponential

from navigator_data_ingest.base.utils import determine_content_type
from navigator_data_ingest.base.types import (
MULTI_FILE_CONTENT_TYPES,
SUPPORTED_CONTENT_TYPES,
@@ -53,7 +54,8 @@ def upload_document(

    try:
        download_response = _download_from_source(session, source_url)
        content_type = download_response.headers["Content-Type"].split(";")[0]
        content_type = determine_content_type(download_response, source_url)

        # Update the result object with the detected content type
        upload_result.content_type = content_type

2 changes: 2 additions & 0 deletions src/navigator_data_ingest/base/types.py
@@ -47,6 +47,8 @@ class DocumentType(str, Enum):
    CONTENT_TYPE_HTML: ".html",
    CONTENT_TYPE_DOCX: ".docx",
}
# Reversed mapping to get content types from file extensions
CONTENT_TYPE_MAPPING = {v: k for k, v in FILE_EXTENSION_MAPPING.items()}


class Event(BaseModel): # noqa: D101
20 changes: 19 additions & 1 deletion src/navigator_data_ingest/base/utils.py
@@ -4,8 +4,9 @@
from typing import cast

from cloudpathlib import CloudPath, S3Path
from requests import Response

from navigator_data_ingest.base.types import DocumentGenerator
from navigator_data_ingest.base.types import DocumentGenerator, CONTENT_TYPE_MAPPING
from cpr_data_access.pipeline_general_models import (
Update,
PipelineUpdates,
@@ -76,3 +77,20 @@ def parser_input_already_exists(
)
return True
return False


def determine_content_type(response: Response, source_url: str) -> str:
    """Use the response headers and the file extension to determine the content type.

    Args:
        response (Response): the response object from the file download
        source_url (str): the document's source URL

    Returns:
        str: the chosen content type
    """
    content_type_header = response.headers["Content-Type"].split(";")[0]
    # Prefer a supported file extension from the source URL; fall back to the header
    file_extension_start_index = source_url.rindex(".")
    file_extension = source_url[file_extension_start_index:]
    return CONTENT_TYPE_MAPPING.get(file_extension, content_type_header)
24 changes: 24 additions & 0 deletions src/navigator_data_ingest/tests/utils.py
@@ -0,0 +1,24 @@
from requests import Response
import pytest

from navigator_data_ingest.base.types import CONTENT_TYPE_HTML, CONTENT_TYPE_PDF
from navigator_data_ingest.base.utils import determine_content_type


@pytest.mark.parametrize(
    ("content_type", "source_url", "want"),
    (
        ["text/html", "https://aweb.site/file", CONTENT_TYPE_HTML],
        ["text/html", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
        ["application/pdf", "https://aweb.site/file", CONTENT_TYPE_PDF],
        ["application/pdf", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
        ["", "https://aweb.site/file.pdf", CONTENT_TYPE_PDF],
        ["", "https://aweb.site/file", ""],
    ),
)
def test_determine_content_type(content_type, source_url, want):
    test_response = Response()
    test_response.headers["Content-Type"] = content_type

    got = determine_content_type(test_response, source_url)
    assert got == want
