From 9eb44e6f62134dc40eac774d6ab4835a58f18569 Mon Sep 17 00:00:00 2001
From: Natasha Boyse
Date: Thu, 27 Feb 2025 12:04:38 +0000
Subject: [PATCH] wip

---
 .../redbox_core/views/document_views.py | 68 +++++++++++++++++--
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/django_app/redbox_app/redbox_core/views/document_views.py b/django_app/redbox_app/redbox_core/views/document_views.py
index 5dafde0b..f6943ab7 100644
--- a/django_app/redbox_app/redbox_core/views/document_views.py
+++ b/django_app/redbox_app/redbox_core/views/document_views.py
@@ -1,12 +1,13 @@
 import logging
 import uuid
 from collections.abc import MutableSequence, Sequence
+from io import BytesIO
 from pathlib import Path
 
 from django.contrib.auth import get_user_model
 from django.contrib.auth.decorators import login_required
 from django.core.exceptions import FieldError, SuspiciousFileOperation, ValidationError
-from django.core.files.uploadedfile import UploadedFile
+from django.core.files.uploadedfile import InMemoryUploadedFile, UploadedFile
 from django.http import HttpRequest, HttpResponse, JsonResponse
 from django.shortcuts import get_object_or_404, redirect, render
 from django.urls import reverse
@@ -82,8 +83,15 @@ def post(self, request: HttpRequest) -> HttpResponse:
         if not uploaded_files:
             errors.append("No document selected")
 
-        for uploaded_file in uploaded_files:
-            errors += self.validate_uploaded_file(uploaded_file)
+        for index, uploaded_file in enumerate(uploaded_files):
+            file_errors = self.validate_uploaded_file(uploaded_file)
+            errors += file_errors
+            # Transcode only uploads that passed validation and declare a text/*
+            # content type: a rejected file should not be read into memory, and
+            # re-encoding a binary format (PDF, DOCX, ...) as text would corrupt it.
+            is_text = not file_errors and uploaded_file.content_type.startswith("text/")
+            if is_text and not self.is_utf8_compatible(uploaded_file):
+                uploaded_files[index] = self.convert_to_utf8(uploaded_file)
 
         if not errors:
             for uploaded_file in uploaded_files:
@@ -108,22 +116,68 @@ def build_response(request: HttpRequest, errors: Sequence[str] | None = None) ->
     @staticmethod
     def validate_uploaded_file(uploaded_file: UploadedFile) -> Sequence[str]:
         errors: MutableSequence[str] = []
-
         if not uploaded_file.name:
             errors.append("File has no name")
         else:
             file_extension = Path(uploaded_file.name).suffix
             if file_extension.lower() not in APPROVED_FILE_EXTENSIONS:
                 errors.append(f"Error with {uploaded_file.name}: File type {file_extension} not supported")
-
         if not uploaded_file.content_type:
             errors.append(f"Error with {uploaded_file.name}: File has no content-type")
-
         if uploaded_file.size > MAX_FILE_SIZE:
             errors.append(f"Error with {uploaded_file.name}: File is larger than 200MB")
-
         return errors
 
+    @staticmethod
+    def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
+        """Return True when the whole upload decodes cleanly as UTF-8.
+
+        The stream is rewound afterwards on both outcomes, so the caller can
+        read the upload again from the start.
+        """
+        uploaded_file.open()
+        try:
+            uploaded_file.read().decode("utf-8")
+        except UnicodeDecodeError:
+            logger.info("File is incompatible. Converting...")
+            return False
+        finally:
+            # Rewind even when decoding fails, so a later read starts at byte 0.
+            uploaded_file.seek(0)
+        logger.info("File is compatible with utf-8 - ready for processing")
+        return True
+
+    @staticmethod
+    def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
+        """Re-encode an upload as UTF-8, treating its bytes as ISO-8859-1.
+
+        Returns a new in-memory upload holding the re-encoded content, or the
+        original upload object if anything goes wrong while building it.
+        """
+        try:
+            uploaded_file.open()
+            # ISO-8859-1 assigns a character to every byte value, so this decode
+            # cannot fail; text in another legacy encoding becomes mojibake.
+            content = uploaded_file.read().decode("ISO-8859-1")
+            new_bytes = content.encode("utf-8")
+
+            new_uploaded_file = InMemoryUploadedFile(
+                file=BytesIO(new_bytes),
+                # field_name exists on InMemoryUploadedFile only; TemporaryUploadedFile has none.
+                field_name=getattr(uploaded_file, "field_name", None),
+                name=uploaded_file.name,
+                # Keep the client's content type instead of forcing octet-stream.
+                content_type=uploaded_file.content_type,
+                size=len(new_bytes),
+                charset="utf-8",
+            )
+        except Exception:
+            logger.exception("Error converting file %s to UTF-8.", uploaded_file)
+            return uploaded_file
+        else:
+            logger.info("Conversion to UTF-8 successful")
+            return new_uploaded_file
+
     @staticmethod
     def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
         try: