diff --git a/django_app/redbox_app/redbox_core/views/document_views.py b/django_app/redbox_app/redbox_core/views/document_views.py
index 5dafde0b..ae18c267 100644
--- a/django_app/redbox_app/redbox_core/views/document_views.py
+++ b/django_app/redbox_app/redbox_core/views/document_views.py
@@ -1,4 +1,6 @@
 import logging
+import re
 import uuid
 from collections.abc import MutableSequence, Sequence
+from io import BytesIO
 from pathlib import Path
@@ -6,7 +8,7 @@
 from django.contrib.auth import get_user_model
 from django.contrib.auth.decorators import login_required
 from django.core.exceptions import FieldError, SuspiciousFileOperation, ValidationError
-from django.core.files.uploadedfile import UploadedFile
+from django.core.files.uploadedfile import InMemoryUploadedFile, UploadedFile
 from django.http import HttpRequest, HttpResponse, JsonResponse
 from django.shortcuts import get_object_or_404, redirect, render
 from django.urls import reverse
@@ -84,6 +86,10 @@ def post(self, request: HttpRequest) -> HttpResponse:
 
-        for uploaded_file in uploaded_files:
+        for index, uploaded_file in enumerate(uploaded_files):
             errors += self.validate_uploaded_file(uploaded_file)
+            if not self.is_utf8_compatible(uploaded_file):
+                # Store the converted file back into the list; rebinding the loop
+                # variable alone would leave the original upload to be ingested.
+                uploaded_files[index] = self.convert_to_utf8(uploaded_file)
 
         if not errors:
             for uploaded_file in uploaded_files:
@@ -124,6 +130,60 @@ def validate_uploaded_file(uploaded_file: UploadedFile) -> Sequence[str]:
 
         return errors
 
+    @staticmethod
+    def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
+        """Return True if the whole upload decodes as UTF-8.
+
+        The upload is read fully into memory. Its read position is rewound to the
+        start whatever the outcome, so the caller can read it again.
+        """
+        try:
+            uploaded_file.open()
+            uploaded_file.read().decode("utf-8")
+        except UnicodeDecodeError:
+            logger.info("File is incompatible. Converting...")
+            return False
+        else:
+            logger.info("File is compatible with utf-8 - ready for processing")
+            return True
+        finally:
+            uploaded_file.seek(0)
+
+    @staticmethod
+    def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
+        """Return a copy of the upload decoded as ISO-8859-1 and re-encoded as UTF-8.
+
+        Decoding as ISO-8859-1 cannot fail, but text in any other encoding comes out
+        as the wrong characters. Content containing NUL bytes is treated as binary
+        and returned unchanged, because re-encoding it would corrupt the file. If
+        anything goes wrong the original upload is returned.
+
+        NOTE(review): binary formats without NUL bytes are not detected - confirm
+        which upload types can reach this method.
+        """
+        try:
+            uploaded_file.open()
+            raw = uploaded_file.read()
+            if b"\x00" in raw:
+                logger.info("File %s looks binary - skipping UTF-8 conversion", uploaded_file)
+                uploaded_file.seek(0)
+                return uploaded_file
+            encoded = raw.decode("ISO-8859-1").encode("utf-8")
+            return InMemoryUploadedFile(
+                file=BytesIO(encoded),
+                # Only InMemoryUploadedFile has field_name; other upload types do not.
+                field_name=getattr(uploaded_file, "field_name", None),
+                name=uploaded_file.name,
+                content_type=uploaded_file.content_type,
+                # Size is a byte count; len() of the decoded text undercounts it.
+                size=len(encoded),
+                charset="utf-8",
+            )
+        except Exception:
+            # Best effort: keep the original upload rather than fail the request.
+            logger.exception("Error converting file %s to UTF-8.", uploaded_file)
+            return uploaded_file
+
     @staticmethod
     def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
         try: