From 783dbb4d26c85aca5cca73a24f78da015c675132 Mon Sep 17 00:00:00 2001
From: Natasha Boyse <natasha.boyse@digital.trade.gov.uk>
Date: Thu, 27 Feb 2025 12:04:38 +0000
Subject: [PATCH] feat: Intercept incompatible (not UTF-8) text on upload and
 add functionality to convert doc to docx

---
 django_app/Dockerfile                         |  10 ++
 .../redbox_core/views/document_views.py       | 140 ++++++++++++++++--
 2 files changed, 137 insertions(+), 13 deletions(-)

diff --git a/django_app/Dockerfile b/django_app/Dockerfile
index d6842e85f..7d94fdab2 100644
--- a/django_app/Dockerfile
+++ b/django_app/Dockerfile
@@ -35,6 +35,13 @@ RUN poetry bundle venv ./venv
 # app
 FROM python:3.12-slim
 
+RUN apt-get update && apt-get install -y \
+    libreoffice \
+    libreoffice-writer \
+    unoconv \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN apt-get update && apt-get install --yes libpq-dev curl > /dev/null
 
 WORKDIR /usr/src/app
@@ -55,4 +62,7 @@ EXPOSE 8090
 
 RUN chmod +x start.sh
 RUN chmod +x health.sh
+
+RUN libreoffice --version
+
 CMD ["./start.sh"]
diff --git a/django_app/redbox_app/redbox_core/views/document_views.py b/django_app/redbox_app/redbox_core/views/document_views.py
index 5dafde0b4..008c382fd 100644
--- a/django_app/redbox_app/redbox_core/views/document_views.py
+++ b/django_app/redbox_app/redbox_core/views/document_views.py
@@ -1,12 +1,16 @@
 import logging
+import subprocess
+import tempfile
+import time
 import uuid
 from collections.abc import MutableSequence, Sequence
+from io import BytesIO
 from pathlib import Path
 
 from django.contrib.auth import get_user_model
 from django.contrib.auth.decorators import login_required
-from django.core.exceptions import FieldError, SuspiciousFileOperation, ValidationError
-from django.core.files.uploadedfile import UploadedFile
+from django.core.exceptions import FieldError, ValidationError
+from django.core.files.uploadedfile import InMemoryUploadedFile, UploadedFile
 from django.http import HttpRequest, HttpResponse, JsonResponse
 from django.shortcuts import get_object_or_404, redirect, render
 from django.urls import reverse
@@ -82,8 +86,14 @@ def post(self, request: HttpRequest) -> HttpResponse:
         if not uploaded_files:
             errors.append("No document selected")
 
-        for uploaded_file in uploaded_files:
+        for index, uploaded_file in enumerate(uploaded_files):
             errors += self.validate_uploaded_file(uploaded_file)
+            # handling doc -> docx conversion
+            if self.is_doc_file(uploaded_file):
+                uploaded_files[index] = self.convert_doc_to_docx(uploaded_file)
+            # re-encode uploads that are not valid UTF-8 (NOTE(review): also matches binary formats - confirm intended)
+            elif not self.is_utf8_compatible(uploaded_file):
+                uploaded_files[index] = self.convert_to_utf8(uploaded_file)
 
         if not errors:
             for uploaded_file in uploaded_files:
@@ -108,22 +118,127 @@ def build_response(request: HttpRequest, errors: Sequence[str] | None = None) ->
     @staticmethod
     def validate_uploaded_file(uploaded_file: UploadedFile) -> Sequence[str]:
         errors: MutableSequence[str] = []
-
         if not uploaded_file.name:
             errors.append("File has no name")
         else:
             file_extension = Path(uploaded_file.name).suffix
             if file_extension.lower() not in APPROVED_FILE_EXTENSIONS:
                 errors.append(f"Error with {uploaded_file.name}: File type {file_extension} not supported")
-
         if not uploaded_file.content_type:
             errors.append(f"Error with {uploaded_file.name}: File has no content-type")
-
         if uploaded_file.size > MAX_FILE_SIZE:
             errors.append(f"Error with {uploaded_file.name}: File is larger than 200MB")
-
         return errors
 
+    @staticmethod
+    def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
+        """Return True if the whole upload decodes as UTF-8; the file is left rewound either way."""
+        raw_bytes = uploaded_file.open().read()  # Django's File.open() rewinds and returns the file
+        uploaded_file.seek(0)
+        try:
+            raw_bytes.decode("utf-8")
+        except UnicodeDecodeError:
+            logger.info("File is incompatible with utf-8. Converting...")
+            return False
+        logger.info("File is compatible with utf-8 - ready for processing")
+        return True
+
+    @staticmethod
+    def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
+        """Re-encode an upload as UTF-8, reading its bytes as ISO-8859-1; return the original on failure."""
+        try:
+            uploaded_file.open()
+            # ISO-8859-1 maps every byte to a character, so this decode never fails; it does not
+            # detect the real source encoding, it only guarantees the result is valid UTF-8.
+            new_bytes = uploaded_file.read().decode("ISO-8859-1").encode("utf-8")
+
+            new_uploaded_file = InMemoryUploadedFile(
+                file=BytesIO(new_bytes),
+                # TemporaryUploadedFile (used for large uploads) has no field_name attribute
+                field_name=getattr(uploaded_file, "field_name", None),
+                name=uploaded_file.name,
+                content_type="application/octet-stream",
+                size=len(new_bytes),
+                charset="utf-8",
+            )
+        except Exception as e:
+            logger.exception("Error converting file %s to UTF-8.", uploaded_file, exc_info=e)
+            return uploaded_file
+        else:
+            logger.info("Conversion to UTF-8 successful")
+            return new_uploaded_file
+
+    @staticmethod
+    def is_doc_file(uploaded_file: UploadedFile) -> bool:
+        return Path(uploaded_file.name or "").suffix.lower() == ".doc"  # a nameless upload is never .doc
+
+    @staticmethod
+    def convert_doc_to_docx(uploaded_file: UploadedFile) -> UploadedFile:
+        """Convert a legacy .doc upload to .docx by running headless LibreOffice.
+
+        Returns a new in-memory .docx upload named after the original. If the
+        conversion fails or yields no content, the original upload is returned,
+        rewound so that it can still be read from the start.
+        """
+        uploaded_file.seek(0)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp_input:
+            tmp_input.write(uploaded_file.read())
+        input_path = Path(tmp_input.name)
+        output_dir = input_path.parent
+        # LibreOffice names its output after the input file, with the new extension.
+        temp_output_path = input_path.with_suffix(".docx")
+        new_file = uploaded_file
+
+        try:
+            result = subprocess.run(  # noqa: S603
+                [
+                    "/usr/bin/libreoffice",
+                    "--headless",
+                    "--convert-to",
+                    "docx",
+                    str(input_path),
+                    "--outdir",
+                    str(output_dir),
+                ],
+                check=True,
+                capture_output=True,
+                cwd=output_dir,
+            )
+            logger.info("LibreOffice output: %s", result.stdout.decode())
+            logger.info("LibreOffice errors: %s", result.stderr.decode())
+
+            # NOTE(review): run() already waits for LibreOffice to exit; this pause looks redundant - confirm
+            time.sleep(1)
+            converted_content = temp_output_path.read_bytes() if temp_output_path.exists() else b""
+            logger.info("Converted file size: %d bytes", len(converted_content))
+            if not converted_content:
+                logger.error("Output file missing or empty: %s", temp_output_path)
+            else:
+                new_file = InMemoryUploadedFile(
+                    file=BytesIO(converted_content),
+                    # TemporaryUploadedFile (large uploads) has no field_name attribute
+                    field_name=getattr(uploaded_file, "field_name", None),
+                    name=Path(uploaded_file.name).with_suffix(".docx").name,
+                    content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    size=len(converted_content),
+                    charset="utf-8",
+                )
+                logger.info("doc file conversion to docx successful for %s", uploaded_file.name)
+        except Exception as e:
+            logger.exception("Error converting doc file %s to docx", uploaded_file.name, exc_info=e)
+            new_file = uploaded_file
+        finally:
+            try:
+                input_path.unlink()
+                if temp_output_path.exists():
+                    temp_output_path.unlink()
+            except Exception as cleanup_error:  # noqa: BLE001
+                logger.warning("Error cleaning up temporary files: %s", cleanup_error)
+
+        if new_file is uploaded_file:
+            uploaded_file.seek(0)  # the fallback was consumed above; hand it back rewound
+        return new_file
+
     @staticmethod
     def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
         try:
@@ -135,14 +250,13 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
             )
         except (ValueError, FieldError, ValidationError) as e:
             logger.exception("Error creating File model object for %s.", uploaded_file, exc_info=e)
-            return e.args
-        except SuspiciousFileOperation:
-            return [
-                f"Your file name is {len(uploaded_file.name)} characters long. "
-                f"The file name will need to be shortened by {len(uploaded_file.name) - 75} characters"
-            ]
+            return list(e.args)
+        except Exception as e:
+            logger.exception("Unexpected error processing %s.", uploaded_file, exc_info=e)
+            return [str(e)]
         else:
             async_task(ingest, file.id, task_name=file.unique_name, group="ingest")
+            return []
 
 
 @login_required