add logic to handle .doc errors

uktrade · Feb 28, 2025 · 0a98411 · 0a98411
1 parent e1e8ba9
commit 0a98411
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 8 deletions.
diff --git a/django_app/Dockerfile b/django_app/Dockerfile
@@ -35,6 +35,13 @@ RUN poetry bundle venv ./venv
 # app
 FROM python:3.12-slim
 
+# LibreOffice is installed in this image stage because the Django app shells
+# out to /usr/bin/libreoffice to convert uploaded .doc files to .docx.
+# NOTE(review): unoconv is not invoked by the conversion code added in this
+# change - confirm it is needed before keeping it in the image.
+RUN apt-get update && apt-get install -y \
+    libreoffice \
+    libreoffice-writer \
+    unoconv \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN apt-get update && apt-get install --yes libpq-dev curl > /dev/null
 
 WORKDIR /usr/src/app
@@ -55,4 +62,7 @@ EXPOSE 8090
 
 RUN chmod +x start.sh
 RUN chmod +x health.sh
+
+# Build-time check: this step fails the image build if the libreoffice
+# command is missing or cannot start.
+RUN libreoffice --version
+
 CMD ["./start.sh"]
diff --git a/django_app/redbox_app/redbox_core/views/document_views.py b/django_app/redbox_app/redbox_core/views/document_views.py
@@ -1,5 +1,9 @@
 import logging
 import uuid
+import os
+import subprocess
+import tempfile
+import time
 from collections.abc import MutableSequence, Sequence
 from io import BytesIO
 from pathlib import Path
@@ -85,7 +89,11 @@ def post(self, request: HttpRequest) -> HttpResponse:
 
         for index, uploaded_file in enumerate(uploaded_files):
             errors += self.validate_uploaded_file(uploaded_file)
-            if not self.is_utf8_compatible(uploaded_file):
+            # handling doc -> docx conversion
+            if self.is_doc_file(uploaded_file):
+                uploaded_files[index] = self.convert_doc_to_docx(uploaded_file)
+            # handling utf8 compatibility
+            elif not self.is_utf8_compatible(uploaded_file):
                 uploaded_files[index] = self.convert_to_utf8(uploaded_file)
 
         if not errors:
@@ -130,7 +138,7 @@ def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
             uploaded_file.read().decode("utf-8")
             uploaded_file.seek(0)
         except UnicodeDecodeError:
-            logger.info("File is incompatible. Converting...")
+            logger.info("File is incompatible with utf-8. Converting...")
             return False
         else:
             logger.info("File is compatible with utf-8 - ready for processing")
@@ -161,6 +169,71 @@ def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
             logger.info("Conversion to UTF-8 successful")
             return new_uploaded_file
 
+    @staticmethod
+    def is_doc_file(uploaded_file: UploadedFile) -> bool:
+        """Return True when the upload's file name has a ``.doc`` extension, ignoring case."""
+        extension = Path(uploaded_file.name).suffix
+        return extension.lower() == ".doc"
+
+    @staticmethod
+    def convert_doc_to_docx(uploaded_file: UploadedFile, *, timeout_seconds: float = 120.0) -> UploadedFile:
+        """Convert a legacy ``.doc`` upload to ``.docx`` using headless LibreOffice.
+
+        The upload is written into a private temporary directory, converted
+        there, and the result is wrapped in an ``InMemoryUploadedFile`` that
+        keeps the original base name with a ``.docx`` suffix.
+
+        Args:
+            uploaded_file: The uploaded ``.doc`` file.
+            timeout_seconds: Maximum time to wait for LibreOffice before the
+                conversion is abandoned.
+
+        Returns:
+            The converted file, or ``uploaded_file`` itself (rewound to the
+            start) when the conversion fails, times out, or yields no content.
+        """
+        converted_content = b""
+        try:
+            # The stream position on entry is not guaranteed to be 0.
+            uploaded_file.seek(0)
+            # A private directory avoids name clashes with other temp files and
+            # is removed, with everything in it, when the block exits.
+            with tempfile.TemporaryDirectory() as work_dir:
+                input_path = Path(work_dir) / "input.doc"
+                output_path = input_path.with_suffix(".docx")
+                input_path.write_bytes(uploaded_file.read())
+
+                # subprocess.run only returns after LibreOffice has exited, so
+                # no extra wait is needed before reading the output file.
+                result = subprocess.run(
+                    [
+                        "/usr/bin/libreoffice",
+                        "--headless",
+                        "--convert-to",
+                        "docx",
+                        str(input_path),
+                        "--outdir",
+                        work_dir,
+                    ],
+                    check=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    cwd=work_dir,
+                    timeout=timeout_seconds,
+                )
+                # errors="replace": undecodable tool output must not abort an
+                # otherwise successful conversion.
+                logger.info("LibreOffice output: %s", result.stdout.decode(errors="replace"))
+                logger.info("LibreOffice errors: %s", result.stderr.decode(errors="replace"))
+
+                if output_path.exists():
+                    logger.info("Output path: %s", output_path)
+                    converted_content = output_path.read_bytes()
+                    logger.info("Converted file size: %d bytes", len(converted_content))
+                    if not converted_content:
+                        logger.error("Converted file is empty - this won't get converted")
+                else:
+                    logger.error("Output file not found: %s", output_path)
+        except Exception as e:
+            # Best effort: any failure falls back to the original upload.
+            logger.exception("Error converting doc file %s to docx: %s", uploaded_file.name, e)
+            converted_content = b""
+
+        if not converted_content:
+            # The original was consumed by read() above; rewind it so later
+            # processing does not see an empty stream.
+            uploaded_file.seek(0)
+            return uploaded_file
+
+        # Use the original file's name (with .docx) for the in-memory file
+        output_filename = Path(uploaded_file.name).with_suffix(".docx").name
+        new_file = InMemoryUploadedFile(
+            file=BytesIO(converted_content),
+            # Not every upload class carries field_name (e.g. disk-backed
+            # temporary uploads), so do not assume the attribute exists.
+            field_name=getattr(uploaded_file, "field_name", None),
+            name=output_filename,
+            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            size=len(converted_content),
+            charset="utf-8",
+        )
+        logger.info("doc file conversion to docx successful for %s", uploaded_file.name)
+        return new_file
+
+
     @staticmethod
     def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
         try:
@@ -172,14 +245,13 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
             )
         except (ValueError, FieldError, ValidationError) as e:
             logger.exception("Error creating File model object for %s.", uploaded_file, exc_info=e)
-            return e.args
-        except SuspiciousFileOperation:
-            return [
-                f"Your file name is {len(uploaded_file.name)} characters long. "
-                f"The file name will need to be shortened by {len(uploaded_file.name) - 75} characters"
-            ]
+            return list(e.args)
+        except Exception as e:
+            logger.exception("Unexpected error processing %s.", uploaded_file, exc_info=e)
+            return [str(e)]
         else:
             async_task(ingest, file.id, task_name=file.unique_name, group="ingest")
+            return []
 
 
 @login_required