From 783dbb4d26c85aca5cca73a24f78da015c675132 Mon Sep 17 00:00:00 2001
From: Natasha Boyse
Date: Thu, 27 Feb 2025 12:04:38 +0000
Subject: [PATCH] feat: Intercept incompatible (not UTF-8) text on upload and
 add functionality to convert doc to docx

---
Review notes (ignored by git am):
- Only files that passed validation are converted; a nameless upload no longer
  reaches Path(None).
- UTF-8 re-encoding is limited to plain-text extensions, because transcoding
  binary formats (pdf, docx, ...) from ISO-8859-1 corrupts them.
- field_name is read with getattr: TemporaryUploadedFile has no such attribute.
- doc -> docx conversion uses a TemporaryDirectory and a subprocess timeout,
  drops time.sleep, and falls back to the rewound original on empty output.
- Dockerfile installs libreoffice-writer only; unoconv was never called.
- The index hashes below are those of the original revision of this patch.

 django_app/Dockerfile                         |   8 +
 .../redbox_core/views/document_views.py       | 151 ++++++++++++++++--
 2 files changed, 145 insertions(+), 14 deletions(-)

diff --git a/django_app/Dockerfile b/django_app/Dockerfile
index d6842e85f..7d94fdab2 100644
--- a/django_app/Dockerfile
+++ b/django_app/Dockerfile
@@ -35,6 +35,11 @@ RUN poetry bundle venv ./venv
 # app
 FROM python:3.12-slim
 
+RUN apt-get update && apt-get install --yes \
+    libreoffice-writer \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN apt-get update && apt-get install --yes libpq-dev curl > /dev/null
 
 WORKDIR /usr/src/app
@@ -55,4 +60,7 @@ EXPOSE 8090
 
 RUN chmod +x start.sh
 RUN chmod +x health.sh
+
+RUN libreoffice --version
+
 CMD ["./start.sh"]
diff --git a/django_app/redbox_app/redbox_core/views/document_views.py b/django_app/redbox_app/redbox_core/views/document_views.py
index 5dafde0b4..008c382fd 100644
--- a/django_app/redbox_app/redbox_core/views/document_views.py
+++ b/django_app/redbox_app/redbox_core/views/document_views.py
@@ -1,12 +1,15 @@
 import logging
+import subprocess
+import tempfile
 import uuid
 from collections.abc import MutableSequence, Sequence
+from io import BytesIO
 from pathlib import Path
 
 from django.contrib.auth import get_user_model
 from django.contrib.auth.decorators import login_required
-from django.core.exceptions import FieldError, SuspiciousFileOperation, ValidationError
-from django.core.files.uploadedfile import UploadedFile
+from django.core.exceptions import FieldError, ValidationError
+from django.core.files.uploadedfile import InMemoryUploadedFile, UploadedFile
 from django.http import HttpRequest, HttpResponse, JsonResponse
 from django.shortcuts import get_object_or_404, redirect, render
 from django.urls import reverse
@@ -82,8 +85,18 @@ def post(self, request: HttpRequest) -> HttpResponse:
         if not uploaded_files:
             errors.append("No document selected")
 
-        for uploaded_file in uploaded_files:
-            errors += self.validate_uploaded_file(uploaded_file)
+        for index, uploaded_file in enumerate(uploaded_files):
+            file_errors = self.validate_uploaded_file(uploaded_file)
+            errors += file_errors
+            if file_errors:
+                # A rejected file may have no name and must not be read or converted.
+                continue
+            if self.is_doc_file(uploaded_file):
+                # Legacy Word documents are converted to docx before ingest.
+                uploaded_files[index] = self.convert_doc_to_docx(uploaded_file)
+            elif self.is_text_file(uploaded_file) and not self.is_utf8_compatible(uploaded_file):
+                # Only plain-text formats are re-encoded; binary formats are never valid UTF-8.
+                uploaded_files[index] = self.convert_to_utf8(uploaded_file)
 
         if not errors:
             for uploaded_file in uploaded_files:
@@ -108,22 +121,133 @@ def build_response(request: HttpRequest, errors: Sequence[str] | None = None) ->
     @staticmethod
     def validate_uploaded_file(uploaded_file: UploadedFile) -> Sequence[str]:
         errors: MutableSequence[str] = []
-
         if not uploaded_file.name:
             errors.append("File has no name")
         else:
             file_extension = Path(uploaded_file.name).suffix
             if file_extension.lower() not in APPROVED_FILE_EXTENSIONS:
                 errors.append(f"Error with {uploaded_file.name}: File type {file_extension} not supported")
-
         if not uploaded_file.content_type:
             errors.append(f"Error with {uploaded_file.name}: File has no content-type")
-
         if uploaded_file.size > MAX_FILE_SIZE:
             errors.append(f"Error with {uploaded_file.name}: File is larger than 200MB")
-
         return errors
 
+    @staticmethod
+    def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
+        """Return True when the whole content of the upload decodes as UTF-8.
+
+        The read position is rewound afterwards, whatever the outcome.
+        """
+        try:
+            uploaded_file.open()
+            uploaded_file.read().decode("utf-8")
+        except UnicodeDecodeError:
+            logger.info("File is incompatible with utf-8. Converting...")
+            return False
+        else:
+            logger.info("File is compatible with utf-8 - ready for processing")
+            return True
+        finally:
+            uploaded_file.seek(0)
+
+    @staticmethod
+    def is_text_file(uploaded_file: UploadedFile) -> bool:
+        """Return True when the file extension denotes a plain-text format.
+
+        Only these formats may be re-encoded: binary formats such as docx and pdf
+        are never valid UTF-8 and would be corrupted by transcoding.
+        """
+        # NOTE(review): keep in step with the text entries of APPROVED_FILE_EXTENSIONS.
+        text_extensions = {".csv", ".htm", ".html", ".json", ".md", ".rst", ".tsv", ".txt", ".xml"}
+        return Path(uploaded_file.name).suffix.lower() in text_extensions
+
+    @staticmethod
+    def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
+        """Re-encode an upload from ISO-8859-1 to UTF-8.
+
+        Returns a new in-memory upload holding the re-encoded bytes, or the original
+        upload when it could not be read.
+        """
+        try:
+            uploaded_file.open()
+            # ISO-8859-1 maps every byte to a character, so this decode cannot fail.
+            new_bytes = uploaded_file.read().decode("ISO-8859-1").encode("utf-8")
+        except Exception as e:
+            logger.exception("Error converting file %s to UTF-8.", uploaded_file, exc_info=e)
+            return uploaded_file
+        else:
+            logger.info("Conversion to UTF-8 successful")
+            return InMemoryUploadedFile(
+                file=BytesIO(new_bytes),
+                # Uploads spooled to disk (TemporaryUploadedFile) have no field_name attribute.
+                field_name=getattr(uploaded_file, "field_name", None),
+                name=uploaded_file.name,
+                content_type="application/octet-stream",
+                size=len(new_bytes),
+                charset="utf-8",
+            )
+
+    @staticmethod
+    def is_doc_file(uploaded_file: UploadedFile) -> bool:
+        """Return True when the upload has a legacy Word (.doc) extension."""
+        return Path(uploaded_file.name).suffix.lower() == ".doc"
+
+    @staticmethod
+    def convert_doc_to_docx(uploaded_file: UploadedFile) -> UploadedFile:
+        """Convert a legacy .doc upload to .docx with headless LibreOffice.
+
+        Returns a new in-memory upload named ``<stem>.docx``, or the original upload
+        (rewound) when the conversion fails, times out or produces no output.
+        """
+        try:
+            # The scratch directory and everything in it is removed on exit.
+            with tempfile.TemporaryDirectory() as work_dir:
+                input_path = Path(work_dir) / "input.doc"
+                output_path = input_path.with_suffix(".docx")
+                with input_path.open("wb") as doc_file:
+                    for chunk in uploaded_file.chunks():
+                        doc_file.write(chunk)
+
+                result = subprocess.run(  # noqa: S603
+                    [
+                        "/usr/bin/libreoffice",
+                        "--headless",
+                        "--convert-to",
+                        "docx",
+                        str(input_path),
+                        "--outdir",
+                        work_dir,
+                    ],
+                    check=True,
+                    capture_output=True,
+                    cwd=work_dir,
+                    timeout=120,  # seconds; a hung LibreOffice must not block the request forever
+                )
+                logger.info("LibreOffice output: %s", result.stdout.decode())
+                logger.info("LibreOffice errors: %s", result.stderr.decode())
+                # read_bytes raises FileNotFoundError when LibreOffice wrote no output file.
+                converted_content = output_path.read_bytes()
+        except Exception as e:
+            logger.exception("Error converting doc file %s to docx", uploaded_file.name, exc_info=e)
+        else:
+            if converted_content:
+                logger.info("doc file conversion to docx successful for %s", uploaded_file.name)
+                return InMemoryUploadedFile(
+                    file=BytesIO(converted_content),
+                    # Uploads spooled to disk (TemporaryUploadedFile) have no field_name attribute.
+                    field_name=getattr(uploaded_file, "field_name", None),
+                    name=Path(uploaded_file.name).with_suffix(".docx").name,
+                    content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    size=len(converted_content),
+                    charset="utf-8",
+                )
+            logger.error("Converted file is empty - this won't get converted")
+
+        # Fall back to the untouched original, rewound because it was read above.
+        uploaded_file.seek(0)
+        return uploaded_file
+
     @staticmethod
     def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
         try:
@@ -135,14 +259,13 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
             )
         except (ValueError, FieldError, ValidationError) as e:
             logger.exception("Error creating File model object for %s.", uploaded_file, exc_info=e)
-            return e.args
-        except SuspiciousFileOperation:
-            return [
-                f"Your file name is {len(uploaded_file.name)} characters long. "
-                f"The file name will need to be shortened by {len(uploaded_file.name) - 75} characters"
-            ]
+            return list(e.args)
+        except Exception as e:
+            logger.exception("Unexpected error processing %s.", uploaded_file, exc_info=e)
+            return [str(e)]
         else:
             async_task(ingest, file.id, task_name=file.unique_name, group="ingest")
             return []
 
 
 @login_required