Skip to content

Commit

Permalink
feat: Intercept incompatible (not UTF-8) text on upload and add funct…
Browse files Browse the repository at this point in the history
…ionality to convert doc to docx
  • Loading branch information
nboyse committed Feb 28, 2025
1 parent 37095ce commit 783dbb4
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 13 deletions.
10 changes: 10 additions & 0 deletions django_app/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ RUN poetry bundle venv ./venv
# app
FROM python:3.12-slim

# LibreOffice is installed so the upload view can convert legacy .doc files to
# .docx by running /usr/bin/libreoffice in headless mode.
# The apt package lists are removed in the same layer to keep the image smaller.
# NOTE(review): unoconv is not referenced by the conversion code visible in this
# change - confirm it is actually needed before keeping it in the image.
RUN apt-get update && apt-get install -y \
libreoffice \
libreoffice-writer \
unoconv \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

RUN apt-get update && apt-get install --yes libpq-dev curl > /dev/null

WORKDIR /usr/src/app
Expand All @@ -55,4 +62,7 @@ EXPOSE 8090

RUN chmod +x start.sh
RUN chmod +x health.sh

# Build-time smoke check: the image build fails here if the libreoffice
# command is missing or cannot start.
RUN libreoffice --version

CMD ["./start.sh"]
140 changes: 127 additions & 13 deletions django_app/redbox_app/redbox_core/views/document_views.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import logging
import subprocess
import tempfile
import time
import uuid
from collections.abc import MutableSequence, Sequence
from io import BytesIO
from pathlib import Path

from django.contrib.auth import get_user_model
from django.contrib.auth.decorators import login_required
from django.core.exceptions import FieldError, SuspiciousFileOperation, ValidationError
from django.core.files.uploadedfile import UploadedFile
from django.core.exceptions import FieldError, ValidationError
from django.core.files.uploadedfile import InMemoryUploadedFile, UploadedFile
from django.http import HttpRequest, HttpResponse, JsonResponse
from django.shortcuts import get_object_or_404, redirect, render
from django.urls import reverse
Expand Down Expand Up @@ -82,8 +86,14 @@ def post(self, request: HttpRequest) -> HttpResponse:
if not uploaded_files:
errors.append("No document selected")

for uploaded_file in uploaded_files:
for index, uploaded_file in enumerate(uploaded_files):
errors += self.validate_uploaded_file(uploaded_file)
# handling doc -> docx conversion
if self.is_doc_file(uploaded_file):
uploaded_files[index] = self.convert_doc_to_docx(uploaded_file)
# handling utf8 compatibility
elif not self.is_utf8_compatible(uploaded_file):
uploaded_files[index] = self.convert_to_utf8(uploaded_file)

if not errors:
for uploaded_file in uploaded_files:
Expand All @@ -108,22 +118,127 @@ def build_response(request: HttpRequest, errors: Sequence[str] | None = None) ->
@staticmethod
def validate_uploaded_file(uploaded_file: UploadedFile) -> Sequence[str]:
    """Collect human-readable validation problems for a single upload.

    Three independent checks are made: the file must have a name whose
    extension is in ``APPROVED_FILE_EXTENSIONS``, it must carry a
    content-type, and its size must not exceed ``MAX_FILE_SIZE``.

    Args:
        uploaded_file: the upload to inspect; it is not read or modified.

    Returns:
        A list of error messages, empty when the upload passes every check.
    """
    problems: MutableSequence[str] = []
    filename = uploaded_file.name

    if filename:
        # Extension comparison is case-insensitive; the message shows it as uploaded.
        extension = Path(filename).suffix
        if extension.lower() not in APPROVED_FILE_EXTENSIONS:
            problems.append(f"Error with {filename}: File type {extension} not supported")
    else:
        problems.append("File has no name")

    if not uploaded_file.content_type:
        problems.append(f"Error with {filename}: File has no content-type")

    # NOTE(review): the message hard-codes 200MB - confirm it matches MAX_FILE_SIZE.
    if uploaded_file.size > MAX_FILE_SIZE:
        problems.append(f"Error with {filename}: File is larger than 200MB")

    return problems

@staticmethod
def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
    """Report whether the full content of an upload decodes as UTF-8.

    The whole file is read and decoded in one go. The read position is
    always rewound to the start before returning, on both the success and
    the failure path, so the caller can read the upload again.

    Args:
        uploaded_file: the upload to check.

    Returns:
        ``True`` if every byte sequence is valid UTF-8, ``False`` otherwise.
    """
    uploaded_file.open()
    try:
        uploaded_file.read().decode("utf-8")
    except UnicodeDecodeError:
        logger.info("File is incompatible with utf-8. Converting...")
        return False
    finally:
        # Previously the rewind was skipped when decoding failed, leaving the
        # upload positioned at EOF for whoever touched it next.
        uploaded_file.seek(0)

    logger.info("File is compatible with utf-8 - ready for processing")
    return True

@staticmethod
def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
    """Re-encode an upload that is not valid UTF-8 as UTF-8.

    The bytes are interpreted as ISO-8859-1 and re-encoded as UTF-8. No
    encoding detection takes place: ISO-8859-1 assigns a character to every
    byte value, so the decode step itself cannot fail, but content that was
    really in another encoding will come out as different characters.

    Conversion is best-effort: on any error the problem is logged and the
    original upload is returned unchanged.

    Args:
        uploaded_file: the upload whose content failed UTF-8 decoding.

    Returns:
        A new ``InMemoryUploadedFile`` holding the UTF-8 bytes (same name,
        content-type ``application/octet-stream``), or ``uploaded_file``
        itself if the conversion failed.
    """
    try:
        uploaded_file.open()
        content = uploaded_file.read().decode("ISO-8859-1")

        # Re-encode the text (read as ISO-8859-1 above) as UTF-8 bytes.
        new_bytes = content.encode("utf-8")

        # Only in-memory uploads carry ``field_name``; disk-backed temporary
        # uploads do not, and reading the attribute directly used to raise
        # AttributeError here and silently abort the conversion.
        new_uploaded_file = InMemoryUploadedFile(
            file=BytesIO(new_bytes),
            field_name=getattr(uploaded_file, "field_name", None),
            name=uploaded_file.name,
            content_type="application/octet-stream",
            size=len(new_bytes),
            charset="utf-8",
        )
    except Exception as e:
        logger.exception("Error converting file %s to UTF-8.", uploaded_file, exc_info=e)
        return uploaded_file
    else:
        logger.info("Conversion to UTF-8 successful")
        return new_uploaded_file

@staticmethod
def is_doc_file(uploaded_file: UploadedFile) -> bool:
return Path(uploaded_file.name).suffix.lower() == ".doc"

@staticmethod
def convert_doc_to_docx(uploaded_file: UploadedFile) -> UploadedFile:
    """Convert a legacy ``.doc`` upload to ``.docx`` with headless LibreOffice.

    The upload is copied to a temporary ``.doc`` file, converted by running
    ``/usr/bin/libreoffice --headless --convert-to docx`` in the same
    directory, and the result is wrapped in a new ``InMemoryUploadedFile``
    named after the original with a ``.docx`` suffix. Both temporary files
    are removed afterwards.

    Conversion is best-effort. If LibreOffice fails, produces no output
    file, produces an empty file, or anything else goes wrong, the problem
    is logged and the original upload is returned (rewound to the start).

    Args:
        uploaded_file: the ``.doc`` upload to convert.

    Returns:
        The converted ``.docx`` upload, or ``uploaded_file`` itself when the
        conversion did not yield usable content.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp_input:
        # chunks() rewinds the upload and streams it, so the whole document
        # is copied without loading it into memory in a single read.
        for chunk in uploaded_file.chunks():
            tmp_input.write(chunk)
        input_path = Path(tmp_input.name)

    # LibreOffice writes "<input stem>.docx" into --outdir; using the temp
    # file's own directory keeps input and output side by side.
    output_dir = input_path.parent
    temp_output_path = input_path.with_suffix(".docx")

    # Fall back to the untouched upload unless a usable result is produced.
    new_file = uploaded_file

    try:
        # NOTE(review): no timeout is set, so a hung LibreOffice blocks the request.
        result = subprocess.run(  # noqa: S603
            [
                "/usr/bin/libreoffice",
                "--headless",
                "--convert-to",
                "docx",
                str(input_path),
                "--outdir",
                str(output_dir),
            ],
            check=True,
            capture_output=True,
            cwd=output_dir,
        )
        # Decode leniently: tool output in an unexpected encoding must not
        # raise and throw away an otherwise successful conversion.
        logger.info("LibreOffice output: %s", result.stdout.decode(errors="replace"))
        logger.info("LibreOffice errors: %s", result.stderr.decode(errors="replace"))

        if not temp_output_path.exists():
            logger.error("Output file not found: %s", temp_output_path)
        else:
            logger.info("Output path: %s", temp_output_path)

            # NOTE(review): subprocess.run() has already waited for the
            # process to exit - confirm this delay is still required.
            time.sleep(1)
            converted_content = temp_output_path.read_bytes()
            logger.info("Converted file size: %d bytes", len(converted_content))

            if not converted_content:
                # An empty result is unusable; keep the original upload
                # rather than handing on a zero-byte .docx.
                logger.error("Converted file is empty - this won't get converted")
            else:
                output_filename = Path(uploaded_file.name).with_suffix(".docx").name
                # Only in-memory uploads carry ``field_name``; disk-backed
                # temporary uploads do not, and reading it directly used
                # to raise AttributeError and abort the conversion.
                new_file = InMemoryUploadedFile(
                    file=BytesIO(converted_content),
                    field_name=getattr(uploaded_file, "field_name", None),
                    name=output_filename,
                    content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    size=len(converted_content),
                    charset="utf-8",
                )
                logger.info("doc file conversion to docx successful for %s", uploaded_file.name)
    except Exception as e:
        logger.exception("Error converting doc file %s to docx", uploaded_file.name, exc_info=e)
        new_file = uploaded_file
    finally:
        # Remove each temporary file independently so a failure on one does
        # not leave the other behind.
        for leftover in (input_path, temp_output_path):
            try:
                leftover.unlink(missing_ok=True)
            except OSError as cleanup_error:
                logger.warning("Error cleaning up temporary files: %s", cleanup_error)

    if new_file is uploaded_file:
        # The upload was consumed while copying; rewind it for the next reader.
        uploaded_file.seek(0)

    return new_file

@staticmethod
def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
try:
Expand All @@ -135,14 +250,13 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
)
except (ValueError, FieldError, ValidationError) as e:
logger.exception("Error creating File model object for %s.", uploaded_file, exc_info=e)
return e.args
except SuspiciousFileOperation:
return [
f"Your file name is {len(uploaded_file.name)} characters long. "
f"The file name will need to be shortened by {len(uploaded_file.name) - 75} characters"
]
return list(e.args)
except Exception as e:
logger.exception("Unexpected error processing %s.", uploaded_file, exc_info=e)
return [str(e)]
else:
async_task(ingest, file.id, task_name=file.unique_name, group="ingest")
return []


@login_required
Expand Down

0 comments on commit 783dbb4

Please sign in to comment.