Skip to content

Commit

Permalink
add logic to handle .doc errors
Browse files Browse the repository at this point in the history
  • Loading branch information
nboyse committed Feb 28, 2025
1 parent e1e8ba9 commit 0a98411
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 8 deletions.
10 changes: 10 additions & 0 deletions django_app/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ RUN poetry bundle venv ./venv
# app
FROM python:3.12-slim

# LibreOffice is needed at runtime: the document upload view shells out to
# /usr/bin/libreoffice (headless) to convert legacy .doc uploads to .docx.
# NOTE(review): unoconv is installed here as well, but the conversion code in
# this commit invokes libreoffice directly - confirm whether it is still needed.
# The apt cache and package lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y \
libreoffice \
libreoffice-writer \
unoconv \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

RUN apt-get update && apt-get install --yes libpq-dev curl > /dev/null

WORKDIR /usr/src/app
Expand All @@ -55,4 +62,7 @@ EXPOSE 8090

RUN chmod +x start.sh
RUN chmod +x health.sh

# Build-time sanity check: the image build fails here if the libreoffice
# binary is missing from PATH or cannot start.
RUN libreoffice --version

CMD ["./start.sh"]
88 changes: 80 additions & 8 deletions django_app/redbox_app/redbox_core/views/document_views.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import logging
import uuid
import os
import subprocess
import tempfile
import time
from collections.abc import MutableSequence, Sequence
from io import BytesIO
from pathlib import Path
Expand Down Expand Up @@ -85,7 +89,11 @@ def post(self, request: HttpRequest) -> HttpResponse:

for index, uploaded_file in enumerate(uploaded_files):
errors += self.validate_uploaded_file(uploaded_file)
if not self.is_utf8_compatible(uploaded_file):
# handling doc -> docx conversion
if self.is_doc_file(uploaded_file):
uploaded_files[index] = self.convert_doc_to_docx(uploaded_file)
# handling utf8 compatibility
elif not self.is_utf8_compatible(uploaded_file):
uploaded_files[index] = self.convert_to_utf8(uploaded_file)

if not errors:
Expand Down Expand Up @@ -130,7 +138,7 @@ def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
uploaded_file.read().decode("utf-8")
uploaded_file.seek(0)
except UnicodeDecodeError:
logger.info("File is incompatible. Converting...")
logger.info("File is incompatible with utf-8. Converting...")
return False
else:
logger.info("File is compatible with utf-8 - ready for processing")
Expand Down Expand Up @@ -161,6 +169,71 @@ def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
logger.info("Conversion to UTF-8 successful")
return new_uploaded_file

@staticmethod
def is_doc_file(uploaded_file: UploadedFile) -> bool:
return Path(uploaded_file.name).suffix.lower() == ".doc"

@staticmethod
def convert_doc_to_docx(uploaded_file: UploadedFile, timeout_seconds: float = 120.0) -> UploadedFile:
    """Convert a legacy ``.doc`` upload to ``.docx`` using headless LibreOffice.

    The upload is written to a private temporary directory, converted with
    ``/usr/bin/libreoffice --headless --convert-to docx`` and the result is
    wrapped in a new in-memory upload whose name is the original name with a
    ``.docx`` suffix.

    Conversion is best-effort: on any failure (LibreOffice error or timeout,
    missing or empty output, unexpected exception) the problem is logged and
    the original upload is returned, rewound to the start so callers can
    still read it.

    Args:
        uploaded_file: The ``.doc`` upload to convert.
        timeout_seconds: Maximum time LibreOffice may run before the
            conversion is abandoned.

    Returns:
        The converted ``.docx`` upload, or ``uploaded_file`` itself if the
        conversion could not be completed.
    """
    try:
        # Earlier validation may have consumed the stream; rewind so the whole document is read.
        uploaded_file.seek(0)
        doc_bytes = uploaded_file.read()
        # Rewind again straight away so every fallback path below hands back a readable upload.
        uploaded_file.seek(0)

        # A private directory avoids name clashes and is removed automatically, even on error.
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as work_dir:
            input_path = Path(work_dir, "input.doc")
            output_path = input_path.with_suffix(".docx")
            input_path.write_bytes(doc_bytes)

            # subprocess.run blocks until LibreOffice exits, so the output is complete on return.
            # The timeout stops a hung LibreOffice from blocking the request indefinitely.
            result = subprocess.run(
                ["/usr/bin/libreoffice", "--headless", "--convert-to", "docx", str(input_path), "--outdir", work_dir],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                cwd=work_dir,
                timeout=timeout_seconds,
            )
            logger.info("LibreOffice output: %s", result.stdout.decode(errors="replace"))
            logger.info("LibreOffice errors: %s", result.stderr.decode(errors="replace"))

            if not output_path.exists():
                logger.error("Output file not found: %s", output_path)
                return uploaded_file

            logger.info("Output path: %s", output_path)
            converted_content = output_path.read_bytes()

        logger.info("Converted file size: %d bytes", len(converted_content))
        if not converted_content:
            # An empty .docx cannot be ingested; keep the original rather than a useless file.
            logger.error("Converted file is empty - this won't get converted")
            return uploaded_file

        # Use the original file's name (with .docx) for the in-memory file
        output_filename = Path(uploaded_file.name).with_suffix(".docx").name
        new_file = InMemoryUploadedFile(
            file=BytesIO(converted_content),
            # Only in-memory uploads carry field_name; disk-backed temporary uploads do not.
            field_name=getattr(uploaded_file, "field_name", None),
            name=output_filename,
            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            size=len(converted_content),
            charset="utf-8",
        )
        logger.info("doc file conversion to docx successful for %s", uploaded_file.name)
    except Exception as e:
        # Best-effort boundary: any failure falls back to the unconverted upload.
        logger.exception("Error converting doc file %s to docx: %s", uploaded_file.name, e)
        return uploaded_file
    else:
        return new_file


@staticmethod
def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
try:
Expand All @@ -172,14 +245,13 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
)
except (ValueError, FieldError, ValidationError) as e:
logger.exception("Error creating File model object for %s.", uploaded_file, exc_info=e)
return e.args
except SuspiciousFileOperation:
return [
f"Your file name is {len(uploaded_file.name)} characters long. "
f"The file name will need to be shortened by {len(uploaded_file.name) - 75} characters"
]
return list(e.args)
except Exception as e:
logger.exception("Unexpected error processing %s.", uploaded_file, exc_info=e)
return [str(e)]
else:
async_task(ingest, file.id, task_name=file.unique_name, group="ingest")
return []


@login_required
Expand Down

0 comments on commit 0a98411

Please sign in to comment.