Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
nboyse committed Feb 27, 2025
1 parent 9d45501 commit 9eb44e6
Showing 1 changed file with 43 additions and 6 deletions.
49 changes: 43 additions & 6 deletions django_app/redbox_app/redbox_core/views/document_views.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging
import uuid
from collections.abc import MutableSequence, Sequence
from io import BytesIO
from pathlib import Path

from django.contrib.auth import get_user_model
from django.contrib.auth.decorators import login_required
from django.core.exceptions import FieldError, SuspiciousFileOperation, ValidationError
from django.core.files.uploadedfile import UploadedFile
from django.core.files.uploadedfile import InMemoryUploadedFile, UploadedFile
from django.http import HttpRequest, HttpResponse, JsonResponse
from django.shortcuts import get_object_or_404, redirect, render
from django.urls import reverse
Expand Down Expand Up @@ -82,8 +83,10 @@ def post(self, request: HttpRequest) -> HttpResponse:
if not uploaded_files:
errors.append("No document selected")

for uploaded_file in uploaded_files:
for index, uploaded_file in enumerate(uploaded_files):
errors += self.validate_uploaded_file(uploaded_file)
if not self.is_utf8_compatible(uploaded_file):
uploaded_files[index] = self.convert_to_utf8(uploaded_file)

if not errors:
for uploaded_file in uploaded_files:
Expand All @@ -108,22 +111,56 @@ def build_response(request: HttpRequest, errors: Sequence[str] | None = None) ->
@staticmethod
def validate_uploaded_file(uploaded_file: UploadedFile) -> Sequence[str]:
    """Collect human-readable validation failures for a single upload.

    Three independent checks run in a fixed order, and every failing check
    contributes one message:

    1. the upload has a name whose (lower-cased) suffix is listed in
       ``APPROVED_FILE_EXTENSIONS``;
    2. the upload reports a content type;
    3. the upload's size does not exceed ``MAX_FILE_SIZE``.

    Returns:
        The list of error messages, empty when every check passes.
    """
    name = uploaded_file.name
    problems: MutableSequence[str] = []

    # Check 1: name and extension. The extension is only examined when a
    # name exists, since there is nothing to take a suffix from otherwise.
    if name:
        suffix = Path(name).suffix
        if suffix.lower() not in APPROVED_FILE_EXTENSIONS:
            problems.append(f"Error with {name}: File type {suffix} not supported")
    else:
        problems.append("File has no name")

    # Check 2: a content type must have been supplied with the upload.
    if not uploaded_file.content_type:
        problems.append(f"Error with {name}: File has no content-type")

    # Check 3: size ceiling. The message text hard-codes "200MB"; presumably
    # that matches MAX_FILE_SIZE - confirm against the constant's definition.
    if uploaded_file.size > MAX_FILE_SIZE:
        problems.append(f"Error with {name}: File is larger than 200MB")

    return problems

@staticmethod
def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
    """Report whether the upload's full content decodes cleanly as UTF-8.

    The whole file is read and decoded in one go, so memory use scales with
    the size of the upload. The read position is rewound to the start before
    returning, whether the check passes or fails, so that whoever consumes
    the file next sees it from the beginning.

    Returns:
        ``True`` if every byte sequence is valid UTF-8, ``False`` if decoding
        raised ``UnicodeDecodeError``.
    """
    try:
        uploaded_file.open()
        uploaded_file.read().decode("utf-8")
    except UnicodeDecodeError:
        logger.info("File is incompatible. Converting...")
        return False
    else:
        logger.info("File is compatible with utf-8 - ready for processing")
        return True
    finally:
        # Rewind on BOTH outcomes. Previously the seek only ran after a
        # successful decode, leaving the stream at end-of-file whenever the
        # content was not valid UTF-8.
        uploaded_file.seek(0)

@staticmethod
def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
    """Re-encode an upload from ISO-8859-1 to UTF-8, best effort.

    The content is read in full, interpreted as ISO-8859-1 and re-encoded as
    UTF-8 into a new in-memory upload that keeps the original name.

    NOTE(review): ISO-8859-1 maps every possible byte to a character, so the
    decode step never fails and nothing is actually "detected" here; the
    source encoding is simply assumed. Every byte >= 0x80 becomes a two-byte
    UTF-8 sequence, which would corrupt binary formats - callers should only
    pass text uploads. Confirm the call site guards against this.

    Returns:
        A new ``InMemoryUploadedFile`` holding the UTF-8 bytes, or the
        original ``uploaded_file`` (rewound to the start where possible) if
        anything goes wrong during conversion.
    """
    try:
        uploaded_file.open()
        content = uploaded_file.read().decode("ISO-8859-1")

        # Transcode: the text obtained above is written back out as UTF-8.
        new_bytes = content.encode("utf-8")

        # Creating a new InMemoryUploadedFile object with the converted content.
        # Only in-memory uploads carry a `field_name` attribute; disk-backed
        # uploads do not, and reading it unconditionally made the conversion
        # fail (and silently fall back) for them.
        new_uploaded_file = InMemoryUploadedFile(
            file=BytesIO(new_bytes),
            field_name=getattr(uploaded_file, "field_name", None),
            name=uploaded_file.name,
            content_type="application/octet-stream",
            size=len(new_bytes),
            charset="utf-8",
        )
    except Exception as e:
        # Deliberate best-effort boundary: a failed conversion must not abort
        # the upload, so log it and hand back the untouched original.
        logger.exception("Error converting file %s to UTF-8.", uploaded_file, exc_info=e)
        try:
            # The failed attempt may have consumed the stream; rewind so the
            # original is still readable from the start by the next consumer.
            uploaded_file.seek(0)
        except (OSError, ValueError):
            logger.warning("Could not rewind %s after failed UTF-8 conversion.", uploaded_file)
        return uploaded_file
    else:
        logger.info("Conversion to UTF-8 successful")
        return new_uploaded_file

@staticmethod
def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
try:
Expand Down

0 comments on commit 9eb44e6

Please sign in to comment.