Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
nboyse committed Feb 27, 2025
1 parent 9d45501 commit fc8dd48
Showing 1 changed file with 42 additions and 1 deletion.
43 changes: 42 additions & 1 deletion django_app/redbox_app/redbox_core/views/document_views.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
import logging
import re
import uuid
from collections.abc import MutableSequence, Sequence
from pathlib import Path

from django.contrib.auth import get_user_model
from django.contrib.auth.decorators import login_required
from django.core.exceptions import FieldError, SuspiciousFileOperation, ValidationError
from django.core.files.uploadedfile import UploadedFile
from django.core.files.uploadedfile import UploadedFile, InMemoryUploadedFile
from django.http import HttpRequest, HttpResponse, JsonResponse
from django.shortcuts import get_object_or_404, redirect, render
from django.urls import reverse
from django.utils.decorators import method_decorator
from django.views import View
from django.views.decorators.http import require_http_methods
from django_q.tasks import async_task
from io import BytesIO

from redbox_app.redbox_core.models import File
from redbox_app.worker import ingest
Expand Down Expand Up @@ -84,6 +86,8 @@ def post(self, request: HttpRequest) -> HttpResponse:

for uploaded_file in uploaded_files:
errors += self.validate_uploaded_file(uploaded_file)
if not self.is_utf8_compatible(uploaded_file):
uploaded_file = self.convert_to_utf8(uploaded_file)

if not errors:
for uploaded_file in uploaded_files:
Expand Down Expand Up @@ -124,6 +128,42 @@ def validate_uploaded_file(uploaded_file: UploadedFile) -> Sequence[str]:

return errors

@staticmethod
def is_utf8_compatible(uploaded_file: UploadedFile) -> bool:
    """Report whether the uploaded file's bytes decode cleanly as UTF-8.

    The whole file is read into memory for the check. The read position is
    rewound to the start afterwards on both outcomes, so the caller can
    read the file again whatever the result.

    Args:
        uploaded_file: The upload to inspect.

    Returns:
        True if the content decodes as UTF-8, False if decoding raises
        UnicodeDecodeError.
    """
    uploaded_file.open()
    try:
        uploaded_file.read().decode("utf-8")
    except UnicodeDecodeError:
        logger.info("File is incompatible. Converting...")
        return False
    else:
        logger.info("File is compatible with utf-8 - ready for processing")
        return True
    finally:
        # Rewind on the failure path too; previously the position was left
        # at end-of-file whenever decoding failed.
        uploaded_file.seek(0)

@staticmethod
def convert_to_utf8(uploaded_file: UploadedFile) -> UploadedFile:
    """Return an in-memory, UTF-8 re-encoded copy of the uploaded file.

    The source bytes are interpreted as ISO-8859-1 and re-encoded as UTF-8.
    Conversion is best-effort: if anything goes wrong the error is logged
    and the original upload is returned unchanged.

    Args:
        uploaded_file: The upload whose content is not valid UTF-8.

    Returns:
        A new InMemoryUploadedFile holding the UTF-8 bytes, or the original
        ``uploaded_file`` if the conversion failed.
    """
    try:
        uploaded_file.open()
        # ISO-8859-1 maps every byte value to a code point, so this decode
        # cannot raise; NOTE(review): it assumes the source really is
        # Latin-1 - other encodings will be converted to mojibake.
        text = uploaded_file.read().decode("ISO-8859-1")
        utf8_bytes = text.encode("utf-8")

        new_uploaded_file = InMemoryUploadedFile(
            file=BytesIO(utf8_bytes),
            # Only in-memory uploads carry ``field_name``; fall back to None
            # so disk-backed (temporary) uploads can be converted as well.
            field_name=getattr(uploaded_file, "field_name", None),
            name=uploaded_file.name,
            content_type="application/octet-stream",
            # Size is in bytes: non-ASCII characters take more than one
            # byte in UTF-8, so the character count would under-report.
            size=len(utf8_bytes),
            charset="utf-8",
        )
    except Exception:
        # Deliberate best-effort boundary: log with traceback and hand back
        # the untouched upload rather than failing the request.
        logger.exception("Error converting file %s to UTF-8.", uploaded_file)
        return uploaded_file

    # The file content itself is intentionally not logged: it may be large
    # and may contain sensitive user data.
    logger.info("this worked")
    return new_uploaded_file

@staticmethod
def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
try:
Expand All @@ -145,6 +185,7 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
async_task(ingest, file.id, task_name=file.unique_name, group="ingest")



@login_required
def remove_doc_view(request, doc_id: uuid):
file = get_object_or_404(File, id=doc_id)
Expand Down

0 comments on commit fc8dd48

Please sign in to comment.