refactor(API): Imports: new import management library (which checks the format, the max size, and whether the file was already uploaded) (#4936)

raphodn authored Jan 29, 2025
1 parent dc04f52 commit 7fa3b00
Showing 6 changed files with 79 additions and 55 deletions.
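
At a glance: the per-view _verify_file_size/_verify_file_format helpers and the decode_bytes utility are consolidated into a new shared module, common/utils/file_import.py, which also adds digest-based duplicate detection and CSV dialect sniffing. A summary sketch of its public surface, as it appears in the diff below:

    from common.utils import file_import

    file_import.decode_bytes(raw_bytes)       # -> (decoded_str, encoding), via chardet
    file_import.validate_file_size(file)      # ValidationError if size > CSV_IMPORT_MAX_SIZE
    file_import.validate_file_format(file)    # ValidationError unless text/csv or text/tab-separated-values
    file_import.get_file_digest(file)         # MD5 hex digest, used to spot re-uploaded files
    file_import.get_csv_file_dialect(file)    # csv.Sniffer().sniff on the first line
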
14 changes: 5 additions & 9 deletions api/views/diagnostic.py
@@ -24,7 +24,7 @@
 )
 from api.serializers import DiagnosticAndCanteenSerializer, ManagerDiagnosticSerializer
 from api.views.utils import update_change_reason_with_auth
-from common.utils import send_mail
+from common.utils import file_import, send_mail
 from data.models import Canteen, Teledeclaration
 from data.models.diagnostic import Diagnostic
 
@@ -98,8 +98,9 @@ class EmailDiagnosticImportFileView(APIView):
 
     def post(self, request):
         try:
-            file = request.data["file"]
-            self._verify_file_size(file)
+            self.file = request.data["file"]
+            file_import.validate_file_size(self.file)
+            file_import.validate_file_format(self.file)
             email = request.data.get("email", request.user.email).strip()
             context = {
                 "from": email,
@@ -111,7 +112,7 @@ def post(self, request):
                 to=[settings.CONTACT_EMAIL],
                 reply_to=[email],
                 template="unusual_diagnostic_import_file",
-                attachments=[(file.name, file.read(), file.content_type)],
+                attachments=[(self.file.name, self.file.read(), self.file.content_type)],
                 context=context,
             )
         except ValidationError as e:
@@ -127,11 +128,6 @@ def post(self, request):
 
         return HttpResponse()
 
-    @staticmethod
-    def _verify_file_size(file):
-        if file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
 
 class DiagnosticsToTeledeclarePagination(LimitOffsetPagination):
     default_limit = 100
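
Both validators raise Django's ValidationError, which the view's existing except branch turns into an error response. A minimal standalone illustration (SimpleUploadedFile is Django's in-memory test upload; the format check itself needs no project settings):

    from django.core.exceptions import ValidationError
    from django.core.files.uploadedfile import SimpleUploadedFile

    from common.utils import file_import

    pdf = SimpleUploadedFile("diagnostic.pdf", b"%PDF-1.4", content_type="application/pdf")
    try:
        file_import.validate_file_format(pdf)
    except ValidationError as e:
        print(e.message)  # "Ce fichier est au format application/pdf, ..."
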
22 changes: 5 additions & 17 deletions api/views/diagnosticimport.py
@@ -7,7 +7,6 @@
 from decimal import Decimal, InvalidOperation
 
 import requests
-from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.core.exceptions import ValidationError
 from django.core.validators import validate_email
@@ -21,13 +20,14 @@
 
 from api.permissions import IsAuthenticated
 from api.serializers import FullCanteenSerializer
+from common.utils import file_import
 from common.utils.siret import normalise_siret
 from data.models import Canteen, ImportFailure, ImportType, Sector
 from data.models.diagnostic import Diagnostic
 from data.models.teledeclaration import Teledeclaration
 
 from .canteen import AddManagerView
-from .utils import camelize, decode_bytes
+from .utils import camelize
 
 logger = logging.getLogger(__name__)
 
@@ -81,8 +81,8 @@ def post(self, request):
         try:
             with transaction.atomic():
                 self.file = request.data["file"]
-                ImportDiagnosticsView._verify_file_format(self.file)
-                ImportDiagnosticsView._verify_file_size(self.file)
+                file_import.validate_file_size(self.file)
+                file_import.validate_file_format(self.file)
                 self._process_file(self.file)
 
             if self.errors:
@@ -114,18 +114,6 @@ def _log_error(self, message, level="warning"):
             import_type=self.import_type,
         )
 
-    @staticmethod
-    def _verify_file_format(file):
-        if file.content_type != "text/csv" and file.content_type != "text/tab-separated-values":
-            raise ValidationError(
-                f"Ce fichier est au format {file.content_type}, merci d'exporter votre fichier au format CSV et réessayer."
-            )
-
-    @staticmethod
-    def _verify_file_size(file):
-        if file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
     def check_admin_values(self, header):
         is_admin_import = any("admin_" in column for column in header)
         if is_admin_import and not self.request.user.is_staff:
@@ -174,7 +162,7 @@ def _process_file(self, file):
         self._update_location_data(locations_csv_str)
 
     def _decode_file(self, file):
-        (result, encoding) = decode_bytes(file.read())
+        (result, encoding) = file_import.decode_bytes(file.read())
         self.encoding_detected = encoding
         return result
 
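
The shared decode_bytes guesses the encoding with chardet before decoding, so imports survive non-UTF-8 exports. A quick standalone illustration (detection results can vary on short inputs):

    import chardet

    raw = "Cantine Élémentaire de l'École".encode("latin-1")
    detection = chardet.detect(raw)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}
    print(raw.decode(detection["encoding"]))  # Cantine Élémentaire de l'École
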
33 changes: 12 additions & 21 deletions api/views/purchaseimport.py
@@ -1,5 +1,4 @@
 import csv
-import hashlib
 import io
 import json
 import logging
@@ -17,11 +16,11 @@
 
 from api.permissions import IsAuthenticated
 from api.serializers import PurchaseSerializer
+from common.utils import file_import
 from common.utils.siret import normalise_siret
 from data.models import Canteen, ImportFailure, ImportType, Purchase
 
 from .diagnosticimport import ImportDiagnosticsView
-from .utils import camelize, decode_bytes
+from .utils import camelize
 
 logger = logging.getLogger(__name__)
 
@@ -51,8 +50,14 @@ def post(self, request):
         logger.info("Purchase bulk import started")
         try:
             self.file = request.data["file"]
-            self._verify_file_size()
-            ImportDiagnosticsView._verify_file_format(self.file)
+            file_import.validate_file_size(self.file)
+            file_import.validate_file_format(self.file)
+
+            self.file_digest = file_import.get_file_digest(self.file)
+            self._check_duplication()
+
+            self.dialect = file_import.get_csv_file_dialect(self.file)
+
             with transaction.atomic():
                 self._process_file()
 
@@ -61,10 +66,6 @@
                 if self.errors:
                     raise IntegrityError()
 
-                # The duplication check is called after the processing. The cost of eventually processing
-                # the file for nothing appears to be smaller than read the file twice.
-                self._check_duplication()
-
                 # Update all purchases's import source with file digest
                 Purchase.objects.filter(import_source=self.tmp_id).update(import_source=self.file_digest)
 
@@ -101,18 +102,14 @@ def _log_error(self, message, level="warning"):
         )
 
     def _process_file(self):
-        file_hash = hashlib.md5()
         chunk = []
         read_header = True
         row_count = 1
         for row in self.file:
-            file_hash.update(row)
-
             # Sniffing 1st line
             if read_header:
                 # decode header, discarding encoding result that might not be accurate without more data
-                (decoded_row, _) = decode_bytes(row)
-                self.dialect = csv.Sniffer().sniff(decoded_row)
+                (decoded_row, _) = file_import.decode_bytes(row)
                 csvreader = csv.reader(io.StringIO("".join(decoded_row)), self.dialect)
                 for header in csvreader:
                     if header != self.expected_header:
@@ -134,16 +131,10 @@ def _process_file(self):
         if len(chunk) > 0:
             self._process_chunk(chunk)
 
-        self.file_digest = file_hash.hexdigest()
-
-    def _verify_file_size(self):
-        if self.file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
     def _decode_chunk(self, chunk_list):
         if self.encoding_detected is None:
             chunk = b"".join(chunk_list)
-            (_, encoding) = decode_bytes(chunk)
+            (_, encoding) = file_import.decode_bytes(chunk)
             self.encoding_detected = encoding
         return [chunk.decode(self.encoding_detected) for chunk in chunk_list]
 
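
Note the reordering in post(): the old code deliberately ran _check_duplication after processing to avoid reading the file twice, while the new flow computes the digest up front and fails fast on duplicates. _check_duplication's body sits outside this diff's hunks; a hypothetical sketch of what it presumably does (body assumed, not shown in the commit; Purchase and ValidationError are already imported in this file):

    # Hypothetical: reject the upload when an identical file was already imported.
    def _check_duplication(self):
        if Purchase.objects.filter(import_source=self.file_digest).exists():
            raise ValidationError("Ce fichier a déjà été importé.")
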
8 changes: 0 additions & 8 deletions api/views/utils.py
@@ -1,7 +1,6 @@
 import json
 import logging
 
-import chardet
 from djangorestframework_camel_case.render import CamelCaseJSONRenderer
 from simple_history.utils import update_change_reason
 
@@ -21,10 +20,3 @@ def update_change_reason_with_auth(view, object):
     except Exception as e:
         logger.warning(f"Unable to set reason change on {view.__class__.__name__} for object ID : {object.id}: \n{e}")
         update_change_reason(object, "Unknown")
-
-
-def decode_bytes(bytes_string):
-    detection_result = chardet.detect(bytes_string)
-    encoding_detected = detection_result["encoding"]
-    logger.info(f"Encoding autodetected : {encoding_detected}")
-    return (bytes_string.decode(encoding_detected), encoding_detected)
56 changes: 56 additions & 0 deletions common/utils/file_import.py
@@ -0,0 +1,56 @@
+import csv
+import hashlib
+import logging
+
+import chardet
+from django.conf import settings
+from django.core.exceptions import ValidationError
+
+logger = logging.getLogger(__name__)
+
+
+def decode_bytes(bytes_string):
+    detection_result = chardet.detect(bytes_string)
+    encoding_detected = detection_result["encoding"]
+    logger.info(f"Encoding autodetected : {encoding_detected}")
+    return (bytes_string.decode(encoding_detected), encoding_detected)
+
+
+def get_file_size(file):
+    return file.size
+
+
+def validate_file_size(file):
+    if get_file_size(file) > settings.CSV_IMPORT_MAX_SIZE:
+        raise ValidationError(
+            f"Ce fichier est trop grand, merci d'utiliser un fichier de moins de {settings.CSV_IMPORT_MAX_SIZE_PRETTY}"
+        )
+
+
+def get_file_content_type(file):
+    return file.content_type
+
+
+def validate_file_format(file):
+    file_format = get_file_content_type(file)
+    if file_format not in ["text/csv", "text/tab-separated-values"]:
+        raise ValidationError(
+            f"Ce fichier est au format {file_format}, merci d'exporter votre fichier au format CSV et réessayer."
+        )
+
+
+def get_file_digest(file):
+    file_hash = hashlib.md5()
+    for row in file:
+        file_hash.update(row)
+    return file_hash.hexdigest()
+
+
+def get_csv_file_dialect(file):
+    """
+    Possible values: 'excel', 'excel-tab', 'unix'
+    """
+    file.seek(0)
+    row_1 = file.readline()
+    (decoded_row, _) = decode_bytes(row_1)
+    return csv.Sniffer().sniff(decoded_row)
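
A minimal end-to-end exercise of the new helpers, assuming a configured Django environment (SimpleUploadedFile is Django's in-memory upload class; the sample rows are made up):

    from django.core.files.uploadedfile import SimpleUploadedFile

    from common.utils import file_import

    content = b"siret;annee;valeur\n12345678901234;2024;1234.56\n"
    upload = SimpleUploadedFile("import.csv", content, content_type="text/csv")

    file_import.validate_file_size(upload)    # passes: well under CSV_IMPORT_MAX_SIZE
    file_import.validate_file_format(upload)  # passes: text/csv

    digest = file_import.get_file_digest(upload)        # MD5 of the raw bytes
    dialect = file_import.get_csv_file_dialect(upload)  # seeks to 0, sniffs the first line
    print(digest, dialect.delimiter)  # delimiter should be ";" here

Note that get_file_digest leaves the file pointer at the end, while get_csv_file_dialect seeks back to the start and consumes the first line; callers that read the file afterwards need to rewind it themselves.
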
1 change: 1 addition & 0 deletions macantine/settings.py
@@ -420,6 +420,7 @@
 
 # Maximum CSV import file size: 10Mo
 CSV_IMPORT_MAX_SIZE = 10485760
+CSV_IMPORT_MAX_SIZE_PRETTY = "10Mo"
 
 # Size of each chunk when processing files
 CSV_PURCHASE_CHUNK_LINES = 10000
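
The byte limit and the human-readable label travel together: the former drives the check in validate_file_size, the latter is interpolated into its error message. A hypothetical override for a larger limit would keep them in sync:

    # Hypothetical values for a deployment allowing bigger files:
    CSV_IMPORT_MAX_SIZE = 20 * 1024 * 1024  # 20971520 bytes
    CSV_IMPORT_MAX_SIZE_PRETTY = "20Mo"     # must describe the byte limit above
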
