From 7fa3b00d75c4877a2b98dbdb58704974fc9caad9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Odini?=
Date: Wed, 29 Jan 2025 11:17:17 +0100
Subject: [PATCH] refactor(API): Imports: new import management library (which
 checks the format, the max size, and whether the file was already uploaded)
 (#4936)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api/views/diagnostic.py       | 14 ++++-----
 api/views/diagnosticimport.py | 22 ++++----------
 api/views/purchaseimport.py   | 33 ++++++++-------------
 api/views/utils.py            |  8 -----
 common/utils/file_import.py   | 56 +++++++++++++++++++++++++++++++++++
 macantine/settings.py         |  1 +
 6 files changed, 79 insertions(+), 55 deletions(-)
 create mode 100644 common/utils/file_import.py

diff --git a/api/views/diagnostic.py b/api/views/diagnostic.py
index 425d6998fa..4ae9846b21 100644
--- a/api/views/diagnostic.py
+++ b/api/views/diagnostic.py
@@ -24,7 +24,7 @@
 )
 from api.serializers import DiagnosticAndCanteenSerializer, ManagerDiagnosticSerializer
 from api.views.utils import update_change_reason_with_auth
-from common.utils import send_mail
+from common.utils import file_import, send_mail
 from data.models import Canteen, Teledeclaration
 from data.models.diagnostic import Diagnostic
 
@@ -98,8 +98,9 @@ class EmailDiagnosticImportFileView(APIView):
 
     def post(self, request):
         try:
-            file = request.data["file"]
-            self._verify_file_size(file)
+            self.file = request.data["file"]
+            file_import.validate_file_size(self.file)
+            file_import.validate_file_format(self.file)
             email = request.data.get("email", request.user.email).strip()
             context = {
                 "from": email,
@@ -111,7 +112,7 @@ def post(self, request):
                 to=[settings.CONTACT_EMAIL],
                 reply_to=[email],
                 template="unusual_diagnostic_import_file",
-                attachments=[(file.name, file.read(), file.content_type)],
+                attachments=[(self.file.name, self.file.read(), self.file.content_type)],
                 context=context,
             )
         except ValidationError as e:
@@ -127,11 +128,6 @@ def post(self, request):
 
         return HttpResponse()
 
-    @staticmethod
-    def _verify_file_size(file):
-        if file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
 
 class DiagnosticsToTeledeclarePagination(LimitOffsetPagination):
     default_limit = 100
diff --git a/api/views/diagnosticimport.py b/api/views/diagnosticimport.py
index 42d85ea897..f988b4cc89 100644
--- a/api/views/diagnosticimport.py
+++ b/api/views/diagnosticimport.py
@@ -7,7 +7,6 @@
 from decimal import Decimal, InvalidOperation
 
 import requests
-from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.core.exceptions import ValidationError
 from django.core.validators import validate_email
@@ -21,13 +20,14 @@
 
 from api.permissions import IsAuthenticated
 from api.serializers import FullCanteenSerializer
+from common.utils import file_import
 from common.utils.siret import normalise_siret
 from data.models import Canteen, ImportFailure, ImportType, Sector
 from data.models.diagnostic import Diagnostic
 from data.models.teledeclaration import Teledeclaration
 
 from .canteen import AddManagerView
-from .utils import camelize, decode_bytes
+from .utils import camelize
 
 logger = logging.getLogger(__name__)
 
@@ -81,8 +81,8 @@ def post(self, request):
         try:
             with transaction.atomic():
                 self.file = request.data["file"]
-                ImportDiagnosticsView._verify_file_format(self.file)
-                ImportDiagnosticsView._verify_file_size(self.file)
+                file_import.validate_file_size(self.file)
+                file_import.validate_file_format(self.file)
                 self._process_file(self.file)
 
                 if self.errors:
@@ -114,18 +114,6 @@ def _log_error(self, message, level="warning"):
             import_type=self.import_type,
         )
 
-    @staticmethod
-    def _verify_file_format(file):
-        if file.content_type != "text/csv" and file.content_type != "text/tab-separated-values":
-            raise ValidationError(
-                f"Ce fichier est au format {file.content_type}, merci d'exporter votre fichier au format CSV et réessayer."
-            )
-
-    @staticmethod
-    def _verify_file_size(file):
-        if file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
     def check_admin_values(self, header):
         is_admin_import = any("admin_" in column for column in header)
         if is_admin_import and not self.request.user.is_staff:
@@ -174,7 +162,7 @@ def _process_file(self, file):
             self._update_location_data(locations_csv_str)
 
     def _decode_file(self, file):
-        (result, encoding) = decode_bytes(file.read())
+        (result, encoding) = file_import.decode_bytes(file.read())
         self.encoding_detected = encoding
         return result
 
diff --git a/api/views/purchaseimport.py b/api/views/purchaseimport.py
index fd61444dc2..081bbce422 100644
--- a/api/views/purchaseimport.py
+++ b/api/views/purchaseimport.py
@@ -1,5 +1,4 @@
 import csv
-import hashlib
 import io
 import json
 import logging
@@ -17,11 +16,11 @@
 
 from api.permissions import IsAuthenticated
 from api.serializers import PurchaseSerializer
+from common.utils import file_import
 from common.utils.siret import normalise_siret
 from data.models import Canteen, ImportFailure, ImportType, Purchase
 
-from .diagnosticimport import ImportDiagnosticsView
-from .utils import camelize, decode_bytes
+from .utils import camelize
 
 logger = logging.getLogger(__name__)
 
@@ -51,8 +50,14 @@ def post(self, request):
         logger.info("Purchase bulk import started")
         try:
             self.file = request.data["file"]
-            self._verify_file_size()
-            ImportDiagnosticsView._verify_file_format(self.file)
+            file_import.validate_file_size(self.file)
+            file_import.validate_file_format(self.file)
+
+            self.file_digest = file_import.get_file_digest(self.file)
+            self._check_duplication()
+
+            self.dialect = file_import.get_csv_file_dialect(self.file)
+
             with transaction.atomic():
                 self._process_file()
 
@@ -61,10 +66,6 @@ def post(self, request):
             if self.errors:
                 raise IntegrityError()
 
-            # The duplication check is called after the processing. The cost of eventually processing
-            # the file for nothing appears to be smaller than read the file twice.
-            self._check_duplication()
-
             # Update all purchases's import source with file digest
             Purchase.objects.filter(import_source=self.tmp_id).update(import_source=self.file_digest)
 
@@ -101,18 +102,14 @@ def _log_error(self, message, level="warning"):
         )
 
     def _process_file(self):
-        file_hash = hashlib.md5()
         chunk = []
         read_header = True
         row_count = 1
         for row in self.file:
-            file_hash.update(row)
-
             # Sniffing 1st line
             if read_header:
                 # decode header, discarding encoding result that might not be accurate without more data
-                (decoded_row, _) = decode_bytes(row)
-                self.dialect = csv.Sniffer().sniff(decoded_row)
+                (decoded_row, _) = file_import.decode_bytes(row)
                 csvreader = csv.reader(io.StringIO("".join(decoded_row)), self.dialect)
                 for header in csvreader:
                     if header != self.expected_header:
@@ -134,16 +131,10 @@ def _process_file(self):
         if len(chunk) > 0:
             self._process_chunk(chunk)
 
-        self.file_digest = file_hash.hexdigest()
-
-    def _verify_file_size(self):
-        if self.file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
     def _decode_chunk(self, chunk_list):
         if self.encoding_detected is None:
             chunk = b"".join(chunk_list)
-            (_, encoding) = decode_bytes(chunk)
+            (_, encoding) = file_import.decode_bytes(chunk)
             self.encoding_detected = encoding
 
         return [chunk.decode(self.encoding_detected) for chunk in chunk_list]
diff --git a/api/views/utils.py b/api/views/utils.py
index 6b7e529226..0ed9e5a085 100644
--- a/api/views/utils.py
+++ b/api/views/utils.py
@@ -1,7 +1,6 @@
 import json
 import logging
 
-import chardet
 from djangorestframework_camel_case.render import CamelCaseJSONRenderer
 from simple_history.utils import update_change_reason
 
@@ -21,10 +20,3 @@ def update_change_reason_with_auth(view, object):
     except Exception as e:
         logger.warning(f"Unable to set reason change on {view.__class__.__name__} for object ID : {object.id}: \n{e}")
         update_change_reason(object, "Unknown")
-
-
-def decode_bytes(bytes_string):
-    detection_result = chardet.detect(bytes_string)
-    encoding_detected = detection_result["encoding"]
-    logger.info(f"Encoding autodetected : {encoding_detected}")
-    return (bytes_string.decode(encoding_detected), encoding_detected)
diff --git a/common/utils/file_import.py b/common/utils/file_import.py
new file mode 100644
index 0000000000..85ab7df13b
--- /dev/null
+++ b/common/utils/file_import.py
@@ -0,0 +1,56 @@
+import csv
+import hashlib
+import logging
+
+import chardet
+from django.conf import settings
+from django.core.exceptions import ValidationError
+
+logger = logging.getLogger(__name__)
+
+
+def decode_bytes(bytes_string):
+    detection_result = chardet.detect(bytes_string)
+    encoding_detected = detection_result["encoding"]
+    logger.info(f"Encoding autodetected : {encoding_detected}")
+    return (bytes_string.decode(encoding_detected), encoding_detected)
+
+
+def get_file_size(file):
+    return file.size
+
+
+def validate_file_size(file):
+    if get_file_size(file) > settings.CSV_IMPORT_MAX_SIZE:
+        raise ValidationError(
+            f"Ce fichier est trop grand, merci d'utiliser un fichier de moins de {settings.CSV_IMPORT_MAX_SIZE_PRETTY}"
+        )
+
+
+def get_file_content_type(file):
+    return file.content_type
+
+
+def validate_file_format(file):
+    file_format = get_file_content_type(file)
+    if file_format not in ["text/csv", "text/tab-separated-values"]:
+        raise ValidationError(
+            f"Ce fichier est au format {file_format}, merci d'exporter votre fichier au format CSV et réessayer."
+        )
+
+
+def get_file_digest(file):
+    file_hash = hashlib.md5()
+    for row in file:
+        file_hash.update(row)
+    return file_hash.hexdigest()
+
+
+def get_csv_file_dialect(file):
+    """
+    Possible values: 'excel', 'excel-tab', 'unix'
+    """
+    file.seek(0)
+    row_1 = file.readline()
+    (decoded_row, _) = decode_bytes(row_1)
+    return csv.Sniffer().sniff(decoded_row)
diff --git a/macantine/settings.py b/macantine/settings.py
index 339d3d6102..f3fe75d6a3 100644
--- a/macantine/settings.py
+++ b/macantine/settings.py
@@ -420,6 +420,7 @@
 
 # Maximum CSV import file size: 10Mo
 CSV_IMPORT_MAX_SIZE = 10485760
+CSV_IMPORT_MAX_SIZE_PRETTY = "10Mo"
 
 # Size of each chunk when processing files
 CSV_PURCHASE_CHUNK_LINES = 10000
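
A quick way to exercise the new common/utils/file_import.py helpers is from a Django shell inside the project (python manage.py shell). The sketch below is illustrative and not part of the patch: the sample CSV bytes are made up, and SimpleUploadedFile merely stands in for the multipart upload the import views receive in request.data["file"].

    from django.core.files.uploadedfile import SimpleUploadedFile

    from common.utils import file_import

    # Hypothetical upload: a small semicolon-delimited CSV.
    file = SimpleUploadedFile(
        "purchases.csv",
        b"siret;description;prix\n12345678901234;Pommes;12.50\n",
        content_type="text/csv",
    )

    file_import.validate_file_size(file)    # raises ValidationError above CSV_IMPORT_MAX_SIZE
    file_import.validate_file_format(file)  # raises ValidationError unless CSV or TSV

    digest = file_import.get_file_digest(file)        # md5 hexdigest, used to spot re-uploads
    dialect = file_import.get_csv_file_dialect(file)  # csv.Dialect sniffed from the first line

    print(digest, dialect.delimiter)  # the sniffed delimiter here is ";"

The call order mirrors PurchaseImportView.post: the cheap attribute checks run first, then the digest feeding the duplicate-upload check, then dialect sniffing. Since get_csv_file_dialect rewinds the file itself (file.seek(0)), it is safe to call after get_file_digest has consumed the stream.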