refactor(API): Imports: new import management library (which checks the format, the max size, and whether the file was already uploaded) (#4936)

raphodn authored Jan 29, 2025
1 parent dc04f52 commit 7fa3b00
Showing 6 changed files with 79 additions and 55 deletions.
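
At a glance: the per-view _verify_file_size/_verify_file_format helpers and the decode_bytes utility are consolidated into a new shared module, common/utils/file_import.py, which also adds digest-based duplicate detection and CSV dialect sniffing. A summary sketch of its public surface, as it appears in the diff below:

    from common.utils import file_import

    file_import.decode_bytes(raw_bytes)       # -> (decoded_str, encoding), via chardet
    file_import.validate_file_size(file)      # ValidationError if size > CSV_IMPORT_MAX_SIZE
    file_import.validate_file_format(file)    # ValidationError unless text/csv or text/tab-separated-values
    file_import.get_file_digest(file)         # MD5 hex digest, used to spot re-uploaded files
    file_import.get_csv_file_dialect(file)    # csv.Sniffer().sniff on the first line
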
14 changes: 5 additions & 9 deletions api/views/diagnostic.py
@@ -24,7 +24,7 @@
 )
 from api.serializers import DiagnosticAndCanteenSerializer, ManagerDiagnosticSerializer
 from api.views.utils import update_change_reason_with_auth
-from common.utils import send_mail
+from common.utils import file_import, send_mail
 from data.models import Canteen, Teledeclaration
 from data.models.diagnostic import Diagnostic
 
@@ -98,8 +98,9 @@ class EmailDiagnosticImportFileView(APIView):
 
     def post(self, request):
         try:
-            file = request.data["file"]
-            self._verify_file_size(file)
+            self.file = request.data["file"]
+            file_import.validate_file_size(self.file)
+            file_import.validate_file_format(self.file)
             email = request.data.get("email", request.user.email).strip()
             context = {
                 "from": email,
@@ -111,7 +112,7 @@ def post(self, request):
                 to=[settings.CONTACT_EMAIL],
                 reply_to=[email],
                 template="unusual_diagnostic_import_file",
-                attachments=[(file.name, file.read(), file.content_type)],
+                attachments=[(self.file.name, self.file.read(), self.file.content_type)],
                 context=context,
             )
         except ValidationError as e:
@@ -127,11 +128,6 @@ def post(self, request):
 
         return HttpResponse()
 
-    @staticmethod
-    def _verify_file_size(file):
-        if file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
 
 class DiagnosticsToTeledeclarePagination(LimitOffsetPagination):
     default_limit = 100
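
Both validators raise Django's ValidationError, which the view's existing except branch turns into an error response. A minimal standalone illustration (SimpleUploadedFile is Django's in-memory test upload; the format check itself needs no project settings):

    from django.core.exceptions import ValidationError
    from django.core.files.uploadedfile import SimpleUploadedFile

    from common.utils import file_import

    pdf = SimpleUploadedFile("diagnostic.pdf", b"%PDF-1.4", content_type="application/pdf")
    try:
        file_import.validate_file_format(pdf)
    except ValidationError as e:
        print(e.message)  # "Ce fichier est au format application/pdf, ..."
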
22 changes: 5 additions & 17 deletions api/views/diagnosticimport.py
@@ -7,7 +7,6 @@
 from decimal import Decimal, InvalidOperation
 
 import requests
-from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.core.exceptions import ValidationError
 from django.core.validators import validate_email
@@ -21,13 +20,14 @@
 
 from api.permissions import IsAuthenticated
 from api.serializers import FullCanteenSerializer
+from common.utils import file_import
 from common.utils.siret import normalise_siret
 from data.models import Canteen, ImportFailure, ImportType, Sector
 from data.models.diagnostic import Diagnostic
 from data.models.teledeclaration import Teledeclaration
 
 from .canteen import AddManagerView
-from .utils import camelize, decode_bytes
+from .utils import camelize
 
 logger = logging.getLogger(__name__)
 
@@ -81,8 +81,8 @@ def post(self, request):
         try:
             with transaction.atomic():
                 self.file = request.data["file"]
-                ImportDiagnosticsView._verify_file_format(self.file)
-                ImportDiagnosticsView._verify_file_size(self.file)
+                file_import.validate_file_size(self.file)
+                file_import.validate_file_format(self.file)
                 self._process_file(self.file)
 
             if self.errors:
@@ -114,18 +114,6 @@ def _log_error(self, message, level="warning"):
             import_type=self.import_type,
         )
 
-    @staticmethod
-    def _verify_file_format(file):
-        if file.content_type != "text/csv" and file.content_type != "text/tab-separated-values":
-            raise ValidationError(
-                f"Ce fichier est au format {file.content_type}, merci d'exporter votre fichier au format CSV et réessayer."
-            )
-
-    @staticmethod
-    def _verify_file_size(file):
-        if file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
     def check_admin_values(self, header):
         is_admin_import = any("admin_" in column for column in header)
         if is_admin_import and not self.request.user.is_staff:
@@ -174,7 +162,7 @@ def _process_file(self, file):
         self._update_location_data(locations_csv_str)
 
     def _decode_file(self, file):
-        (result, encoding) = decode_bytes(file.read())
+        (result, encoding) = file_import.decode_bytes(file.read())
         self.encoding_detected = encoding
         return result
 
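
The shared decode_bytes guesses the encoding with chardet before decoding, so imports survive non-UTF-8 exports. A quick standalone illustration (detection results can vary on short inputs):

    import chardet

    raw = "Cantine Élémentaire de l'École".encode("latin-1")
    detection = chardet.detect(raw)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}
    print(raw.decode(detection["encoding"]))  # Cantine Élémentaire de l'École
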
33 changes: 12 additions & 21 deletions api/views/purchaseimport.py
@@ -1,5 +1,4 @@
 import csv
-import hashlib
 import io
 import json
 import logging
@@ -17,11 +16,11 @@
 
 from api.permissions import IsAuthenticated
 from api.serializers import PurchaseSerializer
+from common.utils import file_import
 from common.utils.siret import normalise_siret
 from data.models import Canteen, ImportFailure, ImportType, Purchase
 
 from .diagnosticimport import ImportDiagnosticsView
-from .utils import camelize, decode_bytes
+from .utils import camelize
 
 logger = logging.getLogger(__name__)
 
@@ -51,8 +50,14 @@ def post(self, request):
         logger.info("Purchase bulk import started")
         try:
             self.file = request.data["file"]
-            self._verify_file_size()
-            ImportDiagnosticsView._verify_file_format(self.file)
+            file_import.validate_file_size(self.file)
+            file_import.validate_file_format(self.file)
+
+            self.file_digest = file_import.get_file_digest(self.file)
+            self._check_duplication()
+
+            self.dialect = file_import.get_csv_file_dialect(self.file)
+
             with transaction.atomic():
                 self._process_file()
 
@@ -61,10 +66,6 @@
                 if self.errors:
                     raise IntegrityError()
 
-                # The duplication check is called after the processing. The cost of eventually processing
-                # the file for nothing appears to be smaller than read the file twice.
-                self._check_duplication()
-
                 # Update all purchases's import source with file digest
                 Purchase.objects.filter(import_source=self.tmp_id).update(import_source=self.file_digest)
 
@@ -101,18 +102,14 @@ def _log_error(self, message, level="warning"):
         )
 
     def _process_file(self):
-        file_hash = hashlib.md5()
         chunk = []
         read_header = True
         row_count = 1
         for row in self.file:
-            file_hash.update(row)
-
             # Sniffing 1st line
             if read_header:
                 # decode header, discarding encoding result that might not be accurate without more data
-                (decoded_row, _) = decode_bytes(row)
-                self.dialect = csv.Sniffer().sniff(decoded_row)
+                (decoded_row, _) = file_import.decode_bytes(row)
                 csvreader = csv.reader(io.StringIO("".join(decoded_row)), self.dialect)
                 for header in csvreader:
                     if header != self.expected_header:
@@ -134,16 +131,10 @@ def _process_file(self):
         if len(chunk) > 0:
             self._process_chunk(chunk)
 
-        self.file_digest = file_hash.hexdigest()
-
-    def _verify_file_size(self):
-        if self.file.size > settings.CSV_IMPORT_MAX_SIZE:
-            raise ValidationError("Ce fichier est trop grand, merci d'utiliser un fichier de moins de 10Mo")
-
     def _decode_chunk(self, chunk_list):
         if self.encoding_detected is None:
             chunk = b"".join(chunk_list)
-            (_, encoding) = decode_bytes(chunk)
+            (_, encoding) = file_import.decode_bytes(chunk)
             self.encoding_detected = encoding
         return [chunk.decode(self.encoding_detected) for chunk in chunk_list]
 
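
Note the reordering in post(): the old code deliberately ran _check_duplication after processing to avoid reading the file twice, while the new flow computes the digest up front and fails fast on duplicates. _check_duplication's body sits outside this diff's hunks; a hypothetical sketch of what it presumably does (body assumed, not shown in the commit; Purchase and ValidationError are already imported in this file):

    # Hypothetical: reject the upload when an identical file was already imported.
    def _check_duplication(self):
        if Purchase.objects.filter(import_source=self.file_digest).exists():
            raise ValidationError("Ce fichier a déjà été importé.")
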
8 changes: 0 additions & 8 deletions api/views/utils.py
@@ -1,7 +1,6 @@
 import json
 import logging
 
-import chardet
 from djangorestframework_camel_case.render import CamelCaseJSONRenderer
 from simple_history.utils import update_change_reason
 
@@ -21,10 +20,3 @@ def update_change_reason_with_auth(view, object):
     except Exception as e:
         logger.warning(f"Unable to set reason change on {view.__class__.__name__} for object ID : {object.id}: \n{e}")
         update_change_reason(object, "Unknown")
-
-
-def decode_bytes(bytes_string):
-    detection_result = chardet.detect(bytes_string)
-    encoding_detected = detection_result["encoding"]
-    logger.info(f"Encoding autodetected : {encoding_detected}")
-    return (bytes_string.decode(encoding_detected), encoding_detected)
56 changes: 56 additions & 0 deletions common/utils/file_import.py
@@ -0,0 +1,56 @@
+import csv
+import hashlib
+import logging
+
+import chardet
+from django.conf import settings
+from django.core.exceptions import ValidationError
+
+logger = logging.getLogger(__name__)
+
+
+def decode_bytes(bytes_string):
+    detection_result = chardet.detect(bytes_string)
+    encoding_detected = detection_result["encoding"]
+    logger.info(f"Encoding autodetected : {encoding_detected}")
+    return (bytes_string.decode(encoding_detected), encoding_detected)
+
+
+def get_file_size(file):
+    return file.size
+
+
+def validate_file_size(file):
+    if get_file_size(file) > settings.CSV_IMPORT_MAX_SIZE:
+        raise ValidationError(
+            f"Ce fichier est trop grand, merci d'utiliser un fichier de moins de {settings.CSV_IMPORT_MAX_SIZE_PRETTY}"
+        )
+
+
+def get_file_content_type(file):
+    return file.content_type
+
+
+def validate_file_format(file):
+    file_format = get_file_content_type(file)
+    if file_format not in ["text/csv", "text/tab-separated-values"]:
+        raise ValidationError(
+            f"Ce fichier est au format {file_format}, merci d'exporter votre fichier au format CSV et réessayer."
+        )
+
+
+def get_file_digest(file):
+    file_hash = hashlib.md5()
+    for row in file:
+        file_hash.update(row)
+    return file_hash.hexdigest()
+
+
+def get_csv_file_dialect(file):
+    """
+    Possible values: 'excel', 'excel-tab', 'unix'
+    """
+    file.seek(0)
+    row_1 = file.readline()
+    (decoded_row, _) = decode_bytes(row_1)
+    return csv.Sniffer().sniff(decoded_row)
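
A minimal end-to-end exercise of the new helpers, assuming a configured Django environment (SimpleUploadedFile is Django's in-memory upload class; the sample rows are made up):

    from django.core.files.uploadedfile import SimpleUploadedFile

    from common.utils import file_import

    content = b"siret;annee;valeur\n12345678901234;2024;1234.56\n"
    upload = SimpleUploadedFile("import.csv", content, content_type="text/csv")

    file_import.validate_file_size(upload)    # passes: well under CSV_IMPORT_MAX_SIZE
    file_import.validate_file_format(upload)  # passes: text/csv

    digest = file_import.get_file_digest(upload)        # MD5 of the raw bytes
    dialect = file_import.get_csv_file_dialect(upload)  # seeks to 0, sniffs the first line
    print(digest, dialect.delimiter)  # delimiter should be ";" here

Note that get_file_digest leaves the file pointer at the end, while get_csv_file_dialect seeks back to the start and consumes the first line; callers that read the file afterwards need to rewind it themselves.
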
1 change: 1 addition & 0 deletions macantine/settings.py
@@ -420,6 +420,7 @@
 
 # Maximum CSV import file size: 10Mo
 CSV_IMPORT_MAX_SIZE = 10485760
+CSV_IMPORT_MAX_SIZE_PRETTY = "10Mo"
 
 # Size of each chunk when processing files
 CSV_PURCHASE_CHUNK_LINES = 10000
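
The byte limit and the human-readable label travel together: the former drives the check in validate_file_size, the latter is interpolated into its error message. A hypothetical override for a larger limit would keep them in sync:

    # Hypothetical values for a deployment allowing bigger files:
    CSV_IMPORT_MAX_SIZE = 20 * 1024 * 1024  # 20971520 bytes
    CSV_IMPORT_MAX_SIZE_PRETTY = "20Mo"     # must describe the byte limit above
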
