Skip to content

Commit

Permalink
back: feat: break large geom when inserting data
Browse files Browse the repository at this point in the history
  • Loading branch information
ludovicdmt committed Feb 6, 2025
1 parent dfc0267 commit ccd16b7
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 87 deletions.
22 changes: 12 additions & 10 deletions .github/ISSUE_TEMPLATE/user-story.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@
name: User Story
about: Décrire la user story
title: ETQ dev/utilisateur.ice/SysAdmin/designer je ...
labels: ''
assignees: ''

labels: ""
assignees: ""
---

## Merci de décrire une nouvelle user story !
## Merci de décrire une nouvelle user story !

### Comportement attendu

**Explication :** Explique brièvement ce que l'utilisateur attend.
**Obligatoire**

### Contexte

### Comportement attendu
**Explication :** Explique brièvement ce que l'utilisateur attend.
**Obligatoire**
Ajouter ici tout élément de contexte.

### Contexte
Ajouter ici tout élément de contexte.
### Piste de solution

### Piste de solution
Proposez une solution possible.
31 changes: 14 additions & 17 deletions back/compute_factors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,23 @@

# Configuration
input_file="insee.txt" # File containing the INSEE codes
num_groups=5 # Number of groups (can be changed)
task_command="python manage.py c04_compute_factors --insee_code_city" # Base task command
task_command="python manage.py c04_compute_factors --delete --insee_code_city" # Base task command
num_parallel_tasks=4 # Number of parallel tasks

# Divide the input file into groups
total_lines=$(wc -l < "$input_file")
lines_per_group=$(( (total_lines + num_groups - 1) / num_groups )) # Calculate lines per group, round up
split -l "$lines_per_group" "$input_file" insee_group_
# Function to process INSEE codes
process_insee_code() {
local insee_code=$1
$task_command "$insee_code"
}

# Launch tasks in parallel
for file in insee_group_*; do
# Read the codes from the file and join them with commas
insee_codes=$(paste -sd, "$file")
echo $insee_codes
# Export function for parallel execution
export -f process_insee_code
export task_command
mkdir -p output

# Run tasks with parallel, limiting concurrency
parallel -j $num_parallel_tasks "$task_command {} > output/{}.log 2>&1" < "$input_file"

# Run the command in parallel
$task_command "$insee_codes" &
done

# Wait for all tasks to complete
wait

# Cleanup
rm insee_group_*
79 changes: 78 additions & 1 deletion back/iarbre_data/management/commands/c03_import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@
from django.contrib.gis.geos import GEOSGeometry
from django.core.management import BaseCommand
from shapely import unary_union
from shapely.geometry import box
from tqdm import tqdm

from iarbre_data.data_config import DATA_FILES, URL_FILES
from iarbre_data.models import Data
from iarbre_data.settings import DATA_DIR, TARGET_PROJ

from concurrent.futures import ThreadPoolExecutor, as_completed


def batched(iterable, n) -> None:
"""Batch data into tuples of length n. The last batch may be shorter."""
Expand Down Expand Up @@ -130,7 +133,7 @@ def process_data(df: gpd.GeoDataFrame, data_config):
Process geometries.
Args:
df (GeoDataFrame): GeoDataFrame to be apply actions on.
df (GeoDataFrame): GeoDataFrame to apply actions on.
data_config (dict): Configuration of the data.
Returns:
Expand All @@ -143,6 +146,7 @@ def process_data(df: gpd.GeoDataFrame, data_config):

for actions, factor in actions_factors:
sub_df = apply_actions(df.copy(), actions)
sub_df = split_factor_dataframe(sub_df, grid_size=10000)
if len(sub_df) == 0:
print(f"Factor: {factor} only contained Points")
continue
Expand Down Expand Up @@ -178,6 +182,79 @@ def save_geometries(datas, data_config) -> None:
)


def split_large_polygon(geom, grid_size, bounds, factor_crs) -> list:
    """
    Split a large polygon into smaller chunks based on a regular grid.
    Args:
        geom (shapely.geometry.base.BaseGeometry): Polygon to split.
        grid_size (int): Size of each square grid cell, in CRS units
            (meters for a projected CRS). Must be an int because it is
            used as a ``range`` step.
        bounds (tuple): (minx, miny, maxx, maxy) bounds of ALL the factor
            geometries, so every polygon is cut along the same grid.
        factor_crs (int): SRID of the factors data (kept for interface
            compatibility; the split itself is purely geometric).
    Returns:
        list: Intersection geometries, one per grid cell the polygon
        overlaps. Empty list when ``geom`` is None or empty — returning
        None here would make callers' ``list.extend`` blow up.
    """
    if geom is None or geom.is_empty:
        return []
    minx, miny, maxx, maxy = bounds
    split_polygons = []
    # Walk the grid covering the global bounds and intersect cell by cell.
    # No need to materialize a GeoDataFrame of cells per polygon.
    for x in range(int(minx), int(maxx), grid_size):
        for y in range(int(miny), int(maxy), grid_size):
            cell = box(x, y, x + grid_size, y + grid_size)
            if not geom.intersects(cell):
                continue
            intersection = geom.intersection(cell)
            if intersection.is_valid and not intersection.is_empty:
                split_polygons.append(intersection)
    return split_polygons


def split_factor_dataframe(factor_df, grid_size=10000) -> gpd.GeoDataFrame:
    """Split the polygon of each row into smaller ones, following a grid.
    Args:
        factor_df (geopandas.GeoDataFrame): Geometries for one factor.
        grid_size (int): Size in meters of the grid cells used to break
            geometries (int: used as a ``range`` step downstream).
    Returns:
        geopandas.GeoDataFrame: New dataframe with one row per split piece,
        in the same CRS as ``factor_df``.
    """
    # Shared bounds so every polygon is cut along the exact same grid.
    bounds = factor_df.total_bounds
    geometries = factor_df.geometry.values

    def _split_one(polygon):
        """Split a single polygon against the shared grid."""
        return split_large_polygon(
            polygon, grid_size=grid_size, bounds=bounds, factor_crs=factor_df.crs
        )

    split_geom = []
    # Each polygon is independent, so fan the work out across threads.
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_polygon = {
            executor.submit(_split_one, polygon): polygon for polygon in geometries
        }
        for future in as_completed(future_to_polygon):
            polygon = future_to_polygon.pop(future)
            try:
                results = future.result()
                # Guard: empty/None geometries legitimately produce nothing.
                if results:
                    split_geom.extend(results)
            except Exception as e:
                print(f"Error processing polygon {polygon}: {e}")

    return gpd.GeoDataFrame({"geometry": split_geom}, crs=factor_df.crs)


class Command(BaseCommand):
help = "Create grid and save it to DB"

Expand Down
84 changes: 25 additions & 59 deletions back/iarbre_data/management/commands/c04_compute_factors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

from django.contrib.gis.geos import GEOSGeometry
from django.core.management import BaseCommand
from tqdm import tqdm
from django.db import transaction
from shapely.geometry import box
from shapely.ops import unary_union
import geopandas as gpd
from tqdm import tqdm

from iarbre_data.data_config import FACTORS

Expand All @@ -15,6 +15,23 @@
TILE_BATCH_SIZE = 10_000


def has_intersection(geom, tiles_index):
    """
    Check if a geometry's bounding box intersects with any tile.
    Args:
        geom (shapely.geometry.base.BaseGeometry): Geometry to check.
        tiles_index (geopandas.GeoDataFrame.sindex): R-tree spatial index of the tiles.
    Returns:
        bool: True if the geometry's bounding box hits at least one tile,
        False otherwise (including when geom is None or empty).
    """
    if geom is None or geom.is_empty:
        return False
    bounding_box = box(*geom.bounds)
    # sindex.query returns integer POSITIONS of candidate tiles. `any(...)`
    # on those positions is wrong when the only match is position 0 (falsy),
    # silently dropping intersections with the first tile — test the count.
    return len(tiles_index.query(bounding_box)) > 0


def _compute_for_factor_partial_tiles(factor_df, tiles_df, std_area):
"""
Compute and store the proportion of standard tile area occupied by a geographic factor.
Expand All @@ -28,65 +45,12 @@ def _compute_for_factor_partial_tiles(factor_df, tiles_df, std_area):
Returns:
list[TileFactor]: List of TileFactor objects to be created in the database
"""
factor_crs = factor_df.crs

# Split large factor geometries into smaller ones
def split_large_polygon(geom, grid_size=10):
"""
Split a large polygon into smaller chunks based on a grid.
Args:
geom (shapely.geometry.Polygon): Polygon to split.
grid_size (int): Size of the grid in meters.
Returns:
list[shapely.geometry.MultiPolygon]: MultiPolygon with the split parts.
"""
if geom is None or geom.is_empty:
return None
# Create a grid covering the bounding box of the geometry
minx, miny, maxx, maxy = tiles_df.geometry.total_bounds
grid_cells = [
box(x, y, x + grid_size, y + grid_size)
for x in range(int(minx), int(maxx), grid_size)
for y in range(int(miny), int(maxy), grid_size)
]
clipped_parts = [
geom.intersection(cell) for cell in grid_cells if geom.intersects(cell)
]
return unary_union([part for part in clipped_parts if not part.is_empty])

factor_df["geometry"] = factor_df["geometry"].apply(
lambda g: split_large_polygon(g, grid_size=4000) if g and not g.is_empty else g
)

flattened_geometries = []
for idx, row in factor_df.iterrows():
flattened_geometries.append({"geometry": row["geometry"], "original_id": idx})

factor_df = gpd.GeoDataFrame(
flattened_geometries, geometry="geometry", crs=factor_crs
)

# Filter polygons in the bounding box of the tiles
tiles_index = tiles_df.sindex

def has_intersection(geom):
"""
Check if a geometry intersects with any tile.
Args:
geom (shapely.geometry.base.BaseGeometry): Geometry to check.
Returns:
bool: True if the geometry intersects with any tile, False otherwise.
"""
if geom is None or geom.is_empty:
return False
bounding_box = box(*geom.bounds)
return any(tiles_index.query(bounding_box))

idx_intersect = factor_df.geometry.apply(has_intersection)
idx_intersect = factor_df.geometry.apply(
lambda geom: has_intersection(geom, tiles_index)
)
possible_matches = factor_df[idx_intersect].copy()
if len(possible_matches) > 0:
df = tiles_df.clip(possible_matches)
Expand Down Expand Up @@ -119,6 +83,7 @@ def compute_for_factor(factor_name, tiles_df, std_area) -> None:
if not qs.exists():
return
factor_df = load_geodataframe_from_db(qs, [])

# compute and store by batch of 10k tiles
n_batches = len(tiles_df) // TILE_BATCH_SIZE + 1
for batch in tqdm(
Expand All @@ -140,7 +105,8 @@ def compute_for_factor(factor_name, tiles_df, std_area) -> None:
for row in df.itertuples(index=False)
if (row.id, factor_name) not in existing_pairs
]
TileFactor.objects.bulk_create(tile_factors)
with transaction.atomic():
TileFactor.objects.bulk_create(tile_factors)


def process_city(city, FACTORS, std_area, delete) -> None:
Expand Down

0 comments on commit ccd16b7

Please sign in to comment.