Skip to content

Commit

Permalink
back: feat: break large geom when inserting data
Browse files Browse the repository at this point in the history
  • Loading branch information
ludovicdmt committed Feb 6, 2025
1 parent dfc0267 commit ccd16b7
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 87 deletions.
22 changes: 12 additions & 10 deletions .github/ISSUE_TEMPLATE/user-story.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@
name: User Story
about: Décrire la user story
title: ETQ dev/utilisateur.ice/SysAdmin/designer je ...
labels: ''
assignees: ''

labels: ""
assignees: ""
---

## Merci de décrire une nouvelle user story !
## Merci de décrire une nouvelle user story !

### Comportement attendu

**Explication :** Explique brièvement ce que l'utilisateur attend.
**Obligatoire**

### Contexte

### Comportement attendu
**Explication :** Explique brièvement ce que l'utilisateur attend.
**Obligatoire**
Ajouter ici tout élément de contexte.

### Contexte
Ajouter ici tout élément de contexte.
### Piste de solution

### Piste de solution
Proposez une solution possible.
31 changes: 14 additions & 17 deletions back/compute_factors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,23 @@

# Configuration
input_file="insee.txt" # File containing the INSEE codes
num_groups=5 # Number of groups (can be changed)
task_command="python manage.py c04_compute_factors --insee_code_city" # Base task command
task_command="python manage.py c04_compute_factors --delete --insee_code_city" # Base task command
num_parallel_tasks=4 # Number of parallel tasks

# Divide the input file into groups
total_lines=$(wc -l < "$input_file")
lines_per_group=$(( (total_lines + num_groups - 1) / num_groups )) # Calculate lines per group, round up
split -l "$lines_per_group" "$input_file" insee_group_
# Function to process INSEE codes
process_insee_code() {
local insee_code=$1
$task_command "$insee_code"
}

# Launch tasks in parallel
for file in insee_group_*; do
# Read the codes from the file and join them with commas
insee_codes=$(paste -sd, "$file")
echo $insee_codes
# Export function for parallel execution
export -f process_insee_code
export task_command
mkdir -p output

# Run tasks with parallel, limiting concurrency
parallel -j $num_parallel_tasks "$task_command {} > output/{}.log 2>&1" < "$input_file"

# Run the command in parallel
$task_command "$insee_codes" &
done

# Wait for all tasks to complete
wait

# Cleanup
rm insee_group_*
79 changes: 78 additions & 1 deletion back/iarbre_data/management/commands/c03_import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@
from django.contrib.gis.geos import GEOSGeometry
from django.core.management import BaseCommand
from shapely import unary_union
from shapely.geometry import box
from tqdm import tqdm

from iarbre_data.data_config import DATA_FILES, URL_FILES
from iarbre_data.models import Data
from iarbre_data.settings import DATA_DIR, TARGET_PROJ

from concurrent.futures import ThreadPoolExecutor, as_completed


def batched(iterable, n) -> None:
"""Batch data into tuples of length n. The last batch may be shorter."""
Expand Down Expand Up @@ -130,7 +133,7 @@ def process_data(df: gpd.GeoDataFrame, data_config):
Process geometries.
Args:
df (GeoDataFrame): GeoDataFrame to be apply actions on.
df (GeoDataFrame): GeoDataFrame to apply actions on.
data_config (dict): Configuration of the data.
Returns:
Expand All @@ -143,6 +146,7 @@ def process_data(df: gpd.GeoDataFrame, data_config):

for actions, factor in actions_factors:
sub_df = apply_actions(df.copy(), actions)
sub_df = split_factor_dataframe(sub_df, grid_size=10000)
if len(sub_df) == 0:
print(f"Factor: {factor} only contained Points")
continue
Expand Down Expand Up @@ -178,6 +182,79 @@ def save_geometries(datas, data_config) -> None:
)


def split_large_polygon(geom, grid_size, bounds, factor_crs) -> list:
    """
    Split a large polygon into smaller chunks based on a regular grid.
    Args:
        geom (shapely.geometry.base.BaseGeometry): Polygon to split.
        grid_size (int): Size of each square grid cell, in CRS units
            (meters for a projected CRS). Must be an int because it is
            used as a ``range`` step.
        bounds (tuple): (minx, miny, maxx, maxy) bounds of ALL the factor
            geometries, so every polygon is cut along the same grid.
        factor_crs (int): SRID of the factors data (kept for interface
            compatibility; the split itself is purely geometric).
    Returns:
        list: Intersection geometries, one per grid cell the polygon
        overlaps. Empty list when ``geom`` is None or empty — returning
        None here would make callers' ``list.extend`` blow up.
    """
    if geom is None or geom.is_empty:
        return []
    minx, miny, maxx, maxy = bounds
    split_polygons = []
    # Walk the grid covering the global bounds and intersect cell by cell.
    # No need to materialize a GeoDataFrame of cells per polygon.
    for x in range(int(minx), int(maxx), grid_size):
        for y in range(int(miny), int(maxy), grid_size):
            cell = box(x, y, x + grid_size, y + grid_size)
            if not geom.intersects(cell):
                continue
            intersection = geom.intersection(cell)
            if intersection.is_valid and not intersection.is_empty:
                split_polygons.append(intersection)
    return split_polygons


def split_factor_dataframe(factor_df, grid_size=10000) -> gpd.GeoDataFrame:
    """Split the polygon of each row into smaller ones, following a grid.
    Args:
        factor_df (geopandas.GeoDataFrame): Geometries for one factor.
        grid_size (int): Size in meters of the grid cells used to break
            geometries (int: used as a ``range`` step downstream).
    Returns:
        geopandas.GeoDataFrame: New dataframe with one row per split piece,
        in the same CRS as ``factor_df``.
    """
    # Shared bounds so every polygon is cut along the exact same grid.
    bounds = factor_df.total_bounds
    geometries = factor_df.geometry.values

    def _split_one(polygon):
        """Split a single polygon against the shared grid."""
        return split_large_polygon(
            polygon, grid_size=grid_size, bounds=bounds, factor_crs=factor_df.crs
        )

    split_geom = []
    # Each polygon is independent, so fan the work out across threads.
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_polygon = {
            executor.submit(_split_one, polygon): polygon for polygon in geometries
        }
        for future in as_completed(future_to_polygon):
            polygon = future_to_polygon.pop(future)
            try:
                results = future.result()
                # Guard: empty/None geometries legitimately produce nothing.
                if results:
                    split_geom.extend(results)
            except Exception as e:
                print(f"Error processing polygon {polygon}: {e}")

    return gpd.GeoDataFrame({"geometry": split_geom}, crs=factor_df.crs)


class Command(BaseCommand):
help = "Create grid and save it to DB"

Expand Down
84 changes: 25 additions & 59 deletions back/iarbre_data/management/commands/c04_compute_factors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

from django.contrib.gis.geos import GEOSGeometry
from django.core.management import BaseCommand
from tqdm import tqdm
from django.db import transaction
from shapely.geometry import box
from shapely.ops import unary_union
import geopandas as gpd
from tqdm import tqdm

from iarbre_data.data_config import FACTORS

Expand All @@ -15,6 +15,23 @@
TILE_BATCH_SIZE = 10_000


def has_intersection(geom, tiles_index):
    """
    Check if a geometry's bounding box intersects with any tile.
    Args:
        geom (shapely.geometry.base.BaseGeometry): Geometry to check.
        tiles_index (geopandas.GeoDataFrame.sindex): R-tree spatial index of the tiles.
    Returns:
        bool: True if the geometry's bounding box hits at least one tile,
        False otherwise (including when geom is None or empty).
    """
    if geom is None or geom.is_empty:
        return False
    bounding_box = box(*geom.bounds)
    # sindex.query returns integer POSITIONS of candidate tiles. `any(...)`
    # on those positions is wrong when the only match is position 0 (falsy),
    # silently dropping intersections with the first tile — test the count.
    return len(tiles_index.query(bounding_box)) > 0


def _compute_for_factor_partial_tiles(factor_df, tiles_df, std_area):
"""
Compute and store the proportion of standard tile area occupied by a geographic factor.
Expand All @@ -28,65 +45,12 @@ def _compute_for_factor_partial_tiles(factor_df, tiles_df, std_area):
Returns:
list[TileFactor]: List of TileFactor objects to be created in the database
"""
factor_crs = factor_df.crs

# Split large factor geometries into smaller ones
def split_large_polygon(geom, grid_size=10):
"""
Split a large polygon into smaller chunks based on a grid.
Args:
geom (shapely.geometry.Polygon): Polygon to split.
grid_size (int): Size of the grid in meters.
Returns:
list[shapely.geometry.MultiPolygon]: MultiPolygon with the split parts.
"""
if geom is None or geom.is_empty:
return None
# Create a grid covering the bounding box of the geometry
minx, miny, maxx, maxy = tiles_df.geometry.total_bounds
grid_cells = [
box(x, y, x + grid_size, y + grid_size)
for x in range(int(minx), int(maxx), grid_size)
for y in range(int(miny), int(maxy), grid_size)
]
clipped_parts = [
geom.intersection(cell) for cell in grid_cells if geom.intersects(cell)
]
return unary_union([part for part in clipped_parts if not part.is_empty])

factor_df["geometry"] = factor_df["geometry"].apply(
lambda g: split_large_polygon(g, grid_size=4000) if g and not g.is_empty else g
)

flattened_geometries = []
for idx, row in factor_df.iterrows():
flattened_geometries.append({"geometry": row["geometry"], "original_id": idx})

factor_df = gpd.GeoDataFrame(
flattened_geometries, geometry="geometry", crs=factor_crs
)

# Filter polygons in the bounding box of the tiles
tiles_index = tiles_df.sindex

def has_intersection(geom):
"""
Check if a geometry intersects with any tile.
Args:
geom (shapely.geometry.base.BaseGeometry): Geometry to check.
Returns:
bool: True if the geometry intersects with any tile, False otherwise.
"""
if geom is None or geom.is_empty:
return False
bounding_box = box(*geom.bounds)
return any(tiles_index.query(bounding_box))

idx_intersect = factor_df.geometry.apply(has_intersection)
idx_intersect = factor_df.geometry.apply(
lambda geom: has_intersection(geom, tiles_index)
)
possible_matches = factor_df[idx_intersect].copy()
if len(possible_matches) > 0:
df = tiles_df.clip(possible_matches)
Expand Down Expand Up @@ -119,6 +83,7 @@ def compute_for_factor(factor_name, tiles_df, std_area) -> None:
if not qs.exists():
return
factor_df = load_geodataframe_from_db(qs, [])

# compute and store by batch of 10k tiles
n_batches = len(tiles_df) // TILE_BATCH_SIZE + 1
for batch in tqdm(
Expand All @@ -140,7 +105,8 @@ def compute_for_factor(factor_name, tiles_df, std_area) -> None:
for row in df.itertuples(index=False)
if (row.id, factor_name) not in existing_pairs
]
TileFactor.objects.bulk_create(tile_factors)
with transaction.atomic():
TileFactor.objects.bulk_create(tile_factors)


def process_city(city, FACTORS, std_area, delete) -> None:
Expand Down

0 comments on commit ccd16b7

Please sign in to comment.