From a4834fce4d4bbd38f6a33296b9838b8539f0bf4b Mon Sep 17 00:00:00 2001
From: KriWay
Date: Wed, 2 Oct 2024 11:07:16 +0200
Subject: [PATCH 1/2] refactor: rename variables

---
 cropclassification/general.ini | 28 +++++++++++
 .../preprocess/_timeseries_calc_openeo.py | 2 +-
 .../util/zonal_stats_bulk/__init__.py | 9 +++-
 .../_zonal_stats_bulk_exactextract.py | 46 +++++++++++++++----
 tests/test_zonal_stats_bulk.py | 37 +++++++++++++++
 5 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/cropclassification/general.ini b/cropclassification/general.ini
index 2ba6299..99bc4cd 100644
--- a/cropclassification/general.ini
+++ b/cropclassification/general.ini
@@ -147,6 +147,34 @@ max_cloudcover_pct = 15
 # The min percentage of parcels that need to have valid data for a time+sensor to use it
 min_parcels_with_data_pct = 80
 
+# spatial_aggregations to use in the zonal statistics calculation.
+spatial_aggregations = count, mean, median, std, min, max
+
+# spatial_aggregation arguments to use with exactextract.
+# https://isciences.github.io/exactextract/operations.html
+# Possible values are:
+# - coverage_weight: specifies the weight value to use for each pixel in the
+#   aggregation formulas. The following methods are available:
+#     - fraction: the default method, with the weight ranging from 0 to 1
+#     - none: the weight is always equal to 1; all pixels are given the same
+#       weight in the above calculations regardless of their coverage fraction
+#     - area_cartesian: the weight is the fraction of the pixel that is covered
+#       multiplied by its x and y resolutions
+#     - area_spherical_m2: the weight is the fraction of the pixel that is
+#       covered multiplied by a spherical approximation of the cell’s area
+#       in square meters
+#     - area_spherical_km2: the weight is the fraction of the pixel that is
+#       covered multiplied by a spherical approximation of the cell’s area
+#       in square kilometers
+# - default_value: specifies a value to be used for NODATA pixels
+#   instead of ignoring them
+# - default_weight: specifies a weighting value to be used for NODATA pixels
+#   instead of ignoring them
+# - min_coverage_frac: specifies the minimum fraction of the pixel (0 to 1)
+#   that must be covered in order for a pixel to be considered in calculations.
+#   Defaults to 0.
+spatial_aggregation_args = {"min_coverage_frac": "0.5"}
+
 # Configuration specific to the marker being calculated.
[marker] diff --git a/cropclassification/preprocess/_timeseries_calc_openeo.py b/cropclassification/preprocess/_timeseries_calc_openeo.py index 2623dc7..375afcd 100644 --- a/cropclassification/preprocess/_timeseries_calc_openeo.py +++ b/cropclassification/preprocess/_timeseries_calc_openeo.py @@ -79,7 +79,7 @@ def calculate_periodic_timeseries( id_column=conf.columns["id"], rasters_bands=images_bands, output_dir=timeseries_periodic_dir, - stats=["count", "mean", "median", "std", "min", "max"], # type: ignore[arg-type] + stats=conf.timeseries.getlist("spatial_aggregations"), engine="pyqgis", nb_parallel=nb_parallel, ) diff --git a/cropclassification/util/zonal_stats_bulk/__init__.py b/cropclassification/util/zonal_stats_bulk/__init__.py index 5b9aaa1..fc30c11 100644 --- a/cropclassification/util/zonal_stats_bulk/__init__.py +++ b/cropclassification/util/zonal_stats_bulk/__init__.py @@ -1,6 +1,8 @@ from pathlib import Path from typing import Optional, Union +from cropclassification.helpers import config_helper as conf + from ._raster_helper import * # noqa: F403 @@ -81,7 +83,12 @@ def zonal_stats( vector_path=vector_path, rasters_bands=rasters_bands, output_dir=output_dir, - stats=stats, + stats={ + "spatial_aggregations": stats, + "spatial_aggregation_args": conf.timeseries.getdict( + "spatial_aggregation_args" + ), + }, columns=[id_column], nb_parallel=nb_parallel, force=force, diff --git a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py index 19a72a5..614d47a 100644 --- a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py +++ b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py @@ -6,7 +6,7 @@ from concurrent import futures from datetime import datetime from pathlib import Path -from typing import Union +from typing import Any, Union import geopandas as gpd import pandas as pd @@ -28,7 +28,7 @@ def zonal_stats( columns: list[str], rasters_bands: list[tuple[Path, list[str]]], output_dir: Path, - stats: list[str], + stats: dict[str, Any], nb_parallel: int = -1, force: bool = False, ): @@ -212,7 +212,7 @@ def zonal_stats_band( vector_path, raster_path: Path, tmp_dir: Path, - stats: list[str], + stats: dict[str, Any], include_cols: list[str], ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: # Get the image info @@ -233,7 +233,7 @@ def zonal_stats_band( stats_df = exactextract.exact_extract( rast=raster_path, vec=vector_proj_path, - ops=stats, + ops=get_ops(stats), include_geom=False, output="pandas", include_cols=include_cols, @@ -251,7 +251,7 @@ def zonal_stats_band_tofile( output_paths: dict[str, Path], bands: list[str], tmp_dir: Path, - stats: list[str], + stats: dict[str, Any], include_cols: list[str], force: bool = False, ) -> dict[str, Path]: @@ -275,10 +275,14 @@ def zonal_stats_band_tofile( for band in bands: index = raster_info.bands[band].band_index band_columns = include_cols.copy() - band_columns.extend([f"band_{index}_{stat}" for stat in stats]) + band_columns.extend( + [f"band_{index}_{stat}" for stat in stats["spatial_aggregations"]] + ) band_stats_df = stats_df[band_columns].copy() band_stats_df.rename( - columns={f"band_{index}_{stat}": stat for stat in stats}, + columns={ + f"band_{index}_{stat}": stat for stat in stats["spatial_aggregations"] + }, inplace=True, ) # Add fid column to the beginning of the dataframe @@ -289,6 +293,32 @@ def zonal_stats_band_tofile( f"Write data for {len(band_stats_df.index)} parcels found to 
{output_paths[band]}" # noqa: E501 ) if not output_paths[band].exists(): - pdh.to_file(band_stats_df, output_paths[band], index=False) + # Write the info table to the output file + pdh.to_file(df=band_stats_df, path=output_paths[band], index=False) + + # Write the parameters table to the output file + spatial_aggregation_args = [] + for key, value in stats["spatial_aggregation_args"].items(): + spatial_aggregation_args.append([key, value]) + spatial_aggregation_args_df = pd.DataFrame( + data=spatial_aggregation_args, columns=["ops", "value"] + ) + pdh.to_file( + df=spatial_aggregation_args_df, + path=output_paths[band], + table_name="params", + index=False, + append=True, + ) return output_paths + + +def get_ops(stats: dict[str, Any]) -> list[str]: + spatial_aggregation_args = stats["spatial_aggregation_args"] + ops_params_list = [ + f"{y}={spatial_aggregation_args[y]}" for y in spatial_aggregation_args + ] + ops_params_string = ",".join(ops_params_list) + + return [f"{x}({ops_params_string})" for x in stats["spatial_aggregations"]] diff --git a/tests/test_zonal_stats_bulk.py b/tests/test_zonal_stats_bulk.py index 530bbde..cc651aa 100644 --- a/tests/test_zonal_stats_bulk.py +++ b/tests/test_zonal_stats_bulk.py @@ -3,8 +3,12 @@ import geofileops as gfo import pytest +from cropclassification.helpers import config_helper as conf from cropclassification.helpers import pandas_helper as pdh from cropclassification.util import zonal_stats_bulk +from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_exactextract import ( + get_ops, +) from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_pyqgis import HAS_QGIS from tests.test_helper import SampleData @@ -19,6 +23,16 @@ def test_zonal_stats_bulk(tmp_path, engine): test_dir = tmp_path / sample_dir.name shutil.copytree(sample_dir, test_dir) + # Read the configuration + config_paths = [ + SampleData.config_dir / "cropgroup.ini", + SampleData.tasks_dir / "local_overrule.ini", + ] + conf.read_config( + config_paths=config_paths, + default_basedir=SampleData.marker_basedir, + ) + # Make sure the s2-agri input file was copied test_image_roi_dir = test_dir / SampleData.image_dir.name / SampleData.roi_name test_s1_asc_dir = test_image_roi_dir / SampleData.image_s1_asc_path.parent.name @@ -46,3 +60,26 @@ def test_zonal_stats_bulk(tmp_path, engine): assert len(result_df) == vector_info.featurecount # The calculates stats should not be nan for any row. 
     assert not any(result_df["mean"].isna())
+
+
+def test_spatial_aggregations():
+    # Read the configuration
+    config_paths = [
+        SampleData.config_dir / "cropgroup.ini",
+        SampleData.tasks_dir / "local_overrule.ini",
+    ]
+    conf.read_config(
+        config_paths=config_paths,
+        default_basedir=SampleData.marker_basedir,
+    )
+    spatial_aggregations = conf.timeseries.getlist("spatial_aggregations")
+    spatial_aggregation_args = conf.timeseries.getdict("spatial_aggregation_args")
+
+    ops = get_ops(
+        {
+            "spatial_aggregations": spatial_aggregations,
+            "spatial_aggregation_args": spatial_aggregation_args,
+        }
+    )
+
+    assert len(ops) == 6

From cd91a8ada92aec60236e14190789f8442bd6c116 Mon Sep 17 00:00:00 2001
From: KriWay
Date: Mon, 14 Oct 2024 08:53:45 +0200
Subject: [PATCH 2/2] refactor aggregation parameters

---
 cropclassification/general.ini | 28 ---------
 .../preprocess/_timeseries_calc_openeo.py | 2 +-
 .../util/zonal_stats_bulk/__init__.py | 9 +--
 .../_zonal_stats_bulk_exactextract.py | 30 +++------
 tests/test_zonal_stats_bulk.py | 60 +++++++++++--------
 5 files changed, 46 insertions(+), 83 deletions(-)

diff --git a/cropclassification/general.ini b/cropclassification/general.ini
index 99bc4cd..2ba6299 100644
--- a/cropclassification/general.ini
+++ b/cropclassification/general.ini
@@ -147,34 +147,6 @@ max_cloudcover_pct = 15
 # The min percentage of parcels that need to have valid data for a time+sensor to use it
 min_parcels_with_data_pct = 80
 
-# spatial_aggregations to use in the zonal statistics calculation.
-spatial_aggregations = count, mean, median, std, min, max
-
-# spatial_aggregation arguments to use with exactextract.
-# https://isciences.github.io/exactextract/operations.html
-# Possible values are:
-# - coverage_weight: specifies the weight value to use for each pixel in the
-#   aggregation formulas. The following methods are available:
-#     - fraction: the default method, with the weight ranging from 0 to 1
-#     - none: the weight is always equal to 1; all pixels are given the same
-#       weight in the above calculations regardless of their coverage fraction
-#     - area_cartesian: the weight is the fraction of the pixel that is covered
-#       multiplied by its x and y resolutions
-#     - area_spherical_m2: the weight is the fraction of the pixel that is
-#       covered multiplied by a spherical approximation of the cell’s area
-#       in square meters
-#     - area_spherical_km2: the weight is the fraction of the pixel that is
-#       covered multiplied by a spherical approximation of the cell’s area
-#       in square kilometers
-# - default_value: specifies a value to be used for NODATA pixels
-#   instead of ignoring them
-# - default_weight: specifies a weighting value to be used for NODATA pixels
-#   instead of ignoring them
-# - min_coverage_frac: specifies the minimum fraction of the pixel (0 to 1)
-#   that must be covered in order for a pixel to be considered in calculations.
-#   Defaults to 0.
-spatial_aggregation_args = {"min_coverage_frac": "0.5"}
-
 # Configuration specific to the marker being calculated.
[marker] diff --git a/cropclassification/preprocess/_timeseries_calc_openeo.py b/cropclassification/preprocess/_timeseries_calc_openeo.py index 375afcd..a78fefe 100644 --- a/cropclassification/preprocess/_timeseries_calc_openeo.py +++ b/cropclassification/preprocess/_timeseries_calc_openeo.py @@ -79,7 +79,7 @@ def calculate_periodic_timeseries( id_column=conf.columns["id"], rasters_bands=images_bands, output_dir=timeseries_periodic_dir, - stats=conf.timeseries.getlist("spatial_aggregations"), + stats=["count", "mean", "median", "std", "min", "max"], engine="pyqgis", nb_parallel=nb_parallel, ) diff --git a/cropclassification/util/zonal_stats_bulk/__init__.py b/cropclassification/util/zonal_stats_bulk/__init__.py index fc30c11..5b9aaa1 100644 --- a/cropclassification/util/zonal_stats_bulk/__init__.py +++ b/cropclassification/util/zonal_stats_bulk/__init__.py @@ -1,8 +1,6 @@ from pathlib import Path from typing import Optional, Union -from cropclassification.helpers import config_helper as conf - from ._raster_helper import * # noqa: F403 @@ -83,12 +81,7 @@ def zonal_stats( vector_path=vector_path, rasters_bands=rasters_bands, output_dir=output_dir, - stats={ - "spatial_aggregations": stats, - "spatial_aggregation_args": conf.timeseries.getdict( - "spatial_aggregation_args" - ), - }, + stats=stats, columns=[id_column], nb_parallel=nb_parallel, force=force, diff --git a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py index 614d47a..c18430d 100644 --- a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py +++ b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py @@ -6,7 +6,7 @@ from concurrent import futures from datetime import datetime from pathlib import Path -from typing import Any, Union +from typing import Union import geopandas as gpd import pandas as pd @@ -28,7 +28,7 @@ def zonal_stats( columns: list[str], rasters_bands: list[tuple[Path, list[str]]], output_dir: Path, - stats: dict[str, Any], + stats: list[str], nb_parallel: int = -1, force: bool = False, ): @@ -212,7 +212,7 @@ def zonal_stats_band( vector_path, raster_path: Path, tmp_dir: Path, - stats: dict[str, Any], + stats: list[str], include_cols: list[str], ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: # Get the image info @@ -233,7 +233,7 @@ def zonal_stats_band( stats_df = exactextract.exact_extract( rast=raster_path, vec=vector_proj_path, - ops=get_ops(stats), + ops=stats, include_geom=False, output="pandas", include_cols=include_cols, @@ -251,7 +251,7 @@ def zonal_stats_band_tofile( output_paths: dict[str, Path], bands: list[str], tmp_dir: Path, - stats: dict[str, Any], + stats: list[str], include_cols: list[str], force: bool = False, ) -> dict[str, Path]: @@ -276,12 +276,13 @@ def zonal_stats_band_tofile( index = raster_info.bands[band].band_index band_columns = include_cols.copy() band_columns.extend( - [f"band_{index}_{stat}" for stat in stats["spatial_aggregations"]] + [f"band_{index}_{stat}" for stat in [stat.split("(")[0] for stat in stats]] ) band_stats_df = stats_df[band_columns].copy() band_stats_df.rename( columns={ - f"band_{index}_{stat}": stat for stat in stats["spatial_aggregations"] + f"band_{index}_{stat}": stat + for stat in [stat.split("(")[0] for stat in stats] }, inplace=True, ) @@ -297,11 +298,8 @@ def zonal_stats_band_tofile( pdh.to_file(df=band_stats_df, path=output_paths[band], index=False) # Write the parameters table to the output file - 
spatial_aggregation_args = [] - for key, value in stats["spatial_aggregation_args"].items(): - spatial_aggregation_args.append([key, value]) spatial_aggregation_args_df = pd.DataFrame( - data=spatial_aggregation_args, columns=["ops", "value"] + data=stats, columns=["stats"] ) pdh.to_file( df=spatial_aggregation_args_df, @@ -312,13 +310,3 @@ def zonal_stats_band_tofile( ) return output_paths - - -def get_ops(stats: dict[str, Any]) -> list[str]: - spatial_aggregation_args = stats["spatial_aggregation_args"] - ops_params_list = [ - f"{y}={spatial_aggregation_args[y]}" for y in spatial_aggregation_args - ] - ops_params_string = ",".join(ops_params_list) - - return [f"{x}({ops_params_string})" for x in stats["spatial_aggregations"]] diff --git a/tests/test_zonal_stats_bulk.py b/tests/test_zonal_stats_bulk.py index cc651aa..08687ca 100644 --- a/tests/test_zonal_stats_bulk.py +++ b/tests/test_zonal_stats_bulk.py @@ -6,15 +6,25 @@ from cropclassification.helpers import config_helper as conf from cropclassification.helpers import pandas_helper as pdh from cropclassification.util import zonal_stats_bulk -from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_exactextract import ( - get_ops, -) from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_pyqgis import HAS_QGIS from tests.test_helper import SampleData -@pytest.mark.parametrize("engine", ["pyqgis", "rasterstats", "exactextract"]) -def test_zonal_stats_bulk(tmp_path, engine): +@pytest.mark.parametrize( + "engine, stats", + [ + ("pyqgis", ["mean", "count"]), + ("rasterstats", ["mean", "count"]), + ( + "exactextract", + [ + "mean(min_coverage_frac=0.5,coverage_weight=none)", + "count(min_coverage_frac=0.5,coverage_weight=none)", + ], + ), + ], +) +def test_zonal_stats_bulk(tmp_path, engine, stats): if engine == "pyqgis" and not HAS_QGIS: pytest.skip("QGIS is not available on this system.") @@ -48,7 +58,7 @@ def test_zonal_stats_bulk(tmp_path, engine): id_column="UID", rasters_bands=images_bands, output_dir=tmp_path, - stats=["mean", "count"], + stats=stats, engine=engine, ) @@ -62,24 +72,24 @@ def test_zonal_stats_bulk(tmp_path, engine): assert not any(result_df["mean"].isna()) -def test_spatial_aggregations(): - # Read the configuration - config_paths = [ - SampleData.config_dir / "cropgroup.ini", - SampleData.tasks_dir / "local_overrule.ini", - ] - conf.read_config( - config_paths=config_paths, - default_basedir=SampleData.marker_basedir, - ) - spatial_aggregations = conf.timeseries.getlist("spatial_aggregations") - spatial_aggregation_args = conf.timeseries.getdict("spatial_aggregation_args") +# def test_spatial_aggregations(): +# # Read the configuration +# config_paths = [ +# SampleData.config_dir / "cropgroup.ini", +# SampleData.tasks_dir / "local_overrule.ini", +# ] +# conf.read_config( +# config_paths=config_paths, +# default_basedir=SampleData.marker_basedir, +# ) +# spatial_aggregations = conf.timeseries.getlist("spatial_aggregations") +# spatial_aggregation_args = conf.timeseries.getdict("spatial_aggregation_args") - ops = get_ops( - { - "spatial_aggregations": spatial_aggregations, - "spatial_aggregation_args": spatial_aggregation_args, - } - ) +# ops = get_ops( +# { +# "spatial_aggregations": spatial_aggregations, +# "spatial_aggregation_args": spatial_aggregation_args, +# } +# ) - assert len(ops) == 6 +# assert len(ops) == 6
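Note on the parameterized operations used in the test above: exactextract accepts operation strings of the form "op(arg=value,...)", which is why patch 2 passes strings such as "mean(min_coverage_frac=0.5,coverage_weight=none)" directly in the stats list instead of going through the removed get_ops() helper. Below is a minimal sketch of how such strings can be assembled from a list of aggregations and an argument dict, mirroring what get_ops() did; the build_ops name and the __main__ demo are illustrative only and not part of the codebase.

def build_ops(aggregations: list[str], args: dict[str, str]) -> list[str]:
    """Build exactextract operation strings like "mean(min_coverage_frac=0.5)"."""
    # Serialize the argument dict once, e.g. "min_coverage_frac=0.5,coverage_weight=none".
    args_string = ",".join(f"{key}={value}" for key, value in args.items())
    if not args_string:
        # Without arguments, the plain operation names ("mean", "count", ...) are valid.
        return list(aggregations)
    # Append the same argument string to every requested aggregation.
    return [f"{agg}({args_string})" for agg in aggregations]


if __name__ == "__main__":
    ops = build_ops(
        ["mean", "count"],
        {"min_coverage_frac": "0.5", "coverage_weight": "none"},
    )
    print(ops)
    # ['mean(min_coverage_frac=0.5,coverage_weight=none)',
    #  'count(min_coverage_frac=0.5,coverage_weight=none)']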