From a4834fce4d4bbd38f6a33296b9838b8539f0bf4b Mon Sep 17 00:00:00 2001
From: KriWay
Date: Wed, 2 Oct 2024 11:07:16 +0200
Subject: [PATCH 1/2] refactor: rename variables

---
 cropclassification/general.ini | 28 +++++++++++
 .../preprocess/_timeseries_calc_openeo.py | 2 +-
 .../util/zonal_stats_bulk/__init__.py | 9 +++-
 .../_zonal_stats_bulk_exactextract.py | 46 +++++++++++++++----
 tests/test_zonal_stats_bulk.py | 37 +++++++++++++++
 5 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/cropclassification/general.ini b/cropclassification/general.ini
index 2ba6299..99bc4cd 100644
--- a/cropclassification/general.ini
+++ b/cropclassification/general.ini
@@ -147,6 +147,34 @@ max_cloudcover_pct = 15
 # The min percentage of parcels that need to have valid data for a time+sensor to use it
 min_parcels_with_data_pct = 80
 
+# spatial_aggregations to use in the zonal statistics calculation.
+spatial_aggregations = count, mean, median, std, min, max
+
+# spatial_aggregation arguments to use with exactextract.
+# https://isciences.github.io/exactextract/operations.html
+# Possible values are:
+# - coverage_weight: specifies the weight value to use for each pixel in the
+#   aggregation formulas. The following methods are available:
+#     - fraction: the default method, with the weight ranging from 0 to 1
+#     - none: the weight is always equal to 1; all pixels are given the same
+#       weight in the above calculations regardless of their coverage fraction
+#     - area_cartesian: the weight is the fraction of the pixel that is covered
+#       multiplied by its x and y resolutions
+#     - area_spherical_m2: the weight is the fraction of the pixel that is
+#       covered multiplied by a spherical approximation of the cell’s area
+#       in square meters
+#     - area_spherical_km2: the weight is the fraction of the pixel that is
+#       covered multiplied by a spherical approximation of the cell’s area
+#       in square kilometers
+# - default_value: specifies a value to be used for NODATA pixels
+#   instead of ignoring them
+# - default_weight: specifies a weighting value to be used for NODATA pixels
+#   instead of ignoring them
+# - min_coverage_frac: specifies the minimum fraction of the pixel (0 to 1)
+#   that must be covered in order for a pixel to be considered in calculations.
+#   Defaults to 0.
+spatial_aggregation_args = {"min_coverage_frac": "0.5"}
+
 # Configuration specific to the marker being calculated.
[marker] diff --git a/cropclassification/preprocess/_timeseries_calc_openeo.py b/cropclassification/preprocess/_timeseries_calc_openeo.py index 2623dc7..375afcd 100644 --- a/cropclassification/preprocess/_timeseries_calc_openeo.py +++ b/cropclassification/preprocess/_timeseries_calc_openeo.py @@ -79,7 +79,7 @@ def calculate_periodic_timeseries( id_column=conf.columns["id"], rasters_bands=images_bands, output_dir=timeseries_periodic_dir, - stats=["count", "mean", "median", "std", "min", "max"], # type: ignore[arg-type] + stats=conf.timeseries.getlist("spatial_aggregations"), engine="pyqgis", nb_parallel=nb_parallel, ) diff --git a/cropclassification/util/zonal_stats_bulk/__init__.py b/cropclassification/util/zonal_stats_bulk/__init__.py index 5b9aaa1..fc30c11 100644 --- a/cropclassification/util/zonal_stats_bulk/__init__.py +++ b/cropclassification/util/zonal_stats_bulk/__init__.py @@ -1,6 +1,8 @@ from pathlib import Path from typing import Optional, Union +from cropclassification.helpers import config_helper as conf + from ._raster_helper import * # noqa: F403 @@ -81,7 +83,12 @@ def zonal_stats( vector_path=vector_path, rasters_bands=rasters_bands, output_dir=output_dir, - stats=stats, + stats={ + "spatial_aggregations": stats, + "spatial_aggregation_args": conf.timeseries.getdict( + "spatial_aggregation_args" + ), + }, columns=[id_column], nb_parallel=nb_parallel, force=force, diff --git a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py index 19a72a5..614d47a 100644 --- a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py +++ b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py @@ -6,7 +6,7 @@ from concurrent import futures from datetime import datetime from pathlib import Path -from typing import Union +from typing import Any, Union import geopandas as gpd import pandas as pd @@ -28,7 +28,7 @@ def zonal_stats( columns: list[str], rasters_bands: list[tuple[Path, list[str]]], output_dir: Path, - stats: list[str], + stats: dict[str, Any], nb_parallel: int = -1, force: bool = False, ): @@ -212,7 +212,7 @@ def zonal_stats_band( vector_path, raster_path: Path, tmp_dir: Path, - stats: list[str], + stats: dict[str, Any], include_cols: list[str], ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: # Get the image info @@ -233,7 +233,7 @@ def zonal_stats_band( stats_df = exactextract.exact_extract( rast=raster_path, vec=vector_proj_path, - ops=stats, + ops=get_ops(stats), include_geom=False, output="pandas", include_cols=include_cols, @@ -251,7 +251,7 @@ def zonal_stats_band_tofile( output_paths: dict[str, Path], bands: list[str], tmp_dir: Path, - stats: list[str], + stats: dict[str, Any], include_cols: list[str], force: bool = False, ) -> dict[str, Path]: @@ -275,10 +275,14 @@ def zonal_stats_band_tofile( for band in bands: index = raster_info.bands[band].band_index band_columns = include_cols.copy() - band_columns.extend([f"band_{index}_{stat}" for stat in stats]) + band_columns.extend( + [f"band_{index}_{stat}" for stat in stats["spatial_aggregations"]] + ) band_stats_df = stats_df[band_columns].copy() band_stats_df.rename( - columns={f"band_{index}_{stat}": stat for stat in stats}, + columns={ + f"band_{index}_{stat}": stat for stat in stats["spatial_aggregations"] + }, inplace=True, ) # Add fid column to the beginning of the dataframe @@ -289,6 +293,32 @@ def zonal_stats_band_tofile( f"Write data for {len(band_stats_df.index)} parcels found to 
{output_paths[band]}" # noqa: E501 ) if not output_paths[band].exists(): - pdh.to_file(band_stats_df, output_paths[band], index=False) + # Write the info table to the output file + pdh.to_file(df=band_stats_df, path=output_paths[band], index=False) + + # Write the parameters table to the output file + spatial_aggregation_args = [] + for key, value in stats["spatial_aggregation_args"].items(): + spatial_aggregation_args.append([key, value]) + spatial_aggregation_args_df = pd.DataFrame( + data=spatial_aggregation_args, columns=["ops", "value"] + ) + pdh.to_file( + df=spatial_aggregation_args_df, + path=output_paths[band], + table_name="params", + index=False, + append=True, + ) return output_paths + + +def get_ops(stats: dict[str, Any]) -> list[str]: + spatial_aggregation_args = stats["spatial_aggregation_args"] + ops_params_list = [ + f"{y}={spatial_aggregation_args[y]}" for y in spatial_aggregation_args + ] + ops_params_string = ",".join(ops_params_list) + + return [f"{x}({ops_params_string})" for x in stats["spatial_aggregations"]] diff --git a/tests/test_zonal_stats_bulk.py b/tests/test_zonal_stats_bulk.py index 530bbde..cc651aa 100644 --- a/tests/test_zonal_stats_bulk.py +++ b/tests/test_zonal_stats_bulk.py @@ -3,8 +3,12 @@ import geofileops as gfo import pytest +from cropclassification.helpers import config_helper as conf from cropclassification.helpers import pandas_helper as pdh from cropclassification.util import zonal_stats_bulk +from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_exactextract import ( + get_ops, +) from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_pyqgis import HAS_QGIS from tests.test_helper import SampleData @@ -19,6 +23,16 @@ def test_zonal_stats_bulk(tmp_path, engine): test_dir = tmp_path / sample_dir.name shutil.copytree(sample_dir, test_dir) + # Read the configuration + config_paths = [ + SampleData.config_dir / "cropgroup.ini", + SampleData.tasks_dir / "local_overrule.ini", + ] + conf.read_config( + config_paths=config_paths, + default_basedir=SampleData.marker_basedir, + ) + # Make sure the s2-agri input file was copied test_image_roi_dir = test_dir / SampleData.image_dir.name / SampleData.roi_name test_s1_asc_dir = test_image_roi_dir / SampleData.image_s1_asc_path.parent.name @@ -46,3 +60,26 @@ def test_zonal_stats_bulk(tmp_path, engine): assert len(result_df) == vector_info.featurecount # The calculates stats should not be nan for any row. 
     assert not any(result_df["mean"].isna())
+
+
+def test_spatial_aggregations():
+    # Read the configuration
+    config_paths = [
+        SampleData.config_dir / "cropgroup.ini",
+        SampleData.tasks_dir / "local_overrule.ini",
+    ]
+    conf.read_config(
+        config_paths=config_paths,
+        default_basedir=SampleData.marker_basedir,
+    )
+    spatial_aggregations = conf.timeseries.getlist("spatial_aggregations")
+    spatial_aggregation_args = conf.timeseries.getdict("spatial_aggregation_args")
+
+    ops = get_ops(
+        {
+            "spatial_aggregations": spatial_aggregations,
+            "spatial_aggregation_args": spatial_aggregation_args,
+        }
+    )
+
+    assert len(ops) == 6

From cd91a8ada92aec60236e14190789f8442bd6c116 Mon Sep 17 00:00:00 2001
From: KriWay
Date: Mon, 14 Oct 2024 08:53:45 +0200
Subject: [PATCH 2/2] refactor aggregation parameters

---
 cropclassification/general.ini | 28 ---------
 .../preprocess/_timeseries_calc_openeo.py | 2 +-
 .../util/zonal_stats_bulk/__init__.py | 9 +--
 .../_zonal_stats_bulk_exactextract.py | 30 +++------
 tests/test_zonal_stats_bulk.py | 60 +++++++++++--------
 5 files changed, 46 insertions(+), 83 deletions(-)

diff --git a/cropclassification/general.ini b/cropclassification/general.ini
index 99bc4cd..2ba6299 100644
--- a/cropclassification/general.ini
+++ b/cropclassification/general.ini
@@ -147,34 +147,6 @@ max_cloudcover_pct = 15
 # The min percentage of parcels that need to have valid data for a time+sensor to use it
 min_parcels_with_data_pct = 80
 
-# spatial_aggregations to use in the zonal statistics calculation.
-spatial_aggregations = count, mean, median, std, min, max
-
-# spatial_aggregation arguments to use with exactextract.
-# https://isciences.github.io/exactextract/operations.html
-# Possible values are:
-# - coverage_weight: specifies the weight value to use for each pixel in the
-#   aggregation formulas. The following methods are available:
-#     - fraction: the default method, with the weight ranging from 0 to 1
-#     - none: the weight is always equal to 1; all pixels are given the same
-#       weight in the above calculations regardless of their coverage fraction
-#     - area_cartesian: the weight is the fraction of the pixel that is covered
-#       multiplied by its x and y resolutions
-#     - area_spherical_m2: the weight is the fraction of the pixel that is
-#       covered multiplied by a spherical approximation of the cell’s area
-#       in square meters
-#     - area_spherical_km2: the weight is the fraction of the pixel that is
-#       covered multiplied by a spherical approximation of the cell’s area
-#       in square kilometers
-# - default_value: specifies a value to be used for NODATA pixels
-#   instead of ignoring them
-# - default_weight: specifies a weighting value to be used for NODATA pixels
-#   instead of ignoring them
-# - min_coverage_frac: specifies the minimum fraction of the pixel (0 to 1)
-#   that must be covered in order for a pixel to be considered in calculations.
-#   Defaults to 0.
-spatial_aggregation_args = {"min_coverage_frac": "0.5"}
-
 # Configuration specific to the marker being calculated.
[marker] diff --git a/cropclassification/preprocess/_timeseries_calc_openeo.py b/cropclassification/preprocess/_timeseries_calc_openeo.py index 375afcd..a78fefe 100644 --- a/cropclassification/preprocess/_timeseries_calc_openeo.py +++ b/cropclassification/preprocess/_timeseries_calc_openeo.py @@ -79,7 +79,7 @@ def calculate_periodic_timeseries( id_column=conf.columns["id"], rasters_bands=images_bands, output_dir=timeseries_periodic_dir, - stats=conf.timeseries.getlist("spatial_aggregations"), + stats=["count", "mean", "median", "std", "min", "max"], engine="pyqgis", nb_parallel=nb_parallel, ) diff --git a/cropclassification/util/zonal_stats_bulk/__init__.py b/cropclassification/util/zonal_stats_bulk/__init__.py index fc30c11..5b9aaa1 100644 --- a/cropclassification/util/zonal_stats_bulk/__init__.py +++ b/cropclassification/util/zonal_stats_bulk/__init__.py @@ -1,8 +1,6 @@ from pathlib import Path from typing import Optional, Union -from cropclassification.helpers import config_helper as conf - from ._raster_helper import * # noqa: F403 @@ -83,12 +81,7 @@ def zonal_stats( vector_path=vector_path, rasters_bands=rasters_bands, output_dir=output_dir, - stats={ - "spatial_aggregations": stats, - "spatial_aggregation_args": conf.timeseries.getdict( - "spatial_aggregation_args" - ), - }, + stats=stats, columns=[id_column], nb_parallel=nb_parallel, force=force, diff --git a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py index 614d47a..c18430d 100644 --- a/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py +++ b/cropclassification/util/zonal_stats_bulk/_zonal_stats_bulk_exactextract.py @@ -6,7 +6,7 @@ from concurrent import futures from datetime import datetime from pathlib import Path -from typing import Any, Union +from typing import Union import geopandas as gpd import pandas as pd @@ -28,7 +28,7 @@ def zonal_stats( columns: list[str], rasters_bands: list[tuple[Path, list[str]]], output_dir: Path, - stats: dict[str, Any], + stats: list[str], nb_parallel: int = -1, force: bool = False, ): @@ -212,7 +212,7 @@ def zonal_stats_band( vector_path, raster_path: Path, tmp_dir: Path, - stats: dict[str, Any], + stats: list[str], include_cols: list[str], ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: # Get the image info @@ -233,7 +233,7 @@ def zonal_stats_band( stats_df = exactextract.exact_extract( rast=raster_path, vec=vector_proj_path, - ops=get_ops(stats), + ops=stats, include_geom=False, output="pandas", include_cols=include_cols, @@ -251,7 +251,7 @@ def zonal_stats_band_tofile( output_paths: dict[str, Path], bands: list[str], tmp_dir: Path, - stats: dict[str, Any], + stats: list[str], include_cols: list[str], force: bool = False, ) -> dict[str, Path]: @@ -276,12 +276,13 @@ def zonal_stats_band_tofile( index = raster_info.bands[band].band_index band_columns = include_cols.copy() band_columns.extend( - [f"band_{index}_{stat}" for stat in stats["spatial_aggregations"]] + [f"band_{index}_{stat}" for stat in [stat.split("(")[0] for stat in stats]] ) band_stats_df = stats_df[band_columns].copy() band_stats_df.rename( columns={ - f"band_{index}_{stat}": stat for stat in stats["spatial_aggregations"] + f"band_{index}_{stat}": stat + for stat in [stat.split("(")[0] for stat in stats] }, inplace=True, ) @@ -297,11 +298,8 @@ def zonal_stats_band_tofile( pdh.to_file(df=band_stats_df, path=output_paths[band], index=False) # Write the parameters table to the output file - 
spatial_aggregation_args = [] - for key, value in stats["spatial_aggregation_args"].items(): - spatial_aggregation_args.append([key, value]) spatial_aggregation_args_df = pd.DataFrame( - data=spatial_aggregation_args, columns=["ops", "value"] + data=stats, columns=["stats"] ) pdh.to_file( df=spatial_aggregation_args_df, @@ -312,13 +310,3 @@ def zonal_stats_band_tofile( ) return output_paths - - -def get_ops(stats: dict[str, Any]) -> list[str]: - spatial_aggregation_args = stats["spatial_aggregation_args"] - ops_params_list = [ - f"{y}={spatial_aggregation_args[y]}" for y in spatial_aggregation_args - ] - ops_params_string = ",".join(ops_params_list) - - return [f"{x}({ops_params_string})" for x in stats["spatial_aggregations"]] diff --git a/tests/test_zonal_stats_bulk.py b/tests/test_zonal_stats_bulk.py index cc651aa..08687ca 100644 --- a/tests/test_zonal_stats_bulk.py +++ b/tests/test_zonal_stats_bulk.py @@ -6,15 +6,25 @@ from cropclassification.helpers import config_helper as conf from cropclassification.helpers import pandas_helper as pdh from cropclassification.util import zonal_stats_bulk -from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_exactextract import ( - get_ops, -) from cropclassification.util.zonal_stats_bulk._zonal_stats_bulk_pyqgis import HAS_QGIS from tests.test_helper import SampleData -@pytest.mark.parametrize("engine", ["pyqgis", "rasterstats", "exactextract"]) -def test_zonal_stats_bulk(tmp_path, engine): +@pytest.mark.parametrize( + "engine, stats", + [ + ("pyqgis", ["mean", "count"]), + ("rasterstats", ["mean", "count"]), + ( + "exactextract", + [ + "mean(min_coverage_frac=0.5,coverage_weight=none)", + "count(min_coverage_frac=0.5,coverage_weight=none)", + ], + ), + ], +) +def test_zonal_stats_bulk(tmp_path, engine, stats): if engine == "pyqgis" and not HAS_QGIS: pytest.skip("QGIS is not available on this system.") @@ -48,7 +58,7 @@ def test_zonal_stats_bulk(tmp_path, engine): id_column="UID", rasters_bands=images_bands, output_dir=tmp_path, - stats=["mean", "count"], + stats=stats, engine=engine, ) @@ -62,24 +72,24 @@ def test_zonal_stats_bulk(tmp_path, engine): assert not any(result_df["mean"].isna()) -def test_spatial_aggregations(): - # Read the configuration - config_paths = [ - SampleData.config_dir / "cropgroup.ini", - SampleData.tasks_dir / "local_overrule.ini", - ] - conf.read_config( - config_paths=config_paths, - default_basedir=SampleData.marker_basedir, - ) - spatial_aggregations = conf.timeseries.getlist("spatial_aggregations") - spatial_aggregation_args = conf.timeseries.getdict("spatial_aggregation_args") +# def test_spatial_aggregations(): +# # Read the configuration +# config_paths = [ +# SampleData.config_dir / "cropgroup.ini", +# SampleData.tasks_dir / "local_overrule.ini", +# ] +# conf.read_config( +# config_paths=config_paths, +# default_basedir=SampleData.marker_basedir, +# ) +# spatial_aggregations = conf.timeseries.getlist("spatial_aggregations") +# spatial_aggregation_args = conf.timeseries.getdict("spatial_aggregation_args") - ops = get_ops( - { - "spatial_aggregations": spatial_aggregations, - "spatial_aggregation_args": spatial_aggregation_args, - } - ) +# ops = get_ops( +# { +# "spatial_aggregations": spatial_aggregations, +# "spatial_aggregation_args": spatial_aggregation_args, +# } +# ) - assert len(ops) == 6 +# assert len(ops) == 6
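Note on the parameterized operations used in the test above: exactextract accepts operation strings of the form "op(arg=value,...)", which is why patch 2 passes strings such as "mean(min_coverage_frac=0.5,coverage_weight=none)" directly in the stats list instead of going through the removed get_ops() helper. Below is a minimal sketch of how such strings can be assembled from a list of aggregations and an argument dict, mirroring what get_ops() did; the build_ops name and the __main__ demo are illustrative only and not part of the codebase.

def build_ops(aggregations: list[str], args: dict[str, str]) -> list[str]:
    """Build exactextract operation strings like "mean(min_coverage_frac=0.5)"."""
    # Serialize the argument dict once, e.g. "min_coverage_frac=0.5,coverage_weight=none".
    args_string = ",".join(f"{key}={value}" for key, value in args.items())
    if not args_string:
        # Without arguments, the plain operation names ("mean", "count", ...) are valid.
        return list(aggregations)
    # Append the same argument string to every requested aggregation.
    return [f"{agg}({args_string})" for agg in aggregations]


if __name__ == "__main__":
    ops = build_ops(
        ["mean", "count"],
        {"min_coverage_frac": "0.5", "coverage_weight": "none"},
    )
    print(ops)
    # ['mean(min_coverage_frac=0.5,coverage_weight=none)',
    #  'count(min_coverage_frac=0.5,coverage_weight=none)']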