From c5d564fdff39c35d899d9c1c3efcf6b586bfdbd4 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Fri, 23 Aug 2024 17:12:35 +0100 Subject: [PATCH 01/18] implement compute_speed and compute_path_length functions --- movement/kinematics.py | 130 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/movement/kinematics.py b/movement/kinematics.py index a6e0b0b9..e4f91c1a 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -173,6 +173,31 @@ def compute_time_derivative(data: xr.DataArray, order: int) -> xr.DataArray: return result +def compute_speed(data: xr.DataArray) -> xr.DataArray: + """Compute instantaneous speed at each time point. + + Speed is a scalar quantity computed as the Euclidean norm (magnitude) + of the velocity vector at each time point. + + + Parameters + ---------- + data : xarray.DataArray + The input data containing position information in Cartesian + coordinates, with ``time`` and ``space`` as dimensions. + + Returns + ------- + xarray.DataArray + An xarray DataArray containing the computed speed. Will have + the same dimensions as the input data, except for ``space`` + which will be removed. + + """ + _validate_type_data_array(data) + return compute_norm(compute_velocity(data)) + + def compute_forward_vector( data: xr.DataArray, left_keypoint: str, @@ -675,3 +700,108 @@ def _validate_type_data_array(data: xr.DataArray) -> None: TypeError, f"Input data must be an xarray.DataArray, but got {type(data)}.", ) + + +def compute_path_length( + data: xr.DataArray, + start: float | None = None, + stop: float | None = None, +) -> xr.DataArray: + """Compute the length of a path travelled between two time points. + + The path length is defined as the sum of the norms (magnitudes) of the + displacement vectors between two time points ``start`` and ``stop``, + which should be provided in the time units of the data array. + If not specified, the minimum and maximum time points in the data array + are used as start and stop times, respectively. + + Parameters + ---------- + data : xarray.DataArray + The input data containing position information in Cartesian + coordinates, with ``time`` and ``space`` as dimensions. + start : float, optional + The time point to consider as the start of a path. + If None (default), the minimum time point in the data is used. + stop : float, optional + The time point to consider as the end of a path. + If None (default), the maximum time point in the data is used. + + Returns + ------- + xarray.DataArray + An xarray DataArray containing the computed path length. + Will have the same dimensions as the input data, except for ``time`` + and ``space`` which will be removed. + + """ + # Validate input data type and dimensions + _validate_type_data_array(data) + # We choose to validate the time dimension here, despite the fact that + # it will be also validated later in the compute_displacement function. + # This is because we rely on the time dimension for start and stop values. + validate_dims_coords(data, {"time": []}) + # Now validate the start and stop times + _validate_start_stop_times(data, start, stop) + + # Handle the case where the start or stop times are not provided + start = data.time.min() if start is None else start + stop = data.time.max() if stop is None else stop + + # Compute the sum of the displacement norms in the given time range + selected_data = data.sel(time=slice(start, stop)) + displacement_norm = compute_norm(compute_displacement(selected_data)) + return displacement_norm.sum(dim="time") + + +def _validate_start_stop_times( + data: xr.DataArray, + start: int | float | None, + stop: int | float | None, +) -> None: + """Validate the start and stop times for path length computation. + + Parameters + ---------- + data: xarray.DataArray + The input data array containing position information. + start : float or None + The start time point for path length computation. + stop : float or None + The stop time point for path length computation. + + Raises + ------ + TypeError + If the start or stop time is not numeric. + ValueError + If either of the provided times is outside the time range of the data, + or if the start time is later than the stop time. + + """ + provided_time_points = {"start time": start, "stop time": stop} + expected_time_range = (data.time.min(), data.time.max()) + + for name, value in provided_time_points.items(): + if value is None: # Skip if the time point is not provided + continue + # Check that the provided value is numeric + if not isinstance(value, int | float): + raise log_error( + TypeError, + f"Expected a numeric value for {name}, but got {type(value)}.", + ) + # Check that the provided value is within the time range of the data + if value < expected_time_range[0] or value > expected_time_range[1]: + raise log_error( + ValueError, + f"The provided {name} {value} is outside the time range " + f"of the data array ({expected_time_range}).", + ) + + # Check that the start time is earlier than the stop time + if start is not None and stop is not None and start >= stop: + raise log_error( + ValueError, + "The start time must be earlier than the stop time.", + ) From e2acd6a3ad01a09eec6305a04c0983f5e106cea7 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Tue, 8 Oct 2024 16:26:24 +0100 Subject: [PATCH 02/18] added speed to existing kinematics unit test --- movement/kinematics.py | 1 - tests/test_unit/test_kinematics.py | 33 ++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index e4f91c1a..f826a274 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -194,7 +194,6 @@ def compute_speed(data: xr.DataArray) -> xr.DataArray: which will be removed. """ - _validate_type_data_array(data) return compute_norm(compute_velocity(data)) diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index 18cfa2db..622cc164 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -43,6 +43,13 @@ np.zeros((10, 2)), # Individual 1 ], ), + ( + "speed", # magnitude of velocity + [ + np.ones(10) * np.sqrt(2), # Individual 0 + np.ones(10) * np.sqrt(2), # Individual 1 + ], + ), ], ) def test_kinematics_uniform_linear_motion( @@ -74,19 +81,24 @@ def test_kinematics_uniform_linear_motion( position ) + # Figure out which dimensions to expect in kinematic_array + # and in the final xarray.DataArray + expected_dims = ["time", "individuals"] + if kinematic_variable in ["displacement", "velocity", "acceleration"]: + expected_dims.append("space") + # Build expected data array from the expected numpy array expected_array = xr.DataArray( - np.stack(expected_kinematics, axis=1), # Stack along the "individuals" axis - dims=["time", "individuals", "space"], + np.stack(expected_kinematics, axis=1), + dims=expected_dims, ) if "keypoints" in position.coords: expected_array = expected_array.expand_dims( {"keypoints": position.coords["keypoints"].size} ) - expected_array = expected_array.transpose( - "time", "individuals", "keypoints", "space" - ) + expected_dims.insert(2, "keypoints") + expected_array = expected_array.transpose(*expected_dims) # Compare the values of the kinematic_array against the expected_array np.testing.assert_allclose(kinematic_array.values, expected_array.values) @@ -105,6 +117,7 @@ def test_kinematics_uniform_linear_motion( ("displacement", [5, 0]), # individual 0, individual 1 ("velocity", [6, 0]), ("acceleration", [7, 0]), + ("speed", [6, 0]), ], ) def test_kinematics_with_dataset_with_nans( @@ -134,10 +147,13 @@ def test_kinematics_with_dataset_with_nans( ] # expected nans per individual adjusted for space and keypoints dimensions + if "space" in kinematic_array.dims: + n_space_dims = position.sizes["space"] + else: + n_space_dims = 1 + expected_nans_adjusted = [ - n - * valid_dataset.sizes["space"] - * valid_dataset.sizes.get("keypoints", 1) + n * n_space_dims * valid_dataset.sizes.get("keypoints", 1) for n in expected_nans_per_individual ] # check number of nans per individual is as expected in kinematic array @@ -163,6 +179,7 @@ def test_kinematics_with_dataset_with_nans( "displacement", "velocity", "acceleration", + "speed", ], ) def test_kinematics_with_invalid_dataset( From 6549ada1633c4df67e5b8506014fffd0dcb44d76 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Wed, 9 Oct 2024 19:14:09 +0100 Subject: [PATCH 03/18] rewrote compute_path_length with various nan policies --- movement/kinematics.py | 96 +++++++++++++++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 21 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index f826a274..3a58dc28 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -7,7 +7,7 @@ import xarray as xr from scipy.spatial.distance import cdist -from movement.utils.logging import log_error +from movement.utils.logging import log_error, log_warning from movement.utils.vector import compute_norm from movement.validators.arrays import validate_dims_coords @@ -705,26 +705,44 @@ def compute_path_length( data: xr.DataArray, start: float | None = None, stop: float | None = None, + nan_policy: Literal["drop", "scale"] = "drop", + nan_warn_threshold: float = 0.2, ) -> xr.DataArray: """Compute the length of a path travelled between two time points. The path length is defined as the sum of the norms (magnitudes) of the displacement vectors between two time points ``start`` and ``stop``, which should be provided in the time units of the data array. - If not specified, the minimum and maximum time points in the data array - are used as start and stop times, respectively. + If not specified, the minimum and maximum time coordinates of the data + array are used as start and stop times, respectively. Parameters ---------- data : xarray.DataArray The input data containing position information in Cartesian - coordinates, with ``time`` and ``space`` as dimensions. + coordinates, with ``time`` and ``space`` among the dimensions. start : float, optional - The time point to consider as the start of a path. - If None (default), the minimum time point in the data is used. + The time to consider as the path's starting point. If None (default), + the minimum time coordinate in the data is used. stop : float, optional - The time point to consider as the end of a path. - If None (default), the maximum time point in the data is used. + The time to consider as the path's end point. If None (default), + the maximum time coordinate in the data is used. + nan_policy : str, optional + Policy to handle NaN (missing) values. Can be one of the following: + - ``"drop"``: drop any NaN values before computing path length. This + is the default behavior, and it equates to assuming that a track + follows a straight line between two valid points flanking a missing + segment. This approach tends to underestimate the path length, + and the error increases with the number of missing values. + - ``"scale"``: scale path length based on the proportion of valid + values per point track. For example, if only 80% of the values are + present, the path length will be computed based on these values, + and the result will be multiplied by 1/0.8 = 1.25. This approach + assumes that the point's dynamics are similar across present + and missing time segments, which may not be the case. + nan_warn_threshold : float, optional + If more than this proportion of values are missing in any point track, + a warning will be emitted. Defaults to 0.2 (20%). Returns ------- @@ -734,23 +752,59 @@ def compute_path_length( and ``space`` which will be removed. """ - # Validate input data type and dimensions - _validate_type_data_array(data) - # We choose to validate the time dimension here, despite the fact that - # it will be also validated later in the compute_displacement function. - # This is because we rely on the time dimension for start and stop values. + # We validate the time dimension here, on top of its later validation + # inside compute_displacement, because we rely on it for start/stop times. validate_dims_coords(data, {"time": []}) - # Now validate the start and stop times _validate_start_stop_times(data, start, stop) - # Handle the case where the start or stop times are not provided - start = data.time.min() if start is None else start - stop = data.time.max() if stop is None else stop + # Select data within the specified time range + data = data.sel(time=slice(start, stop)) + + # Emit a warning for point tracks with many missing values + nan_counts = data.isnull().any(dim="space").sum(dim="time") + dims_to_stack = [dim for dim in data.dims if dim not in ["time", "space"]] + # Stack individual and keypoints dims into a single 'tracks' dimension + stacked_nan_counts = nan_counts.stack(tracks=dims_to_stack) + tracks_with_warning = stacked_nan_counts.where( + stacked_nan_counts > nan_warn_threshold, drop=True + ).tracks.values + if len(tracks_with_warning) > 0: + log_warning( + "The following point tracks have more than " + f"{nan_warn_threshold * 100}% missing values, which may lead to " + "unreliable path length estimates: " + f"{', '.join(tracks_with_warning)}." + ) + + if nan_policy == "drop": + stacked_data = data.stack(tracks=dims_to_stack) + # Create an empty data array to hold the path length for each track + stacked_path_length = xr.zeros_like(stacked_nan_counts) + # Compute path length for each track + for track_name in stacked_data.tracks: + track_data = stacked_data.sel(tracks=track_name, drop=True).dropna( + dim="time", how="any" + ) + stacked_path_length.loc[track_name] = compute_norm( + compute_displacement(track_data) + ).sum(dim="time") + # Return the unstacked path length (restore individual and keypoints) + return stacked_path_length.unstack("tracks") + + elif nan_policy == "scale": + valid_path_length = compute_norm(compute_displacement(data)).sum( + dim="time", + skipna=True, # path length only for valid points + ) + scale_factor = 1 / (1 - nan_counts / data.sizes["time"]) + return valid_path_length * scale_factor - # Compute the sum of the displacement norms in the given time range - selected_data = data.sel(time=slice(start, stop)) - displacement_norm = compute_norm(compute_displacement(selected_data)) - return displacement_norm.sum(dim="time") + else: + raise log_error( + ValueError, + f"Invalid value for nan_policy: {nan_policy}. " + "Must be one of 'drop' or 'weight'.", + ) def _validate_start_stop_times( From db08b033eeda33a41f71d5b3033ce73be8cd689d Mon Sep 17 00:00:00 2001 From: niksirbi Date: Wed, 9 Oct 2024 19:20:57 +0100 Subject: [PATCH 04/18] unit test compute_path_length across time ranges --- tests/test_unit/test_kinematics.py | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index 622cc164..0dd910b3 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -1,4 +1,5 @@ import re +from contextlib import nullcontext as does_not_raise import numpy as np import pytest @@ -203,6 +204,59 @@ def test_approximate_derivative_with_invalid_order(order): kinematics.compute_time_derivative(data, order=order) +@pytest.mark.parametrize( + "start, stop, expected_exception", + [ + # full time ranges + (None, None, does_not_raise()), + (0, None, does_not_raise()), + (None, 9, does_not_raise()), + (0, 9, does_not_raise()), + # partial time ranges + (1, 8, does_not_raise()), + (1.5, 8.5, does_not_raise()), + (2, None, does_not_raise()), + (None, 6.3, does_not_raise()), + # invalid time ranges + (0, 10, pytest.raises(ValueError)), # stop > n_frames + (-1, 9, pytest.raises(ValueError)), # start < 0 + (9, 0, pytest.raises(ValueError)), # start > stop + ("text", 9, pytest.raises(TypeError)), # start is not a number + (0, [0, 1], pytest.raises(TypeError)), # stop is not a number + ], +) +def test_compute_path_length_across_time_ranges( + valid_poses_dataset_uniform_linear_motion, + start, + stop, + expected_exception, +): + """Test that the path length is computed correctly for a uniform linear + motion case. + """ + position = valid_poses_dataset_uniform_linear_motion.position + with expected_exception: + path_length = kinematics.compute_path_length( + position, start=start, stop=stop, nan_policy="scale" + ) + # Expected number of steps (displacements) in selected time range + num_steps = 9 # full time range: 10 frames - 1 + if start is not None: + num_steps -= np.ceil(start) + if stop is not None: + num_steps -= 9 - np.floor(stop) + # Each step has a magnitude of sqrt(2) in x-y space + expected_path_length = xr.DataArray( + np.ones((2, 3)) * np.sqrt(2) * num_steps, + dims=["individuals", "keypoints"], + coords={ + "individuals": position.coords["individuals"], + "keypoints": position.coords["keypoints"], + }, + ) + xr.testing.assert_allclose(path_length, expected_path_length) + + @pytest.fixture def valid_data_array_for_forward_vector(): """Return a position data array for an individual with 3 keypoints From dff35e33e83aff5e60f0b90c2ccc54de3b631c7d Mon Sep 17 00:00:00 2001 From: niksirbi Date: Thu, 10 Oct 2024 16:44:09 +0100 Subject: [PATCH 05/18] fixed and refactor compute_path_length and its tests --- movement/kinematics.py | 187 +++++++++++++++++++-------- pyproject.toml | 1 + tests/conftest.py | 34 +++++ tests/test_unit/test_kinematics.py | 194 ++++++++++++++++++++++++++--- 4 files changed, 352 insertions(+), 64 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index 3a58dc28..e8dc1dc2 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -8,6 +8,7 @@ from scipy.spatial.distance import cdist from movement.utils.logging import log_error, log_warning +from movement.utils.reports import report_nan_values from movement.utils.vector import compute_norm from movement.validators.arrays import validate_dims_coords @@ -732,13 +733,15 @@ def compute_path_length( - ``"drop"``: drop any NaN values before computing path length. This is the default behavior, and it equates to assuming that a track follows a straight line between two valid points flanking a missing - segment. This approach tends to underestimate the path length, - and the error increases with the number of missing values. + segment. Missing segments at the beginning or end of the specified + time range are not counted. This approach tends to underestimate + the path length, and the error increases with the number of missing + values. - ``"scale"``: scale path length based on the proportion of valid - values per point track. For example, if only 80% of the values are - present, the path length will be computed based on these values, - and the result will be multiplied by 1/0.8 = 1.25. This approach - assumes that the point's dynamics are similar across present + segments per point track. For example, if only 80% of segments + are present, the path length will be computed based on these + and the result will be divided by 0.8. This approach + assumes that motion dynamics are similar across present and missing time segments, which may not be the case. nan_warn_threshold : float, optional If more than this proportion of values are missing in any point track, @@ -752,53 +755,15 @@ def compute_path_length( and ``space`` which will be removed. """ - # We validate the time dimension here, on top of its later validation - # inside compute_displacement, because we rely on it for start/stop times. - validate_dims_coords(data, {"time": []}) _validate_start_stop_times(data, start, stop) - - # Select data within the specified time range data = data.sel(time=slice(start, stop)) - - # Emit a warning for point tracks with many missing values - nan_counts = data.isnull().any(dim="space").sum(dim="time") - dims_to_stack = [dim for dim in data.dims if dim not in ["time", "space"]] - # Stack individual and keypoints dims into a single 'tracks' dimension - stacked_nan_counts = nan_counts.stack(tracks=dims_to_stack) - tracks_with_warning = stacked_nan_counts.where( - stacked_nan_counts > nan_warn_threshold, drop=True - ).tracks.values - if len(tracks_with_warning) > 0: - log_warning( - "The following point tracks have more than " - f"{nan_warn_threshold * 100}% missing values, which may lead to " - "unreliable path length estimates: " - f"{', '.join(tracks_with_warning)}." - ) + _warn_about_nan_proportion(data, nan_warn_threshold) if nan_policy == "drop": - stacked_data = data.stack(tracks=dims_to_stack) - # Create an empty data array to hold the path length for each track - stacked_path_length = xr.zeros_like(stacked_nan_counts) - # Compute path length for each track - for track_name in stacked_data.tracks: - track_data = stacked_data.sel(tracks=track_name, drop=True).dropna( - dim="time", how="any" - ) - stacked_path_length.loc[track_name] = compute_norm( - compute_displacement(track_data) - ).sum(dim="time") - # Return the unstacked path length (restore individual and keypoints) - return stacked_path_length.unstack("tracks") + return _compute_path_length_drop_nan(data) elif nan_policy == "scale": - valid_path_length = compute_norm(compute_displacement(data)).sum( - dim="time", - skipna=True, # path length only for valid points - ) - scale_factor = 1 / (1 - nan_counts / data.sizes["time"]) - return valid_path_length * scale_factor - + return _compute_scaled_path_length(data) else: raise log_error( ValueError, @@ -828,10 +793,15 @@ def _validate_start_stop_times( TypeError If the start or stop time is not numeric. ValueError - If either of the provided times is outside the time range of the data, - or if the start time is later than the stop time. + If the time dimension is missing, if either of the provided times is + outside the data time range, or if the start time is later than the + stop time. """ + # We validate the time dimension here, on top of any validation that may + # occur downstream, because we rely on it for start/stop times. + validate_dims_coords(data, {"time": []}) + provided_time_points = {"start time": start, "stop time": stop} expected_time_range = (data.time.min(), data.time.max()) @@ -858,3 +828,122 @@ def _validate_start_stop_times( ValueError, "The start time must be earlier than the stop time.", ) + + +def _warn_about_nan_proportion( + data: xr.DataArray, nan_warn_threshold: float +) -> None: + """Print a warning if the proportion of NaN values exceeds a threshold. + + The NaN proportion is evaluated per point track, and a given point is + considered NaN if any of its ``space`` coordinates are NaN. The warning + specifically lists the point tracks that exceed the threshold. + + Parameters + ---------- + data : xarray.DataArray + The input data array. + nan_warn_threshold : float + The threshold for the proportion of NaN values. Must be a number + between 0 and 1. + + """ + nan_warn_threshold = float(nan_warn_threshold) + if nan_warn_threshold < 0 or nan_warn_threshold > 1: + raise log_error( + ValueError, + "nan_warn_threshold must be a number between 0 and 1.", + ) + + n_nans = data.isnull().any(dim="space").sum(dim="time") + data_to_warn_about = data.where( + n_nans > data.sizes["time"] * nan_warn_threshold, drop=True + ) + if len(data_to_warn_about) > 0: + log_warning( + "The result may be unreliable for point tracks with many " + "missing values. The following tracks have more than " + f"{nan_warn_threshold * 100:.3} %) NaN values:", + ) + report_nan_values(data_to_warn_about) + + +def _compute_scaled_path_length( + data: xr.DataArray, +) -> xr.DataArray: + """Compute scaled path length based on proportion of valid segments. + + Path length is first computed based on valid segments (non-NaN values + on both ends of the segment) and then scaled based on the proportion of + valid segments per point track - i.e. the result is divided by the + proportion of valid segments. + + Parameters + ---------- + data : xarray.DataArray + The input data containing position information in Cartesian + coordinates, with ``time`` and ``space`` among the dimensions. + + Returns + ------- + xarray.DataArray + An xarray DataArray containing the computed path length. + Will have the same dimensions as the input data, except for ``time`` + and ``space`` which will be removed. + + """ + # Skip first displacement segment (always 0) to not mess up the scaling + displacement = compute_displacement(data).isel(time=slice(1, None)) + # count number of valid displacement segments per point track + valid_segments = (~displacement.isnull()).any(dim="space").sum(dim="time") + # compute proportion of valid segments per point track + valid_proportion = valid_segments / (data.sizes["time"] - 1) + # return scaled path length + return compute_norm(displacement).sum(dim="time") / valid_proportion + + +def _compute_path_length_drop_nan( + data: xr.DataArray, +) -> xr.DataArray: + """Compute path length by dropping NaN values before computation. + + This function iterates over point tracks, drops NaN values from each + track, and then computes the path length for the remaining valid + segments (takes the sum of the norms of the displacement vectors). + If there is no valid segment in a track, the path length for that + track will be NaN. + + Parameters + ---------- + data : xarray.DataArray + The input data containing position information in Cartesian + coordinates, with ``time`` and ``space`` among the dimensions. + + Returns + ------- + xarray.DataArray + An xarray DataArray containing the computed path length. + Will have the same dimensions as the input data, except for ``time`` + and ``space`` which will be removed. + + """ + # Create array for holding results + path_length = xr.full_like( + data.isel(time=0, space=0, drop=True), fill_value=np.nan + ) + + # Stack data to iterate over point tracks + dims_to_stack = [d for d in data.dims if d not in ["time", "space"]] + stacked_data = data.stack(tracks=dims_to_stack) + for track_name in stacked_data.tracks.values: + # Drop NaN values from current point track + track_data = stacked_data.sel(tracks=track_name, drop=True).dropna( + dim="time", how="any" + ) + # Compute path length for current point track + # and store it in the result array + target_loc = {k: v for k, v in zip(dims_to_stack, track_name)} + path_length.loc[target_loc] = compute_norm( + compute_displacement(track_data) + ).sum(dim="time", min_count=1) # returns NaN if no valid segment + return path_length diff --git a/pyproject.toml b/pyproject.toml index 27348c29..2d7b1aa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,6 +105,7 @@ fix = true ignore = [ "D203", # one blank line before class "D213", # multi-line-summary second line + "B905", # zip without explicit strict ] select = [ "E", # pycodestyle errors diff --git a/tests/conftest.py b/tests/conftest.py index 4de80c31..2a78e3a7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -518,6 +518,40 @@ def valid_poses_dataset_uniform_linear_motion( ) +@pytest.fixture +def valid_poses_dataset_uniform_linear_motion_with_nans( + valid_poses_dataset_uniform_linear_motion, +): + """Return a valid poses dataset with NaN values in the position array. + + Specifically, we will introducde: + - 1 NaN value in the centroid keypoint of individual id_1 at time=0 + - 5 NaN values in the left keypoint of individual id_1 (frames 3-7) + - 10 NaN values in the right keypoint of individual id_1 (all frames) + """ + valid_poses_dataset_uniform_linear_motion.position.loc[ + { + "individuals": "id_1", + "keypoints": "centroid", + "time": 0, + } + ] = np.nan + valid_poses_dataset_uniform_linear_motion.position.loc[ + { + "individuals": "id_1", + "keypoints": "left", + "time": slice(3, 7), + } + ] = np.nan + valid_poses_dataset_uniform_linear_motion.position.loc[ + { + "individuals": "id_1", + "keypoints": "right", + } + ] = np.nan + return valid_poses_dataset_uniform_linear_motion + + # -------------------- Invalid datasets fixtures ------------------------------ @pytest.fixture def not_a_dataset(): diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index 0dd910b3..fdf1104b 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -218,36 +218,81 @@ def test_approximate_derivative_with_invalid_order(order): (2, None, does_not_raise()), (None, 6.3, does_not_raise()), # invalid time ranges - (0, 10, pytest.raises(ValueError)), # stop > n_frames - (-1, 9, pytest.raises(ValueError)), # start < 0 - (9, 0, pytest.raises(ValueError)), # start > stop - ("text", 9, pytest.raises(TypeError)), # start is not a number - (0, [0, 1], pytest.raises(TypeError)), # stop is not a number + ( + 0, + 10, + pytest.raises( + ValueError, match="stop time 10 is outside the time range" + ), + ), + ( + -1, + 9, + pytest.raises( + ValueError, match="start time -1 is outside the time range" + ), + ), + ( + 9, + 0, + pytest.raises( + ValueError, + match="start time must be earlier than the stop time", + ), + ), + ( + "text", + 9, + pytest.raises( + TypeError, match="Expected a numeric value for start" + ), + ), + ( + 0, + [0, 1], + pytest.raises( + TypeError, match="Expected a numeric value for stop" + ), + ), ], ) -def test_compute_path_length_across_time_ranges( +@pytest.mark.parametrize( + "nan_policy", + ["drop", "scale"], # results should be same for both here +) +def test_path_length_across_time_ranges( valid_poses_dataset_uniform_linear_motion, start, stop, + nan_policy, expected_exception, ): - """Test that the path length is computed correctly for a uniform linear - motion case. + """Test path length computation for a uniform linear motion case, + across different time ranges. + + The test dataset ``valid_poses_dataset_uniform_linear_motion`` + contains 2 individuals ("id_0" and "id_1"), moving + along x=y and x=-y lines, respectively, at a constant velocity. + At each frame they cover a distance of sqrt(2) in x-y space, so in total + we expect a path length of sqrt(2) * num_segments, where num_segments is + the number of selected frames minus 1. """ position = valid_poses_dataset_uniform_linear_motion.position with expected_exception: path_length = kinematics.compute_path_length( - position, start=start, stop=stop, nan_policy="scale" + position, start=start, stop=stop, nan_policy=nan_policy ) - # Expected number of steps (displacements) in selected time range - num_steps = 9 # full time range: 10 frames - 1 + + # Expected number of segments (displacements) in selected time range + num_segments = 9 # full time range: 10 frames - 1 if start is not None: - num_steps -= np.ceil(start) + num_segments -= np.ceil(start) if stop is not None: - num_steps -= 9 - np.floor(stop) - # Each step has a magnitude of sqrt(2) in x-y space + num_segments -= 9 - np.floor(stop) + print("num_segments", num_segments) + expected_path_length = xr.DataArray( - np.ones((2, 3)) * np.sqrt(2) * num_steps, + np.ones((2, 3)) * np.sqrt(2) * num_segments, dims=["individuals", "keypoints"], coords={ "individuals": position.coords["individuals"], @@ -257,6 +302,125 @@ def test_compute_path_length_across_time_ranges( xr.testing.assert_allclose(path_length, expected_path_length) +@pytest.mark.parametrize( + "nan_policy, expected_path_lengths_id_1, expected_exception", + [ + ( + "drop", + { + # 9 segments - 1 missing on edge + "centroid": np.sqrt(2) * 8, + # missing mid frames should have no effect + "left": np.sqrt(2) * 9, + "right": np.nan, # all frames missing + }, + does_not_raise(), + ), + ( + "scale", + { + # scaling should restore "true" path length + "centroid": np.sqrt(2) * 9, + "left": np.sqrt(2) * 9, + "right": np.nan, # all frames missing + }, + does_not_raise(), + ), + ( + "invalid", # invalid value for nan_policy + {}, + pytest.raises(ValueError, match="Invalid value for nan_policy"), + ), + ], +) +def test_path_length_with_nans( + valid_poses_dataset_uniform_linear_motion_with_nans, + nan_policy, + expected_path_lengths_id_1, + expected_exception, +): + """Test path length computation for a uniform linear motion case, + with varying number of missing values per individual and keypoint. + + The test dataset ``valid_poses_dataset_uniform_linear_motion_with_nans`` + contains 2 individuals ("id_0" and "id_1"), moving + along x=y and x=-y lines, respectively, at a constant velocity. + At each frame they cover a distance of sqrt(2) in x-y space. + + Individual "id_1" has some missing values per keypoint: + - "centroid" is missing a value on the very first frame + - "left" is missing 5 values in middle frames (not at the edges) + - "right" is missing values in all frames + + Individual "id_0" has no missing values. + + Because the underlying motion is uniform linear, the "scale" policy should + perfectly restore the path length for individual "id_1" to its true value. + The "drop" policy should do likewise if frames are missing in the middle, + but will not count any missing frames at the edges. + """ + position = valid_poses_dataset_uniform_linear_motion_with_nans.position + with expected_exception: + path_length = kinematics.compute_path_length( + position, + nan_policy=nan_policy, + ) + # Initialise with expected path lengths for scenario without NaNs + expected_array = xr.DataArray( + np.ones((2, 3)) * np.sqrt(2) * 9, + dims=["individuals", "keypoints"], + coords={ + "individuals": position.coords["individuals"], + "keypoints": position.coords["keypoints"], + }, + ) + # insert expected path lengths for individual id_1 + for kpt, value in expected_path_lengths_id_1.items(): + target_loc = {"individuals": "id_1", "keypoints": kpt} + expected_array.loc[target_loc] = value + xr.testing.assert_allclose(path_length, expected_array) + + +@pytest.mark.parametrize( + "nan_warn_threshold, expected_exception", + [ + (1, does_not_raise()), + (0.2, does_not_raise()), + (-1, pytest.raises(ValueError, match="a number between 0 and 1")), + ], +) +def test_path_length_warns_about_nans( + valid_poses_dataset_uniform_linear_motion_with_nans, + nan_warn_threshold, + expected_exception, + caplog, +): + """Test that a warning is raised when the number of missing values + exceeds a given threshold. + + See the docstring of ``test_path_length_with_nans`` for details + about what's in the dataset. + """ + position = valid_poses_dataset_uniform_linear_motion_with_nans.position + with expected_exception: + kinematics.compute_path_length( + position, nan_warn_threshold=nan_warn_threshold + ) + + if (nan_warn_threshold > 0.1) and (nan_warn_threshold < 0.5): + # Make sure that a warning was emitted + assert caplog.records[0].levelname == "WARNING" + assert "The result may be unreliable" in caplog.records[0].message + # Make sure that the NaN report only mentions + # the individual and keypoint that violate the threshold + assert caplog.records[1].levelname == "INFO" + assert "Individual: id_1" in caplog.records[1].message + assert "Individual: id_2" not in caplog.records[1].message + assert "left: 5/10 (50.0%)" in caplog.records[1].message + assert "right: 10/10 (100.0%)" in caplog.records[1].message + assert "centroid" not in caplog.records[1].message + + @pytest.fixture def valid_data_array_for_forward_vector(): """Return a position data array for an individual with 3 keypoints From 7ff415b479e4cd47213b8a928e4279df06c143a3 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Thu, 10 Oct 2024 17:16:49 +0100 Subject: [PATCH 06/18] fixed docstring for compute_path_length --- movement/kinematics.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index e8dc1dc2..81954b91 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -728,21 +728,10 @@ def compute_path_length( stop : float, optional The time to consider as the path's end point. If None (default), the maximum time coordinate in the data is used. - nan_policy : str, optional - Policy to handle NaN (missing) values. Can be one of the following: - - ``"drop"``: drop any NaN values before computing path length. This - is the default behavior, and it equates to assuming that a track - follows a straight line between two valid points flanking a missing - segment. Missing segments at the beginning or end of the specified - time range are not counted. This approach tends to underestimate - the path length, and the error increases with the number of missing - values. - - ``"scale"``: scale path length based on the proportion of valid - segments per point track. For example, if only 80% of segments - are present, the path length will be computed based on these - and the result will be divided by 0.8. This approach - assumes that motion dynamics are similar across present - and missing time segments, which may not be the case. + nan_policy : Literal["drop", "scale"], optional + Policy to handle NaN (missing) values. Can be one of the ``"drop"`` + or ``"scale"``. Defaults to ``"drop"``. See Notes for more details + on the two policies. nan_warn_threshold : float, optional If more than this proportion of values are missing in any point track, a warning will be emitted. Defaults to 0.2 (20%). @@ -754,6 +743,23 @@ def compute_path_length( Will have the same dimensions as the input data, except for ``time`` and ``space`` which will be removed. + Notes + ----- + Choosing ``nan_policy="drop"`` will drop NaN values from each point track + before computing path length. This equates to assuming that a track + follows a straight line between two valid points flanking a missing + segment. Missing segments at the beginning or end of the specified + time range are not counted. This approach tends to underestimate + the path length, and the error increases with the number of missing + values. + + Choosing ``nan_policy="scale"`` will adjust the path length based on the + the proportion of valid segments per point track. For example, if only + 80% of segments are present, the path length will be computed based on + these and the result will be divided by 0.8. This approach assumes + that motion dynamics are similar across present and missing time + segments, which may not be the case. + """ _validate_start_stop_times(data, start, stop) data = data.sel(time=slice(start, stop)) From da88185cd3d8601f10eb3c120f4f617f3657ce77 Mon Sep 17 00:00:00 2001 From: Niko Sirmpilatze Date: Thu, 31 Oct 2024 09:55:46 +0000 Subject: [PATCH 07/18] Accept suggestion on docstring wording Co-authored-by: Chang Huan Lo --- movement/kinematics.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index 81954b91..2c24a471 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -184,15 +184,15 @@ def compute_speed(data: xr.DataArray) -> xr.DataArray: Parameters ---------- data : xarray.DataArray - The input data containing position information in Cartesian - coordinates, with ``time`` and ``space`` as dimensions. + The input data containing position information, with ``time`` + and ``space`` (in Cartesian coordinates) as required dimensions. Returns ------- xarray.DataArray - An xarray DataArray containing the computed speed. Will have - the same dimensions as the input data, except for ``space`` - which will be removed. + An xarray DataArray containing the computed speed, + with dimensions matching those of the input data, + except ``space`` is removed. """ return compute_norm(compute_velocity(data)) @@ -720,13 +720,13 @@ def compute_path_length( Parameters ---------- data : xarray.DataArray - The input data containing position information in Cartesian - coordinates, with ``time`` and ``space`` among the dimensions. + The input data containing position information, with ``time`` + and ``space`` (in Cartesian coordinates) as required dimensions. start : float, optional - The time to consider as the path's starting point. If None (default), + The start time of the path. If None (default), the minimum time coordinate in the data is used. stop : float, optional - The time to consider as the path's end point. If None (default), + The end time of the path. If None (default), the maximum time coordinate in the data is used. nan_policy : Literal["drop", "scale"], optional Policy to handle NaN (missing) values. Can be one of the ``"drop"`` @@ -739,15 +739,15 @@ def compute_path_length( Returns ------- xarray.DataArray - An xarray DataArray containing the computed path length. - Will have the same dimensions as the input data, except for ``time`` - and ``space`` which will be removed. + An xarray DataArray containing the computed path length, + with dimensions matching those of the input data, + except ``time`` and ``space`` are removed. Notes ----- Choosing ``nan_policy="drop"`` will drop NaN values from each point track before computing path length. This equates to assuming that a track - follows a straight line between two valid points flanking a missing + follows a straight line between two valid points adjacent to a missing segment. Missing segments at the beginning or end of the specified time range are not counted. This approach tends to underestimate the path length, and the error increases with the number of missing @@ -757,8 +757,8 @@ def compute_path_length( the proportion of valid segments per point track. For example, if only 80% of segments are present, the path length will be computed based on these and the result will be divided by 0.8. This approach assumes - that motion dynamics are similar across present and missing time - segments, which may not be the case. + that motion dynamics are similar across observed and missing time + segments, which may not accurately reflect actual conditions. """ _validate_start_stop_times(data, start, stop) @@ -774,10 +774,9 @@ def compute_path_length( raise log_error( ValueError, f"Invalid value for nan_policy: {nan_policy}. " - "Must be one of 'drop' or 'weight'.", + "Must be one of 'drop' or 'scale'.", ) - def _validate_start_stop_times( data: xr.DataArray, start: int | float | None, @@ -855,10 +854,10 @@ def _warn_about_nan_proportion( """ nan_warn_threshold = float(nan_warn_threshold) - if nan_warn_threshold < 0 or nan_warn_threshold > 1: + if not 0 <= nan_warn_threshold <= 1: raise log_error( ValueError, - "nan_warn_threshold must be a number between 0 and 1.", + "nan_warn_threshold must be between 0 and 1.", ) n_nans = data.isnull().any(dim="space").sum(dim="time") From bfad87a0f329b4e8a4fcbfea7d1f740deabf8fe4 Mon Sep 17 00:00:00 2001 From: Niko Sirmpilatze Date: Thu, 31 Oct 2024 17:12:43 +0000 Subject: [PATCH 08/18] Remove print statement from test Co-authored-by: Chang Huan Lo --- tests/test_unit/test_kinematics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index fdf1104b..6aeea30c 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -289,7 +289,6 @@ def test_path_length_across_time_ranges( num_segments -= np.ceil(start) if stop is not None: num_segments -= 9 - np.floor(stop) - print("num_segments", num_segments) expected_path_length = xr.DataArray( np.ones((2, 3)) * np.sqrt(2) * num_segments, From 4f669a9c13e7c98ed8bcb61521943c8053f74208 Mon Sep 17 00:00:00 2001 From: Niko Sirmpilatze Date: Thu, 31 Oct 2024 17:32:07 +0000 Subject: [PATCH 09/18] Ensure nan report is printed Co-authored-by: Chang Huan Lo --- movement/kinematics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index 2c24a471..a35a6369 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -870,7 +870,7 @@ def _warn_about_nan_proportion( "missing values. The following tracks have more than " f"{nan_warn_threshold * 100:.3} %) NaN values:", ) - report_nan_values(data_to_warn_about) + print(report_nan_values(data_to_warn_about)) def _compute_scaled_path_length( From 76df0f0a2564d2e1d066b442abfac5bb936ce129 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Thu, 31 Oct 2024 17:21:57 +0000 Subject: [PATCH 10/18] adapt warning message match in test --- tests/test_unit/test_kinematics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index 6aeea30c..7a22b55e 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -385,7 +385,7 @@ def test_path_length_with_nans( [ (1, does_not_raise()), (0.2, does_not_raise()), - (-1, pytest.raises(ValueError, match="a number between 0 and 1")), + (-1, pytest.raises(ValueError, match="between 0 and 1")), ], ) def test_path_length_warns_about_nans( From 87df402745d800f8967c4d0d5070b039a58c4eb0 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Thu, 31 Oct 2024 17:22:31 +0000 Subject: [PATCH 11/18] change 'any' to 'all' --- movement/kinematics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index a35a6369..57dfe631 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -191,7 +191,7 @@ def compute_speed(data: xr.DataArray) -> xr.DataArray: ------- xarray.DataArray An xarray DataArray containing the computed speed, - with dimensions matching those of the input data, + with dimensions matching those of the input data, except ``space`` is removed. """ @@ -740,7 +740,7 @@ def compute_path_length( ------- xarray.DataArray An xarray DataArray containing the computed path length, - with dimensions matching those of the input data, + with dimensions matching those of the input data, except ``time`` and ``space`` are removed. Notes @@ -900,7 +900,7 @@ def _compute_scaled_path_length( # Skip first displacement segment (always 0) to not mess up the scaling displacement = compute_displacement(data).isel(time=slice(1, None)) # count number of valid displacement segments per point track - valid_segments = (~displacement.isnull()).any(dim="space").sum(dim="time") + valid_segments = (~displacement.isnull()).all(dim="space").sum(dim="time") # compute proportion of valid segments per point track valid_proportion = valid_segments / (data.sizes["time"] - 1) # return scaled path length From 9f152b1f4560fcb9872fbb2b3ce2904ff40b226b Mon Sep 17 00:00:00 2001 From: niksirbi Date: Thu, 31 Oct 2024 18:50:36 +0000 Subject: [PATCH 12/18] uniform wording across path length docstrings --- movement/kinematics.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index 57dfe631..cb3be779 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -886,15 +886,15 @@ def _compute_scaled_path_length( Parameters ---------- data : xarray.DataArray - The input data containing position information in Cartesian - coordinates, with ``time`` and ``space`` among the dimensions. + The input data containing position information, with ``time`` + and ``space`` (in Cartesian coordinates) as required dimensions. Returns ------- xarray.DataArray - An xarray DataArray containing the computed path length. - Will have the same dimensions as the input data, except for ``time`` - and ``space`` which will be removed. + An xarray DataArray containing the computed path length, + with dimensions matching those of the input data, + except ``time`` and ``space`` are removed. """ # Skip first displacement segment (always 0) to not mess up the scaling @@ -921,15 +921,15 @@ def _compute_path_length_drop_nan( Parameters ---------- data : xarray.DataArray - The input data containing position information in Cartesian - coordinates, with ``time`` and ``space`` among the dimensions. + The input data containing position information, with ``time`` + and ``space`` (in Cartesian coordinates) as required dimensions. Returns ------- xarray.DataArray - An xarray DataArray containing the computed path length. - Will have the same dimensions as the input data, except for ``time`` - and ``space`` which will be removed. + An xarray DataArray containing the computed path length, + with dimensions matching those of the input data, + except ``time`` and ``space`` are removed. """ # Create array for holding results From 43a389989abe68c6b2c990ff8017a16d0fe7a673 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Thu, 31 Oct 2024 19:53:44 +0000 Subject: [PATCH 13/18] (mostly) leave time range validation to xarray slice --- movement/kinematics.py | 70 +++++------------------------- tests/test_unit/test_kinematics.py | 46 ++++++++------------ 2 files changed, 28 insertions(+), 88 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index cb3be779..d40cdcb6 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -761,8 +761,17 @@ def compute_path_length( segments, which may not accurately reflect actual conditions. """ - _validate_start_stop_times(data, start, stop) + validate_dims_coords(data, {"time": [], "space": []}) data = data.sel(time=slice(start, stop)) + # Check that the data is not empty or too short + n_time = data.sizes["time"] + if n_time < 2: + raise log_error( + ValueError, + f"At least 2 time points are required to compute path length, " + f"but {n_time} were found. Double-check the start and stop times.", + ) + _warn_about_nan_proportion(data, nan_warn_threshold) if nan_policy == "drop": @@ -776,64 +785,7 @@ def compute_path_length( f"Invalid value for nan_policy: {nan_policy}. " "Must be one of 'drop' or 'scale'.", ) - -def _validate_start_stop_times( - data: xr.DataArray, - start: int | float | None, - stop: int | float | None, -) -> None: - """Validate the start and stop times for path length computation. - - Parameters - ---------- - data: xarray.DataArray - The input data array containing position information. - start : float or None - The start time point for path length computation. - stop : float or None - The stop time point for path length computation. - - Raises - ------ - TypeError - If the start or stop time is not numeric. - ValueError - If the time dimension is missing, if either of the provided times is - outside the data time range, or if the start time is later than the - stop time. - - """ - # We validate the time dimension here, on top of any validation that may - # occur downstream, because we rely on it for start/stop times. - validate_dims_coords(data, {"time": []}) - - provided_time_points = {"start time": start, "stop time": stop} - expected_time_range = (data.time.min(), data.time.max()) - - for name, value in provided_time_points.items(): - if value is None: # Skip if the time point is not provided - continue - # Check that the provided value is numeric - if not isinstance(value, int | float): - raise log_error( - TypeError, - f"Expected a numeric value for {name}, but got {type(value)}.", - ) - # Check that the provided value is within the time range of the data - if value < expected_time_range[0] or value > expected_time_range[1]: - raise log_error( - ValueError, - f"The provided {name} {value} is outside the time range " - f"of the data array ({expected_time_range}).", - ) - - # Check that the start time is earlier than the stop time - if start is not None and stop is not None and start >= stop: - raise log_error( - ValueError, - "The start time must be earlier than the stop time.", - ) - + def _warn_about_nan_proportion( data: xr.DataArray, nan_warn_threshold: float diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index 7a22b55e..5377abfd 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -210,61 +210,46 @@ def test_approximate_derivative_with_invalid_order(order): # full time ranges (None, None, does_not_raise()), (0, None, does_not_raise()), - (None, 9, does_not_raise()), (0, 9, does_not_raise()), + (0, 10, does_not_raise()), # xarray.sel will truncate to 0, 9 + (-1, 9, does_not_raise()), # xarray.sel will truncate to 0, 9 # partial time ranges (1, 8, does_not_raise()), (1.5, 8.5, does_not_raise()), (2, None, does_not_raise()), - (None, 6.3, does_not_raise()), - # invalid time ranges - ( - 0, - 10, - pytest.raises( - ValueError, match="stop time 10 is outside the time range" - ), - ), - ( - -1, - 9, - pytest.raises( - ValueError, match="start time -1 is outside the time range" - ), - ), + # Empty time range (because start > stop) ( 9, 0, pytest.raises( ValueError, - match="start time must be earlier than the stop time", + match="At least 2 time points", ), ), + # Empty time range (because of invalid start type) ( "text", 9, pytest.raises( - TypeError, match="Expected a numeric value for start" + ValueError, + match="At least 2 time points", ), ), + # Time range too short ( 0, - [0, 1], + 0.5, pytest.raises( - TypeError, match="Expected a numeric value for stop" + ValueError, + match="At least 2 time points", ), ), ], ) -@pytest.mark.parametrize( - "nan_policy", - ["drop", "scale"], # results should be same for both here -) def test_path_length_across_time_ranges( valid_poses_dataset_uniform_linear_motion, start, stop, - nan_policy, expected_exception, ): """Test path length computation for a uniform linear motion case, @@ -280,15 +265,18 @@ def test_path_length_across_time_ranges( position = valid_poses_dataset_uniform_linear_motion.position with expected_exception: path_length = kinematics.compute_path_length( - position, start=start, stop=stop, nan_policy=nan_policy + position, start=start, stop=stop ) # Expected number of segments (displacements) in selected time range num_segments = 9 # full time range: 10 frames - 1 + start = max(0, start) if start is not None else 0 + stop = min(9, stop) if stop is not None else 9 if start is not None: - num_segments -= np.ceil(start) + num_segments -= np.ceil(max(0, start)) if stop is not None: - num_segments -= 9 - np.floor(stop) + stop = min(9, stop) + num_segments -= 9 - np.floor(min(9, stop)) expected_path_length = xr.DataArray( np.ones((2, 3)) * np.sqrt(2) * num_segments, From 9490e9ea85e41e5b3e87e9da420de0ed106ef16c Mon Sep 17 00:00:00 2001 From: niksirbi Date: Mon, 4 Nov 2024 09:09:38 +0100 Subject: [PATCH 14/18] refactored parameters for test across time ranges --- tests/test_unit/test_kinematics.py | 36 +++++++++--------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index 5377abfd..8d401e93 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -204,6 +204,12 @@ def test_approximate_derivative_with_invalid_order(order): kinematics.compute_time_derivative(data, order=order) +time_points_value_error = pytest.raises( + ValueError, + match="At least 2 time points are required to compute path length", +) + + @pytest.mark.parametrize( "start, stop, expected_exception", [ @@ -217,33 +223,11 @@ def test_approximate_derivative_with_invalid_order(order): (1, 8, does_not_raise()), (1.5, 8.5, does_not_raise()), (2, None, does_not_raise()), - # Empty time range (because start > stop) - ( - 9, - 0, - pytest.raises( - ValueError, - match="At least 2 time points", - ), - ), - # Empty time range (because of invalid start type) - ( - "text", - 9, - pytest.raises( - ValueError, - match="At least 2 time points", - ), - ), + # Empty time ranges + (9, 0, time_points_value_error), # start > stop + ("text", 9, time_points_value_error), # invalid start type # Time range too short - ( - 0, - 0.5, - pytest.raises( - ValueError, - match="At least 2 time points", - ), - ), + (0, 0.5, time_points_value_error), ], ) def test_path_length_across_time_ranges( From 7732ef0c02c18fdf0af28eccb8b3a474fa6fcfff Mon Sep 17 00:00:00 2001 From: niksirbi Date: Mon, 4 Nov 2024 09:19:02 +0100 Subject: [PATCH 15/18] simplified test for path lenght with nans --- tests/test_unit/test_kinematics.py | 35 +++++++----------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index 8d401e93..a642bf24 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -278,28 +278,17 @@ def test_path_length_across_time_ranges( [ ( "drop", - { - # 9 segments - 1 missing on edge - "centroid": np.sqrt(2) * 8, - # missing mid frames should have no effect - "left": np.sqrt(2) * 9, - "right": np.nan, # all frames missing - }, + np.array([np.sqrt(2) * 8, np.sqrt(2) * 9, np.nan]), does_not_raise(), ), ( "scale", - { - # scaling should restore "true" path length - "centroid": np.sqrt(2) * 9, - "left": np.sqrt(2) * 9, - "right": np.nan, # all frames missing - }, + np.array([np.sqrt(2) * 9, np.sqrt(2) * 9, np.nan]), does_not_raise(), ), ( "invalid", # invalid value for nan_policy - {}, + np.zeros(3), pytest.raises(ValueError, match="Invalid value for nan_policy"), ), ], @@ -336,20 +325,12 @@ def test_path_length_with_nans( position, nan_policy=nan_policy, ) - # Initialise with expected path lengths for scenario without NaNs - expected_array = xr.DataArray( - np.ones((2, 3)) * np.sqrt(2) * 9, - dims=["individuals", "keypoints"], - coords={ - "individuals": position.coords["individuals"], - "keypoints": position.coords["keypoints"], - }, + # Get path_length for individual "id_1" as a numpy array + path_length_id_1 = path_length.sel(individuals="id_1").values + # Check them against the expected values + np.testing.assert_allclose( + path_length_id_1, expected_path_lengths_id_1 ) - # insert expected path lengths for individual id_1 - for kpt, value in expected_path_lengths_id_1.items(): - target_loc = {"individuals": "id_1", "keypoints": kpt} - expected_array.loc[target_loc] = value - xr.testing.assert_allclose(path_length, expected_array) @pytest.mark.parametrize( From e283890d20ede3935685897359cd8e1d2722aa99 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Mon, 4 Nov 2024 10:24:02 +0100 Subject: [PATCH 16/18] replace drop policy with ffill --- movement/kinematics.py | 83 +++++++----------------------- tests/test_unit/test_kinematics.py | 6 +-- 2 files changed, 23 insertions(+), 66 deletions(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index d40cdcb6..6d4c7c50 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -706,7 +706,7 @@ def compute_path_length( data: xr.DataArray, start: float | None = None, stop: float | None = None, - nan_policy: Literal["drop", "scale"] = "drop", + nan_policy: Literal["ffill", "scale"] = "ffill", nan_warn_threshold: float = 0.2, ) -> xr.DataArray: """Compute the length of a path travelled between two time points. @@ -728,10 +728,10 @@ def compute_path_length( stop : float, optional The end time of the path. If None (default), the maximum time coordinate in the data is used. - nan_policy : Literal["drop", "scale"], optional - Policy to handle NaN (missing) values. Can be one of the ``"drop"`` - or ``"scale"``. Defaults to ``"drop"``. See Notes for more details - on the two policies. + nan_policy : Literal["ffill", "scale"], optional + Policy to handle NaN (missing) values. Can be one of the ``"ffill"`` + or ``"scale"``. Defaults to ``"ffill"`` (forward fill). + See Notes for more details on the two policies. nan_warn_threshold : float, optional If more than this proportion of values are missing in any point track, a warning will be emitted. Defaults to 0.2 (20%). @@ -745,13 +745,13 @@ def compute_path_length( Notes ----- - Choosing ``nan_policy="drop"`` will drop NaN values from each point track - before computing path length. This equates to assuming that a track - follows a straight line between two valid points adjacent to a missing - segment. Missing segments at the beginning or end of the specified - time range are not counted. This approach tends to underestimate - the path length, and the error increases with the number of missing - values. + Choosing ``nan_policy="ffill"`` will use :meth:`xarray.DataArray.ffill` + to forward-fill missing segments (NaN values) across time. + This equates to assuming that a track remains stationary for + the duration of the missing segment and then instantaneously moves to + the next valid position, following a straight line. This approach tends + to underestimate the path length, and the error increases with the number + of missing values. Choosing ``nan_policy="scale"`` will adjust the path length based on the the proportion of valid segments per point track. For example, if only @@ -774,8 +774,12 @@ def compute_path_length( _warn_about_nan_proportion(data, nan_warn_threshold) - if nan_policy == "drop": - return _compute_path_length_drop_nan(data) + if nan_policy == "ffill": + return compute_norm( + compute_displacement(data.ffill(dim="time")).isel( + time=slice(1, None) + ) # skip first displacement (always 0) + ).sum(dim="time", min_count=1) # return NaN if no valid segment elif nan_policy == "scale": return _compute_scaled_path_length(data) @@ -783,7 +787,7 @@ def compute_path_length( raise log_error( ValueError, f"Invalid value for nan_policy: {nan_policy}. " - "Must be one of 'drop' or 'scale'.", + "Must be one of 'ffill' or 'scale'.", ) @@ -820,7 +824,7 @@ def _warn_about_nan_proportion( log_warning( "The result may be unreliable for point tracks with many " "missing values. The following tracks have more than " - f"{nan_warn_threshold * 100:.3} %) NaN values:", + f"{nan_warn_threshold * 100:.3} % NaN values:", ) print(report_nan_values(data_to_warn_about)) @@ -857,50 +861,3 @@ def _compute_scaled_path_length( valid_proportion = valid_segments / (data.sizes["time"] - 1) # return scaled path length return compute_norm(displacement).sum(dim="time") / valid_proportion - - -def _compute_path_length_drop_nan( - data: xr.DataArray, -) -> xr.DataArray: - """Compute path length by dropping NaN values before computation. - - This function iterates over point tracks, drops NaN values from each - track, and then computes the path length for the remaining valid - segments (takes the sum of the norms of the displacement vectors). - If there is no valid segment in a track, the path length for that - track will be NaN. - - Parameters - ---------- - data : xarray.DataArray - The input data containing position information, with ``time`` - and ``space`` (in Cartesian coordinates) as required dimensions. - - Returns - ------- - xarray.DataArray - An xarray DataArray containing the computed path length, - with dimensions matching those of the input data, - except ``time`` and ``space`` are removed. - - """ - # Create array for holding results - path_length = xr.full_like( - data.isel(time=0, space=0, drop=True), fill_value=np.nan - ) - - # Stack data to iterate over point tracks - dims_to_stack = [d for d in data.dims if d not in ["time", "space"]] - stacked_data = data.stack(tracks=dims_to_stack) - for track_name in stacked_data.tracks.values: - # Drop NaN values from current point track - track_data = stacked_data.sel(tracks=track_name, drop=True).dropna( - dim="time", how="any" - ) - # Compute path length for current point track - # and store it in the result array - target_loc = {k: v for k, v in zip(dims_to_stack, track_name)} - path_length.loc[target_loc] = compute_norm( - compute_displacement(track_data) - ).sum(dim="time", min_count=1) # returns NaN if no valid segment - return path_length diff --git a/tests/test_unit/test_kinematics.py b/tests/test_unit/test_kinematics.py index a642bf24..4a68a23f 100644 --- a/tests/test_unit/test_kinematics.py +++ b/tests/test_unit/test_kinematics.py @@ -277,7 +277,7 @@ def test_path_length_across_time_ranges( "nan_policy, expected_path_lengths_id_1, expected_exception", [ ( - "drop", + "ffill", np.array([np.sqrt(2) * 8, np.sqrt(2) * 9, np.nan]), does_not_raise(), ), @@ -316,8 +316,8 @@ def test_path_length_with_nans( Because the underlying motion is uniform linear, the "scale" policy should perfectly restore the path length for individual "id_1" to its true value. - The "drop" policy should do likewise if frames are missing in the middle, - but will not count any missing frames at the edges. + The "ffill" policy should do likewise if frames are missing in the middle, + but will not "correct" for missing values at the edges. """ position = valid_poses_dataset_uniform_linear_motion_with_nans.position with expected_exception: From 6954b5f01d653dbe82727c6e2c112cc5d551bf9c Mon Sep 17 00:00:00 2001 From: niksirbi Date: Mon, 4 Nov 2024 10:25:55 +0100 Subject: [PATCH 17/18] remove B905 ruff rule --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2d7b1aa4..27348c29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,7 +105,6 @@ fix = true ignore = [ "D203", # one blank line before class "D213", # multi-line-summary second line - "B905", # zip without explicit strict ] select = [ "E", # pycodestyle errors From 13e2326ce85dbe8ea48e6d0d56f9d94390e18276 Mon Sep 17 00:00:00 2001 From: niksirbi Date: Mon, 4 Nov 2024 10:55:22 +0100 Subject: [PATCH 18/18] make pre-commit happy --- movement/kinematics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/movement/kinematics.py b/movement/kinematics.py index 6d4c7c50..12e1514f 100644 --- a/movement/kinematics.py +++ b/movement/kinematics.py @@ -789,7 +789,7 @@ def compute_path_length( f"Invalid value for nan_policy: {nan_policy}. " "Must be one of 'ffill' or 'scale'.", ) - + def _warn_about_nan_proportion( data: xr.DataArray, nan_warn_threshold: float