diff --git a/movement/io/load_bboxes.py b/movement/io/load_bboxes.py index 3e1b0e0d..2841268b 100644 --- a/movement/io/load_bboxes.py +++ b/movement/io/load_bboxes.py @@ -13,7 +13,11 @@ from movement.utils.logging import log_error from movement.validators.datasets import ValidBboxesDataset -from movement.validators.files import ValidFile, ValidVIATracksCSV +from movement.validators.files import ( + DEFAULT_FRAME_REGEXP, + ValidFile, + ValidVIATracksCSV, +) logger = logging.getLogger(__name__) @@ -58,7 +62,7 @@ def from_numpy( bounding boxes are defined. If None (default), frame numbers will be assigned based on the first dimension of the ``position_array``, starting from 0. If a specific array of frame numbers is provided, - these need to be consecutive integers. + these need to be integers sorted in increasing order. fps : float, optional The video sampling rate. If None (default), the ``time`` coordinates of the resulting ``movement`` dataset will be in frame numbers. If @@ -151,6 +155,7 @@ def from_file( source_software: Literal["VIA-tracks"], fps: float | None = None, use_frame_numbers_from_file: bool = False, + frame_regexp: str = DEFAULT_FRAME_REGEXP, ) -> xr.Dataset: """Create a ``movement`` bounding boxes dataset from a supported file. @@ -180,6 +185,13 @@ def from_file( full video as the time origin. If False (default), the frame numbers in the VIA tracks .csv file are instead mapped to a 0-based sequence of consecutive integers. + frame_regexp : str, optional + Regular expression pattern to extract the frame number from the frame + filename. By default, the frame number is expected to be encoded in + the filename as an integer number led by at least one zero, followed + by the file extension. Only used if ``use_frame_numbers_from_file`` is + True. 
+ Returns ------- @@ -214,6 +226,7 @@ def from_file( file_path, fps, use_frame_numbers_from_file=use_frame_numbers_from_file, + frame_regexp=frame_regexp, ) else: raise log_error( @@ -225,6 +238,7 @@ def from_via_tracks_file( file_path: Path | str, fps: float | None = None, use_frame_numbers_from_file: bool = False, + frame_regexp: str = DEFAULT_FRAME_REGEXP, ) -> xr.Dataset: """Create a ``movement`` dataset from a VIA tracks .csv file. @@ -248,6 +262,12 @@ def from_via_tracks_file( but you want to maintain the start of the full video as the time origin. If False (default), the frame numbers in the VIA tracks .csv file are instead mapped to a 0-based sequence of consecutive integers. + frame_regexp : str, optional + Regular expression pattern to extract the frame number from the frame + filename. By default, the frame number is expected to be encoded in + the filename as an integer number led by at least one zero, followed + by the file extension. Only used if ``use_frame_numbers_from_file`` is + True. 
Returns ------- @@ -316,17 +336,19 @@ def from_via_tracks_file( ) # Specific VIA-tracks .csv file validation - via_file = ValidVIATracksCSV(file.path) + via_file = ValidVIATracksCSV(file.path, frame_regexp=frame_regexp) logger.debug(f"Validated VIA tracks .csv file {via_file.path}.") # Create an xarray.Dataset from the data - bboxes_arrays = _numpy_arrays_from_via_tracks_file(via_file.path) + bboxes_arrays = _numpy_arrays_from_via_tracks_file( + via_file.path, via_file.frame_regexp + ) ds = from_numpy( position_array=bboxes_arrays["position_array"], shape_array=bboxes_arrays["shape_array"], confidence_array=bboxes_arrays["confidence_array"], individual_names=[ - f"id_{id}" for id in bboxes_arrays["ID_array"].squeeze() + f"id_{id.item()}" for id in bboxes_arrays["ID_array"] ], frame_array=( bboxes_arrays["frame_array"] @@ -346,7 +368,9 @@ def from_via_tracks_file( return ds -def _numpy_arrays_from_via_tracks_file(file_path: Path) -> dict: +def _numpy_arrays_from_via_tracks_file( + file_path: Path, frame_regexp: str = DEFAULT_FRAME_REGEXP +) -> dict: """Extract numpy arrays from the input VIA tracks .csv file. The extracted numpy arrays are returned in a dictionary with the following @@ -369,6 +393,12 @@ def _numpy_arrays_from_via_tracks_file(file_path: Path) -> dict: file_path : pathlib.Path Path to the VIA tracks .csv file containing the bounding boxes' tracks. + frame_regexp : str + Regular expression pattern to extract the frame number from the frame + filename. By default, the frame number is expected to be encoded in + the filename as an integer number led by at least one zero, followed + by the file extension. 
+ Returns ------- dict @@ -378,7 +408,7 @@ def _numpy_arrays_from_via_tracks_file(file_path: Path) -> dict: # Extract 2D dataframe from input data # (sort data by ID and frame number, and # fill empty frame-ID pairs with nans) - df = _df_from_via_tracks_file(file_path) + df = _df_from_via_tracks_file(file_path, frame_regexp) # Compute indices of the rows where the IDs switch bool_id_diff_from_prev = df["ID"].ne(df["ID"].shift()) # pandas series @@ -398,8 +428,11 @@ def _numpy_arrays_from_via_tracks_file(file_path: Path) -> dict: df[map_key_to_columns[key]].to_numpy(), indices_id_switch, # indices along axis=0 ) + array_dict[key] = np.stack(list_arrays, axis=1) - array_dict[key] = np.stack(list_arrays, axis=1).squeeze() + # squeeze only last dimension if it is 1 + if array_dict[key].shape[-1] == 1: + array_dict[key] = array_dict[key].squeeze(axis=-1) # Transform position_array to represent centroid of bbox, # rather than top-left corner @@ -413,7 +446,9 @@ def _numpy_arrays_from_via_tracks_file(file_path: Path) -> dict: return array_dict -def _df_from_via_tracks_file(file_path: Path) -> pd.DataFrame: +def _df_from_via_tracks_file( + file_path: Path, frame_regexp: str = DEFAULT_FRAME_REGEXP +) -> pd.DataFrame: """Load VIA tracks .csv file as a dataframe. Read the VIA tracks .csv file as a pandas dataframe with columns: @@ -429,6 +464,10 @@ def _df_from_via_tracks_file(file_path: Path) -> pd.DataFrame: empty frames are filled in with NaNs. The coordinates of the bboxes are assumed to be in the image coordinate system (i.e., the top-left corner of a bbox is its corner with minimum x and y coordinates). + + The frame number is extracted from the filename using the provided + regexp if it is not defined as a 'file_attribute' in the VIA tracks .csv + file. 
""" # Read VIA tracks .csv file as a pandas dataframe df_file = pd.read_csv(file_path, sep=",", header=0) @@ -439,7 +478,9 @@ def _df_from_via_tracks_file(file_path: Path) -> pd.DataFrame: "ID": _via_attribute_column_to_numpy( df_file, "region_attributes", ["track"], int ), - "frame_number": _extract_frame_number_from_via_tracks_df(df_file), + "frame_number": _extract_frame_number_from_via_tracks_df( + df_file, frame_regexp + ), "x": _via_attribute_column_to_numpy( df_file, "region_shape_attributes", ["x"], float ), @@ -473,7 +514,7 @@ def _df_from_via_tracks_file(file_path: Path) -> pd.DataFrame: return df -def _extract_confidence_from_via_tracks_df(df) -> np.ndarray: +def _extract_confidence_from_via_tracks_df(df: pd.DataFrame) -> np.ndarray: """Extract confidence scores from the VIA tracks input dataframe. Parameters @@ -504,7 +545,9 @@ def _extract_confidence_from_via_tracks_df(df) -> np.ndarray: return bbox_confidence -def _extract_frame_number_from_via_tracks_df(df) -> np.ndarray: +def _extract_frame_number_from_via_tracks_df( + df: pd.DataFrame, frame_regexp: str = DEFAULT_FRAME_REGEXP +) -> np.ndarray: """Extract frame numbers from the VIA tracks input dataframe. Parameters @@ -513,14 +556,20 @@ def _extract_frame_number_from_via_tracks_df(df) -> np.ndarray: The VIA tracks input dataframe is the one obtained from ``df = pd.read_csv(file_path, sep=",", header=0)``. + frame_regexp : str + Regular expression pattern to extract the frame number from the frame + filename. By default, the frame number is expected to be encoded in + the filename as an integer number led by at least one zero, followed by + the file extension. + Returns ------- np.ndarray A numpy array of size (n_frames, ) containing the frame numbers. In the VIA tracks .csv file, the frame number is expected to be defined as a 'file_attribute' , or encoded in the filename as an - integer number led by at least one zero, between "_" and ".", followed - by the file extension. 
+ integer number led by at least one zero, followed by the file + extension. """ # Extract frame number from file_attributes if exists @@ -534,10 +583,9 @@ def _extract_frame_number_from_via_tracks_df(df) -> np.ndarray: ) # Else extract from filename else: - pattern = r"_(0\d*)\.\w+$" list_frame_numbers = [ - int(re.search(pattern, f).group(1)) # type: ignore - if re.search(pattern, f) + int(re.search(frame_regexp, f).group(1)) # type: ignore + if re.search(frame_regexp, f) else np.nan for f in df["filename"] ] diff --git a/movement/validators/datasets.py b/movement/validators/datasets.py index 99a68c10..eb1ca976 100644 --- a/movement/validators/datasets.py +++ b/movement/validators/datasets.py @@ -349,11 +349,12 @@ def _validate_frame_array(self, attribute, value): value, expected_shape=(self.position_array.shape[0], 1), ) - # check frames are continuous: exactly one frame number per row - if not np.all(np.diff(value, axis=0) == 1): + # check frames are monotonically increasing + if not np.all(np.diff(value, axis=0) >= 1): raise log_error( ValueError, - f"Frame numbers in {attribute.name} are not continuous.", + f"Frame numbers in {attribute.name} are not monotonically " + "increasing.", ) # Define defaults diff --git a/movement/validators/files.py b/movement/validators/files.py index 8d013a95..0bc5c446 100644 --- a/movement/validators/files.py +++ b/movement/validators/files.py @@ -12,6 +12,8 @@ from movement.utils.logging import log_error +DEFAULT_FRAME_REGEXP = r"(0\d*)\.\w+$" + @define class ValidFile: @@ -234,6 +236,11 @@ class ValidVIATracksCSV: ---------- path : pathlib.Path Path to the VIA tracks .csv file. + frame_regexp : str + Regular expression pattern to extract the frame number from the + filename. By default, the frame number is expected to be encoded in + the filename as an integer number led by at least one zero, followed + by the file extension. 
Raises ------ @@ -243,6 +250,7 @@ class ValidVIATracksCSV: """ path: Path = field(validator=validators.instance_of(Path)) + frame_regexp: str = DEFAULT_FRAME_REGEXP @path.validator def _file_contains_valid_header(self, attribute, value): @@ -281,8 +289,10 @@ def _file_contains_valid_frame_numbers(self, attribute, value): files. If the frame number is included as part of the image file name, then - it is expected as an integer led by at least one zero, between "_" and - ".", followed by the file extension. + it is expected to be captured by the regular expression in the + `frame_regexp` attribute of the ValidVIATracksCSV object. The default + regexp matches an integer led by at least one zero, followed by the + file extension. """ df = pd.read_csv(value, sep=",", header=0) @@ -294,40 +304,15 @@ def _file_contains_valid_frame_numbers(self, attribute, value): # If 'frame' is a file_attribute for all files: # extract frame number - list_frame_numbers = [] if all(["frame" in d for d in file_attributes_dicts]): - for k_i, k in enumerate(file_attributes_dicts): - try: - list_frame_numbers.append(int(k["frame"])) - except Exception as e: - raise log_error( - ValueError, - f"{df.filename.iloc[k_i]} (row {k_i}): " - "'frame' file attribute cannot be cast as an integer. " - f"Please review the file attributes: {k}.", - ) from e - + list_frame_numbers = ( + self._extract_frame_numbers_from_file_attributes( + df, file_attributes_dicts + ) + ) # else: extract frame number from filename. else: - pattern = r"_(0\d*)\.\w+$" - - for f_i, f in enumerate(df["filename"]): - regex_match = re.search(pattern, f) - if regex_match: # if there is a pattern match - list_frame_numbers.append( - int(regex_match.group(1)) # type: ignore - # the match will always be castable as integer - ) - else: - raise log_error( - ValueError, - f"{f} (row {f_i}): " - "a frame number could not be extracted from the " - "filename. 
If included in the filename, the frame " - "number is expected as a zero-padded integer between " - "an underscore '_' and the file extension " - "(e.g. img_00234.png).", - ) + list_frame_numbers = self._extract_frame_numbers_using_regexp(df) # Check we have as many unique frame numbers as unique image files if len(set(list_frame_numbers)) != len(df.filename.unique()): @@ -339,6 +324,60 @@ def _file_contains_valid_frame_numbers(self, attribute, value): "file. ", ) + def _extract_frame_numbers_from_file_attributes( + self, df, file_attributes_dicts + ): + """Get frame numbers from the 'frame' key under 'file_attributes'.""" + list_frame_numbers = [] + for k_i, k in enumerate(file_attributes_dicts): + try: + list_frame_numbers.append(int(k["frame"])) + except ValueError as e: + raise log_error( + ValueError, + f"{df.filename.iloc[k_i]} (row {k_i}): " + "'frame' file attribute cannot be cast as an integer. " + f"Please review the file attributes: {k}.", + ) from e + return list_frame_numbers + + def _extract_frame_numbers_using_regexp(self, df): + """Get frame numbers from the file names using the provided regexp.""" + list_frame_numbers = [] + for f_i, f in enumerate(df["filename"]): + # try compiling the frame regexp + try: + regex_match = re.search(self.frame_regexp, f) + except re.error as e: + raise log_error( + re.error, + "The provided regular expression for the frame " + f"numbers ({self.frame_regexp}) could not be compiled." 
+ " Please review its syntax.", + ) from e + # try extracting the frame number from the filename using the + # compiled regexp + try: + list_frame_numbers.append(int(regex_match.group(1))) + except AttributeError as e: + raise log_error( + AttributeError, + f"{f} (row {f_i}): The provided frame regexp " + f"({self.frame_regexp}) did not " + "return any matches and a frame number could not " + "be extracted from the filename.", + ) from e + except ValueError as e: + raise log_error( + ValueError, + f"{f} (row {f_i}): " + "The frame number extracted from the filename using " + f"the provided regexp ({self.frame_regexp}) could not " + "be cast as an integer.", + ) from e + + return list_frame_numbers + @path.validator def _file_contains_tracked_bboxes(self, attribute, value): """Ensure that the VIA tracks .csv contains tracked bounding boxes. diff --git a/tests/test_unit/test_load_bboxes.py b/tests/test_unit/test_load_bboxes.py index 2f80459d..6509bf21 100644 --- a/tests/test_unit/test_load_bboxes.py +++ b/tests/test_unit/test_load_bboxes.py @@ -1,6 +1,8 @@ """Test suite for the load_bboxes module.""" import ast +import re +from pathlib import Path from unittest.mock import patch import numpy as np @@ -13,103 +15,183 @@ @pytest.fixture() -def via_tracks_file(): - """Return the file path for a VIA tracks .csv file.""" - via_sample_file_name = "VIA_multiple-crabs_5-frames_labels.csv" - return pytest.DATA_PATHS.get(via_sample_file_name) - - -@pytest.fixture() -def valid_from_numpy_inputs_required_arrays(): - """Return a dictionary with valid numpy arrays for the `from_numpy()` - loader, excluding the optional `frame_array`. 
- """ - n_frames = 5 - n_individuals = 86 - n_space = 2 - individual_names_array = np.arange(n_individuals).reshape(-1, 1) - - rng = np.random.default_rng(seed=42) +def get_expected_attributes_dict(): + """Define a factory of expected attributes dictionaries.""" + + def _get_expected_attributes_dict(via_file_name: str) -> dict: + """Return the expected attributes dictionary for the first 3 rows of + the input VIA file. + """ + attributes_dict_per_test_file = {} + + # add expected attributes for the first 3 rows of + # VIA_single-crab_MOCA-crab-1.csv + attributes_dict_per_test_file["VIA_single-crab_MOCA-crab-1.csv"] = { + "region_shape_attributes": { + "name": np.array(["rect"] * 3), + "x_y": np.array( + [ + [957.85, 296.708], + [919.149, 296.708], + [889.317, 303.158], + ] + ).reshape(-1, 2), + "width_height": np.array( + [ + [320.09, 153.191], + [357.984, 158.835], + [366.853, 162.867], + ] + ).reshape(-1, 2), + }, + "region_attributes": { + "track": np.ones((3,)), + }, + } + + # add expected attributes for the first 3 rows of + # "VIA_multiple-crabs_5-frames_labels.csv" + attributes_dict_per_test_file[ + "VIA_multiple-crabs_5-frames_labels.csv" + ] = { + "region_shape_attributes": { + "name": np.array(["rect"] * 3), + "x_y": np.array( + [ + [526.2366942646654, 393.280914246804], + [2565, 468], + [759.6484377108334, 136.60946673708338], + ] + ).reshape(-1, 2), + "width_height": np.array( + [[46, 38], [41, 30], [29, 25]] + ).reshape(-1, 2), + }, + "region_attributes": { + "track": np.array([71, 70, 69]), + }, + } + + # raise ValueError if the filename is not defined + if via_file_name not in attributes_dict_per_test_file: + raise ValueError( + f"Attributes dict not defined for filename '{via_file_name}''." 
+ ) + return attributes_dict_per_test_file[via_file_name] - return { - "position_array": rng.random((n_frames, n_individuals, n_space)), - "shape_array": rng.random((n_frames, n_individuals, n_space)), - "confidence_array": rng.random((n_frames, n_individuals)), - "individual_names": [ - f"id_{id}" for id in individual_names_array.squeeze() - ], - } + return _get_expected_attributes_dict @pytest.fixture() -def valid_from_numpy_inputs_all_arrays( - valid_from_numpy_inputs_required_arrays, -): - """Return a dictionary with valid numpy arrays for the from_numpy() loader, - including a `frame_array` that ranges from frame 1 to 5. - """ - n_frames = valid_from_numpy_inputs_required_arrays["position_array"].shape[ - 0 - ] - first_frame_number = 1 # should match sample file - - valid_from_numpy_inputs_required_arrays["frame_array"] = np.arange( - first_frame_number, first_frame_number + n_frames - ).reshape(-1, 1) - - return valid_from_numpy_inputs_required_arrays +def create_df_input_via_tracks(): + """Define a factory of dataframes for testing.""" + + def _create_df_input_via_tracks( + via_file_path: Path, + small: bool = False, + attribute_column_additions: dict[str, list[dict]] | None = None, + ) -> pd.DataFrame: + """Return an optionally modified dataframe that results from + reading the input VIA tracks .csv filepath. + + If small is True, only the first 3 rows are returned. + + If attribute_column_additions is not None, the dataframe is modified + to include the data under the specified attribute column. + + The variable attribute_column_additions is a dictionary mapping the + name of the attribute column to a list of dictionaries to append to + that column. 
+ """ + # read the VIA tracks .csv file as a dataframe + df = pd.read_csv(via_file_path, sep=",", header=0) + + # optionally return the first 3 rows only + if small: + df = df.loc[:2, :] + + # optionally modify the dataframe to include data + # under an attribute column + if attribute_column_additions is None: + return df + else: + return update_attribute_column( + df_input=df, + attribute_column_additions=attribute_column_additions, + ) + return _create_df_input_via_tracks -@pytest.fixture() -def df_input_via_tracks_small(via_tracks_file): - """Return the first 3 rows of the VIA tracks .csv file as a dataframe.""" - df = pd.read_csv(via_tracks_file, sep=",", header=0) - return df.loc[:2, :] +def update_attribute_column( + df_input: pd.DataFrame, + attribute_column_additions: dict[str, list[dict]], +): + """Update an attributes column in the dataframe.""" + # copy the dataframe + df = df_input.copy() -@pytest.fixture() -def df_input_via_tracks_small_with_confidence(df_input_via_tracks_small): - """Return a dataframe with the first three rows of the VIA tracks .csv file - and add confidence values to the bounding boxes. 
- """ - df = update_attribute_column( - df_input=df_input_via_tracks_small, - attribute_column_name="region_attributes", - dict_to_append={"confidence": "0.5"}, - ) + # update each attribute column in the dataframe + for attribute_column_name in attribute_column_additions: + # get the list of dicts to append to the column + list_dicts_to_append = attribute_column_additions[ + attribute_column_name + ] + + # get the column to update, and convert it to a list of dicts + # (one dict per row) + attributes_dicts = [ + ast.literal_eval(d) for d in df[attribute_column_name] + ] + + # update each dict in the list + # (if we only have one dict to append, append it to all rows) + if len(list_dicts_to_append) == 1: + for d in attributes_dicts: + d.update(list_dicts_to_append[0]) + else: + for d, dict_to_append in zip( + attributes_dicts, list_dicts_to_append, strict=True + ): + d.update(dict_to_append) + + # update the relevant column in the dataframe and format + # back to string + df[attribute_column_name] = [str(d) for d in attributes_dicts] return df @pytest.fixture() -def df_input_via_tracks_small_with_frame_number(df_input_via_tracks_small): - """Return a dataframe with the first three rows of the VIA tracks .csv file - and add frame number values to the bounding boxes. 
- """ - df = update_attribute_column( - df_input=df_input_via_tracks_small, - attribute_column_name="file_attributes", - dict_to_append={"frame": "1"}, - ) - - return df +def create_valid_from_numpy_inputs(): + """Define a factory of valid inputs to "from_numpy" function.""" + n_frames = 5 + n_individuals = 86 + n_space = 2 + individual_names_array = np.arange(n_individuals).reshape(-1, 1) + first_frame_number = 1 # should match sample file + rng = np.random.default_rng(seed=42) -def update_attribute_column(df_input, attribute_column_name, dict_to_append): - """Update an attributes column in the dataframe.""" - # copy the dataframe - df = df_input.copy() + def _create_valid_from_numpy_inputs(with_frame_array=False): + """Return a dictionary of valid inputs to the `from_numpy` function.""" + required_inputs = { + "position_array": rng.random((n_frames, n_individuals, n_space)), + "shape_array": rng.random((n_frames, n_individuals, n_space)), + "confidence_array": rng.random((n_frames, n_individuals)), + "individual_names": [ + f"id_{id}" for id in individual_names_array.squeeze() + ], + } - # get the attributes column and convert to dict - attributes_dicts = [ast.literal_eval(d) for d in df[attribute_column_name]] + if with_frame_array: + required_inputs["frame_array"] = np.arange( + first_frame_number, first_frame_number + n_frames + ).reshape(-1, 1) - # update the dict - for d in attributes_dicts: - d.update(dict_to_append) + return required_inputs - # update the region_attributes column in the dataframe - df[attribute_column_name] = [str(d) for d in attributes_dicts] - return df + return _create_valid_from_numpy_inputs def assert_dataset( @@ -152,31 +234,39 @@ def assert_dataset( ) -def assert_time_coordinates(ds, fps, start_frame): +def assert_time_coordinates(ds, fps, start_frame=None, frame_array=None): """Assert that the time coordinates are as expected, depending on - fps value and start_frame. + fps value and start_frame or frame_array. 
start_frame takes precedence + over frame_array if both are provided. """ # scale time coordinates with 1/fps if provided scale = 1 / fps if fps else 1 + # build frame array from start_frame if provided + if start_frame is not None: + frame_array = np.array( + range(start_frame, len(ds.coords["time"].data) + start_frame) + ) + elif frame_array is None: + raise ValueError( + "Either start_frame or frame_array must be provided." + "start_frame takes precedence over frame_array if " + "both are provided." + ) + # assert numpy array of time coordinates np.testing.assert_allclose( - ds.coords["time"].data, - np.array( - [ - f * scale - for f in range( - start_frame, len(ds.coords["time"].data) + start_frame - ) - ] - ), + ds.coords["time"].data, np.array([f * scale for f in frame_array]) ) @pytest.mark.parametrize("source_software", ["Unknown", "VIA-tracks"]) @pytest.mark.parametrize("fps", [None, 30, 60.0]) @pytest.mark.parametrize("use_frame_numbers_from_file", [True, False]) -def test_from_file(source_software, fps, use_frame_numbers_from_file): +@pytest.mark.parametrize("frame_regexp", [None, r"frame_(\d+)"]) +def test_from_file( + source_software, fps, use_frame_numbers_from_file, frame_regexp +): """Test that the from_file() function delegates to the correct loader function according to the source_software. 
""" @@ -191,6 +281,7 @@ def test_from_file(source_software, fps, use_frame_numbers_from_file): source_software, fps, use_frame_numbers_from_file=use_frame_numbers_from_file, + frame_regexp=frame_regexp, ) else: with patch(software_to_loader[source_software]) as mock_loader: @@ -199,50 +290,107 @@ def test_from_file(source_software, fps, use_frame_numbers_from_file): source_software, fps, use_frame_numbers_from_file=use_frame_numbers_from_file, + frame_regexp=frame_regexp, ) mock_loader.assert_called_with( "some_file", fps, use_frame_numbers_from_file=use_frame_numbers_from_file, + frame_regexp=frame_regexp, ) +@pytest.mark.parametrize( + "via_file_path", + [ + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), + pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), + ], +) @pytest.mark.parametrize("fps", [None, 30, 60.0]) @pytest.mark.parametrize("use_frame_numbers_from_file", [True, False]) +@pytest.mark.parametrize("frame_regexp", [None, r"(00\d*)\.\w+$"]) def test_from_via_tracks_file( - via_tracks_file, fps, use_frame_numbers_from_file + via_file_path, + fps, + use_frame_numbers_from_file, + frame_regexp, ): """Test that loading tracked bounding box data from - a valid VIA tracks .csv file returns a proper Dataset - and that the time coordinates are as expected. + a valid VIA tracks .csv file returns a proper Dataset. 
""" - # run general dataset checks - ds = load_bboxes.from_via_tracks_file( - via_tracks_file, fps, use_frame_numbers_from_file - ) - assert_dataset(ds, via_tracks_file, "VIA-tracks", fps) - - # check time coordinates are as expected - # in sample VIA tracks .csv file frame numbers start from 1 - start_frame = 1 if use_frame_numbers_from_file else 0 - assert_time_coordinates(ds, fps, start_frame) + kwargs = { + "file_path": via_file_path, + "fps": fps, + "use_frame_numbers_from_file": use_frame_numbers_from_file, + **({"frame_regexp": frame_regexp} if frame_regexp is not None else {}), + } + ds = load_bboxes.from_via_tracks_file(**kwargs) + assert_dataset(ds, via_file_path, "VIA-tracks", fps) @pytest.mark.parametrize( - "valid_from_numpy_inputs", + "frame_regexp, error_type, log_message", [ - "valid_from_numpy_inputs_required_arrays", - "valid_from_numpy_inputs_all_arrays", + ( + r"*", + re.error, + "The provided regular expression for the frame numbers (*) " + "could not be compiled. Please review its syntax.", + ), + ( + r"_(0\d*)_$", + AttributeError, + "/crab_1/00000.jpg (row 0): " + "The provided frame regexp (_(0\d*)_$) did not return any " + "matches and a frame number could not be extracted from " + "the filename.", + ), + ( + r"(0\d*\.\w+)$", + ValueError, + "/crab_1/00000.jpg (row 0): " + "The frame number extracted from the filename " + "using the provided regexp ((0\d*\.\w+)$) " + "could not be cast as an integer.", + ), ], ) +def test_from_via_tracks_file_invalid_frame_regexp( + frame_regexp, error_type, log_message +): + """Test that loading tracked bounding box data from + a valid VIA tracks .csv file with an invalid frame_regexp + raises a ValueError. 
+ """ + input_file = pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv") + with pytest.raises(error_type) as excinfo: + load_bboxes.from_via_tracks_file( + input_file, + use_frame_numbers_from_file=True, + frame_regexp=frame_regexp, + ) + + assert str(excinfo.value) == log_message + + +@pytest.mark.parametrize( + "with_frame_array", + [True, False], +) @pytest.mark.parametrize("fps", [None, 30, 60.0]) @pytest.mark.parametrize("source_software", [None, "VIA-tracks"]) -def test_from_numpy(valid_from_numpy_inputs, fps, source_software, request): +def test_from_numpy( + create_valid_from_numpy_inputs, + with_frame_array, + fps, + source_software, +): """Test that loading bounding boxes trajectories from the input numpy arrays returns a proper Dataset. """ # get the input arrays - from_numpy_inputs = request.getfixturevalue(valid_from_numpy_inputs) + from_numpy_inputs = create_valid_from_numpy_inputs(with_frame_array) # run general dataset checks ds = load_bboxes.from_numpy( @@ -263,113 +411,221 @@ def test_from_numpy(valid_from_numpy_inputs, fps, source_software, request): @pytest.mark.parametrize( - "via_column_name, list_keys, cast_fn, expected_attribute_array", + "via_file_path", + [ + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), + pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), + ], +) +@pytest.mark.parametrize( + "via_column_name, list_keys, cast_fn", [ - ( - "file_attributes", - ["clip"], - int, - np.array([123] * 3), # .reshape(-1, 1), - ), ( "region_shape_attributes", ["name"], str, - np.array(["rect"] * 3), # .reshape(-1, 1), ), ( "region_shape_attributes", ["x", "y"], float, - np.array( - [ - [526.2366942646654, 393.280914246804], - [2565, 468], - [759.6484377108334, 136.60946673708338], - ] - ).reshape(-1, 2), ), ( "region_shape_attributes", ["width", "height"], float, - np.array([[46, 38], [41, 30], [29, 25]]).reshape(-1, 2), ), ( "region_attributes", ["track"], int, - np.array([71, 70, 69]), # .reshape(-1, 1), ), ], ) def 
test_via_attribute_column_to_numpy( - df_input_via_tracks_small, + create_df_input_via_tracks, + get_expected_attributes_dict, + via_file_path, via_column_name, list_keys, cast_fn, - expected_attribute_array, ): """Test that the function correctly extracts the desired data from the VIA attributes. """ attribute_array = load_bboxes._via_attribute_column_to_numpy( - df=df_input_via_tracks_small, + df=create_df_input_via_tracks( + via_file_path, small=True + ), # small=True to only get 3 rows via_column_name=via_column_name, list_keys=list_keys, cast_fn=cast_fn, ) + attributes_dict = get_expected_attributes_dict( + via_file_path.name + ) # returns results for the first 3 rows + expected_attribute_array = attributes_dict[via_column_name][ + "_".join(list_keys) + ] assert np.array_equal(attribute_array, expected_attribute_array) @pytest.mark.parametrize( - "df_input, expected_array", + "via_file_path", + [ + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), + pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), + ], +) +@pytest.mark.parametrize( + "input_confidence_value, expected_confidence_array", + # we only check the first 3 rows of the files [ - ("df_input_via_tracks_small", np.full((3,), np.nan)), ( - "df_input_via_tracks_small_with_confidence", + None, + np.full((3,), np.nan), + ), + ( + 0.5, np.array([0.5, 0.5, 0.5]), ), ], ) def test_extract_confidence_from_via_tracks_df( - df_input, expected_array, request + create_df_input_via_tracks, + via_file_path, + input_confidence_value, + expected_confidence_array, ): """Test that the function correctly extracts the confidence values from the VIA dataframe. + + A mock VIA dataframe is generated with all confidence values set to the + input_confidence_value. 
""" - df = request.getfixturevalue(df_input) + # None of the sample files includes a confidence column + # so we add it to the dataframe here + if input_confidence_value: + df = create_df_input_via_tracks( + via_file_path, + small=True, # only get 3 rows + attribute_column_additions={ + "region_attributes": [{"confidence": input_confidence_value}] + }, + ) + else: + df = create_df_input_via_tracks(via_file_path, small=True) + confidence_array = load_bboxes._extract_confidence_from_via_tracks_df(df) - assert np.array_equal(confidence_array, expected_array, equal_nan=True) + assert np.array_equal( + confidence_array, expected_confidence_array, equal_nan=True + ) @pytest.mark.parametrize( - "df_input, expected_array", + "via_file_path, expected_frame_array", [ ( - "df_input_via_tracks_small", + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), np.ones((3,)), - ), # extract from filename + ), ( - "df_input_via_tracks_small_with_frame_number", - np.array([1, 1, 1]), - ), # extract from file_attributes + pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), + np.array([0, 5, 10]), + ), ], ) -def test_extract_frame_number_from_via_tracks_df( - df_input, expected_array, request +def test_extract_frame_number_from_via_tracks_df_filenames( + create_df_input_via_tracks, + via_file_path, + expected_frame_array, ): """Test that the function correctly extracts the frame number values from - the VIA dataframe. + the images' filenames. 
""" - df = request.getfixturevalue(df_input) + # create the dataframe with the frame number + df = create_df_input_via_tracks( + via_file_path, + small=True, + ) + + # the VIA tracks .csv files have no frames defined under the + # "file_attributes" so the frame numbers should be extracted + # from the filenames + assert not all(["frame" in row for row in df["file_attributes"]]) + + # extract frame number from df frame_array = load_bboxes._extract_frame_number_from_via_tracks_df(df) - assert np.array_equal(frame_array, expected_array) + assert np.array_equal(frame_array, expected_frame_array) +@pytest.mark.parametrize( + "via_file_path, attribute_column_additions, expected_frame_array", + [ + ( + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), + {"file_attributes": [{"frame": 222}]}, + np.ones( + 3, + ) + * 222, + ), + ( + pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), + { + "file_attributes": [ + {"frame": 218}, + {"frame": 219}, + {"frame": 220}, + ] + }, + np.array([218, 219, 220]), + ), + ], +) +def test_extract_frame_number_from_via_tracks_df_file_attributes( + create_df_input_via_tracks, + via_file_path, + attribute_column_additions, + expected_frame_array, +): + """Test that the function correctly extracts the frame number values from + the file attributes column. + + The frame number defined under the "file_attributes" column + should take precedence over the frame numbers encoded in the filenames. 
+ """ + # Create the dataframe with the frame number stored in + # the file_attributes column + df = create_df_input_via_tracks( + via_file_path, + small=True, + attribute_column_additions=attribute_column_additions, + ) + + # extract frame number from the dataframe + # (should take precedence over the frame numbers in the filenames) + frame_array = load_bboxes._extract_frame_number_from_via_tracks_df(df) + + assert np.array_equal(frame_array, expected_frame_array) + + +@pytest.mark.parametrize( + "via_file_path, frame_array_from_file", + [ + ( + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), + np.array(range(1, 6)), + ), + ( + pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), + np.array(list(range(0, 168, 5)) + [167]), + ), + ], +) @pytest.mark.parametrize( "fps, expected_fps, expected_time_unit", [ @@ -382,56 +638,69 @@ def test_extract_frame_number_from_via_tracks_df( ) @pytest.mark.parametrize("use_frame_numbers_from_file", [True, False]) def test_fps_and_time_coords( - via_tracks_file, + via_file_path, + frame_array_from_file, fps, expected_fps, expected_time_unit, use_frame_numbers_from_file, ): - """Test that fps conversion is as expected and time coordinates are set - according to the expected fps. + """Test that fps conversion is as expected, and time coordinates are set + according to the input "fps" and the "use_frame_numbers_from_file" + parameters. 
""" + # load dataset with inputs ds = load_bboxes.from_via_tracks_file( - via_tracks_file, + via_file_path, fps=fps, use_frame_numbers_from_file=use_frame_numbers_from_file, ) - # load dataset with frame numbers from file - ds_in_frames_from_file = load_bboxes.from_via_tracks_file( - via_tracks_file, - fps=None, - use_frame_numbers_from_file=True, - ) - # check time unit assert ds.time_unit == expected_time_unit # check fps is as expected - if expected_fps is None: - assert ds.fps is expected_fps - else: + if bool(expected_fps): assert ds.fps == expected_fps + else: + assert ds.fps is None - # check time coordinates + # check loading frame numbers from file if use_frame_numbers_from_file: - start_frame = ds_in_frames_from_file.coords["time"].data[0] + assert_time_coordinates( + ds, expected_fps, frame_array=frame_array_from_file + ) else: - start_frame = 0 - assert_time_coordinates(ds, expected_fps, start_frame) + assert_time_coordinates(ds, expected_fps, start_frame=0) -def test_df_from_via_tracks_file(via_tracks_file): - """Test that the helper function correctly reads the VIA tracks .csv file - as a dataframe. +@pytest.mark.parametrize( + "via_file_path, expected_n_frames, expected_n_individuals", + [ + ( + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), + 5, + 86, + ), + (pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), 35, 1), + ], +) +def test_df_from_via_tracks_file( + via_file_path, expected_n_frames, expected_n_individuals +): + """Test that the `_df_from_via_tracks_file` helper function correctly + reads the VIA tracks .csv file as a dataframe. 
""" - df = load_bboxes._df_from_via_tracks_file(via_tracks_file) + df = load_bboxes._df_from_via_tracks_file( + via_file_path, + ) assert isinstance(df, pd.DataFrame) - assert len(df.frame_number.unique()) == 5 + assert len(df.frame_number.unique()) == expected_n_frames + assert len(df.ID.unique()) == expected_n_individuals assert ( - df.shape[0] == len(df.ID.unique()) * 5 - ) # all individuals in all frames (even if nan) + df.shape[0] == len(df.ID.unique()) * expected_n_frames + ) # all individuals are present in all frames (even if nan) assert list(df.columns) == [ "ID", "frame_number", @@ -443,20 +712,27 @@ def test_df_from_via_tracks_file(via_tracks_file): ] -def test_position_numpy_array_from_via_tracks_file(via_tracks_file): +@pytest.mark.parametrize( + "via_file_path", + [ + pytest.DATA_PATHS.get("VIA_multiple-crabs_5-frames_labels.csv"), + pytest.DATA_PATHS.get("VIA_single-crab_MOCA-crab-1.csv"), + ], +) +def test_position_numpy_array_from_via_tracks_file(via_file_path): """Test the extracted position array from the VIA tracks .csv file represents the centroid of the bbox. 
""" # Extract numpy arrays from VIA tracks .csv file bboxes_arrays = load_bboxes._numpy_arrays_from_via_tracks_file( - via_tracks_file + via_file_path ) # Read VIA tracks .csv file as a dataframe - df = load_bboxes._df_from_via_tracks_file(via_tracks_file) + df = load_bboxes._df_from_via_tracks_file(via_file_path) # Compute centroid positions from the dataframe - # (go thru in the same order as ID array) + # (go through in the same order as ID array) list_derived_centroids = [] for id in bboxes_arrays["ID_array"]: df_one_id = df[df["ID"] == id.item()] diff --git a/tests/test_unit/test_validators/test_datasets_validators.py b/tests/test_unit/test_validators/test_datasets_validators.py index e41331f7..e5be81c4 100644 --- a/tests/test_unit/test_validators/test_datasets_validators.py +++ b/tests/test_unit/test_validators/test_datasets_validators.py @@ -360,10 +360,18 @@ def test_bboxes_dataset_validator_confidence_array( f"Expected a numpy array, but got {type(list())}.", ), # not an ndarray, should raise ValueError ( - np.array([1, 2, 3, 4, 6, 7, 8, 9, 10, 11]).reshape(-1, 1), + np.array([1, 10, 2, 3, 4, 5, 6, 7, 8, 9]).reshape(-1, 1), pytest.raises(ValueError), - "Frame numbers in frame_array are not continuous.", - ), # frame numbers are not continuous + "Frame numbers in frame_array are not monotonically increasing.", + ), + ( + np.array([1, 2, 22, 23, 24, 25, 100, 101, 102, 103]).reshape( + -1, 1 + ), + does_not_raise(), + "", + ), # valid, frame numbers are not continuous but are monotonically + # increasing ( None, does_not_raise(), @@ -389,10 +397,10 @@ def test_bboxes_dataset_validator_frame_array( frame_array=frame_array, ) - if frame_array is None: + if frame_array is not None: + assert str(getattr(excinfo, "value", "")) == log_message + else: n_frames = ds.position_array.shape[0] default_frame_array = np.arange(n_frames).reshape(-1, 1) assert np.array_equal(ds.frame_array, default_frame_array) assert ds.frame_array.shape == (ds.position_array.shape[0], 1) - 
else: - assert str(excinfo.value) == log_message diff --git a/tests/test_unit/test_validators/test_files_validators.py b/tests/test_unit/test_validators/test_files_validators.py index b9bc345c..7ce0a722 100644 --- a/tests/test_unit/test_validators/test_files_validators.py +++ b/tests/test_unit/test_validators/test_files_validators.py @@ -68,10 +68,11 @@ def test_deeplabcut_csv_validator_with_invalid_input( @pytest.mark.parametrize( - "invalid_input, log_message", + "invalid_input, error_type, log_message", [ ( "via_tracks_csv_with_invalid_header", + ValueError, ".csv header row does not match the known format for " "VIA tracks .csv files. " "Expected " @@ -82,6 +83,7 @@ def test_deeplabcut_csv_validator_with_invalid_input( ), ( "frame_number_in_file_attribute_not_integer", + ValueError, "04.09.2023-04-Right_RE_test_frame_A.png (row 0): " "'frame' file attribute cannot be cast as an integer. " "Please review the file attributes: " @@ -89,33 +91,37 @@ def test_deeplabcut_csv_validator_with_invalid_input( ), ( "frame_number_in_filename_wrong_pattern", + AttributeError, "04.09.2023-04-Right_RE_test_frame_1.png (row 0): " - "a frame number could not be extracted from the filename. " - "If included in the filename, the frame number is " - "expected as a zero-padded integer between an " - "underscore '_' and the file extension " - "(e.g. img_00234.png).", + "The provided frame regexp ((0\d*)\.\w+$) did not return " + "any matches and a " + "frame number could not be extracted from the " + "filename.", ), ( "more_frame_numbers_than_filenames", + ValueError, "The number of unique frame numbers does not match the number " "of unique image files. Please review the VIA tracks .csv file " "and ensure a unique frame number is defined for each file. ", ), ( "less_frame_numbers_than_filenames", + ValueError, "The number of unique frame numbers does not match the number " "of unique image files. 
Please review the VIA tracks .csv file " "and ensure a unique frame number is defined for each file. ", ), ( "region_shape_attribute_not_rect", + ValueError, "04.09.2023-04-Right_RE_test_frame_01.png (row 0): " "bounding box shape must be 'rect' (rectangular) " "but instead got 'circle'.", ), ( "region_shape_attribute_missing_x", + ValueError, "04.09.2023-04-Right_RE_test_frame_01.png (row 0): " "at least one bounding box shape parameter is missing. " "Expected 'x', 'y', 'width', 'height' to exist as " @@ -124,6 +130,7 @@ def test_deeplabcut_csv_validator_with_invalid_input( ), ( "region_attribute_missing_track", + ValueError, "04.09.2023-04-Right_RE_test_frame_01.png (row 0): " "bounding box does not have a 'track' attribute defined " "under 'region_attributes'. " @@ -131,6 +138,7 @@ def test_deeplabcut_csv_validator_with_invalid_input( ), ( "track_id_not_castable_as_int", + ValueError, "04.09.2023-04-Right_RE_test_frame_01.png (row 0): " "the track ID for the bounding box cannot be cast " "as an integer. " @@ -138,6 +146,7 @@ def test_deeplabcut_csv_validator_with_invalid_input( ), ( "track_ids_not_unique_per_frame", + ValueError, "04.09.2023-04-Right_RE_test_frame_01.png: " "multiple bounding boxes in this file have the same track ID. " "Please review the VIA tracks .csv file.", @@ -145,7 +154,7 @@ def test_deeplabcut_csv_validator_with_invalid_input( ], ) def test_via_tracks_csv_validator_with_invalid_input( - invalid_input, log_message, request + invalid_input, error_type, log_message, request ): """Test that invalid VIA tracks .csv files raise the appropriate errors. @@ -162,7 +171,7 @@ def test_via_tracks_csv_validator_with_invalid_input( - error if bboxes IDs are not 1-based integers """ file_path = request.getfixturevalue(invalid_input) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(error_type) as excinfo: ValidVIATracksCSV(file_path) assert str(excinfo.value) == log_message