This repository has been archived by the owner on Mar 15, 2023. It is now read-only.

Integrate Movie context manager into DataInterface #372

Merged: 93 commits, merged on Feb 23, 2022.

Changes from 9 commits

Commits (93)
125a09c
add video capture context
Saksham20 Jan 17, 2022
dbf31cf
use videocapture context, allow multiple video files for external mode,
Saksham20 Jan 17, 2022
098143c
add separate file for testing movie interface
Saksham20 Jan 17, 2022
952b7f4
fixing test
Saksham20 Jan 17, 2022
5dbe1e1
Automated changes
CodyCBakerPhD Jan 17, 2022
f5d4e22
Merge branch 'main' into add_context_class_movie
Saksham20 Jan 18, 2022
d7f7a64
keeping videocapture context as open
Saksham20 Jan 19, 2022
7f79717
asserting module name and description only if under processing
Saksham20 Jan 19, 2022
30f2a0a
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Jan 19, 2022
22bc5bc
Merge branch 'main' into add_context_class_movie
CodyCBakerPhD Jan 19, 2022
64cf463
restructure tests: rename, remove non test method
Saksham20 Jan 21, 2022
6775cd3
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Jan 21, 2022
8141131
Automated changes
CodyCBakerPhD Jan 21, 2022
68f757b
Merge branch 'main' into add_context_class_movie
Saksham20 Jan 21, 2022
4c7b9fe
fix starting times fixture error
Saksham20 Jan 21, 2022
bdd674a
fixture to create a temp directory, but fixes
Saksham20 Jan 21, 2022
a9693a2
Automated changes
CodyCBakerPhD Jan 21, 2022
1330fe5
using video context manager for chunk_data=False
Saksham20 Jan 21, 2022
8bc7f32
using parameterization for chunk data
Saksham20 Jan 21, 2022
9b29954
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Jan 21, 2022
3bfd44b
Automated changes
CodyCBakerPhD Jan 21, 2022
c70f807
fixing Path.unlink missing_ok error
Saksham20 Jan 21, 2022
56952b6
Merge branch 'main' into add_context_class_movie
CodyCBakerPhD Jan 24, 2022
5aa7248
adding back datachunkiterator for chunk_data=False
Saksham20 Jan 25, 2022
f5ff1c5
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Jan 25, 2022
55e0eaa
Automated changes
CodyCBakerPhD Jan 25, 2022
af60a40
for iterating over videocapturecontext, open and close the video file…
Saksham20 Jan 25, 2022
e9aaa61
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Jan 25, 2022
7a863d0
Automated changes
CodyCBakerPhD Jan 25, 2022
f89ceaf
using unittests
Saksham20 Jan 26, 2022
1050c6f
Automated changes
CodyCBakerPhD Jan 26, 2022
cb64e80
adding teardown
Saksham20 Jan 26, 2022
c9792b4
iterable frame setting bug fix
Saksham20 Jan 26, 2022
82eb5f7
Merge branch 'main' into add_context_class_movie
Saksham20 Feb 1, 2022
4b902c7
using VideoWriter as an attribute of context
Saksham20 Feb 2, 2022
fdb2929
Automated changes
CodyCBakerPhD Feb 2, 2022
e506772
Merge branch 'add_context_class_pre_pr' into add_context_class_movie
Saksham20 Feb 11, 2022
ad96bc6
contextmanager not open, bug fix
Saksham20 Feb 11, 2022
97e159f
update session_start_time
Saksham20 Feb 11, 2022
08ddda4
Automated changes
CodyCBakerPhD Feb 11, 2022
cf5dd64
Merge branch 'add_context_class_pre_pr' into add_context_class_movie
Saksham20 Feb 11, 2022
73c323d
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Feb 11, 2022
b8db4c6
Automated changes
CodyCBakerPhD Feb 11, 2022
7fb405c
remove cv2 import
Saksham20 Feb 11, 2022
b3e62fe
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Feb 11, 2022
7c65f12
Automated changes
CodyCBakerPhD Feb 11, 2022
93e803c
Merge branch 'main' into add_context_class_movie
Saksham20 Feb 15, 2022
4c58e4a
manual update from main branch
Saksham20 Feb 18, 2022
984230c
remove old functions used to interact with movie files
Saksham20 Feb 18, 2022
f6fafbf
Automated changes
CodyCBakerPhD Feb 18, 2022
5b4d92f
black format
Saksham20 Feb 20, 2022
1c0a2fd
Automated changes
CodyCBakerPhD Feb 20, 2022
cb93993
pycharm reformat code
Saksham20 Feb 21, 2022
57893cb
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Feb 21, 2022
1a0ac5c
Automated changes
CodyCBakerPhD Feb 21, 2022
5b54cec
Merge branch 'main' into add_context_class_movie
CodyCBakerPhD Feb 21, 2022
3a37b53
Apply suggestions from code review
Saksham20 Feb 21, 2022
bb68d9c
adding compression options
Saksham20 Feb 21, 2022
8b8ca55
move VideoCapture import to module
Saksham20 Feb 21, 2022
5a1dbc9
enumerate bug
Saksham20 Feb 21, 2022
a9723d8
Automated changes
CodyCBakerPhD Feb 21, 2022
d01ee86
Apply suggestions from code review
Saksham20 Feb 21, 2022
a790aee
h5dataio arg name fix
Saksham20 Feb 21, 2022
1223aa6
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Feb 21, 2022
cc4d38c
Automated changes
CodyCBakerPhD Feb 21, 2022
27f58e4
Merge branch 'main' into add_context_class_movie
CodyCBakerPhD Feb 21, 2022
921b050
custom starting times for external files mode
Saksham20 Feb 21, 2022
3c1d099
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Feb 21, 2022
6c48c3d
Automated changes
CodyCBakerPhD Feb 21, 2022
bfbd540
iterable as np array
Saksham20 Feb 21, 2022
dfe5061
code reduction
Saksham20 Feb 21, 2022
bf82066
Automated changes
CodyCBakerPhD Feb 21, 2022
afbde83
starting_times assertion for external mode false
Saksham20 Feb 21, 2022
d4f10bc
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Feb 21, 2022
62b39d2
files list separate for imageserieskwargs unique
Saksham20 Feb 22, 2022
56c1125
Automated changes
CodyCBakerPhD Feb 22, 2022
0106584
Merge branch 'main' into add_context_class_movie
Saksham20 Feb 22, 2022
4179926
code review suggestions
Saksham20 Feb 22, 2022
f5d1fff
Automated changes
CodyCBakerPhD Feb 22, 2022
d9ade5e
H5DataIO args bug
Saksham20 Feb 22, 2022
33139a7
starting times as nan
Saksham20 Feb 22, 2022
bca08f1
trimming frames for stub test
Saksham20 Feb 22, 2022
ebb2ade
starting times as None
Saksham20 Feb 22, 2022
212fb41
fix starting _times error
Saksham20 Feb 23, 2022
b76d71f
tests for starting_time and multiple files
Saksham20 Feb 23, 2022
6abfb5d
Automated changes
CodyCBakerPhD Feb 23, 2022
0125ee9
assert starting times
Saksham20 Feb 23, 2022
3bd1423
Automated changes
CodyCBakerPhD Feb 23, 2022
ebcf479
assert starting times: tests
Saksham20 Feb 23, 2022
bc82e63
Merge remote-tracking branch 'origin/add_context_class_movie' into ad…
Saksham20 Feb 23, 2022
5e2d5a2
duplicate kwargs test update
Saksham20 Feb 23, 2022
bd67a7b
tests for stub_Test
Saksham20 Feb 23, 2022
c0df303
Automated changes
CodyCBakerPhD Feb 23, 2022
175 changes: 126 additions & 49 deletions nwb_conversion_tools/datainterfaces/behavior/movie/movie_utils.py
@@ -1,7 +1,9 @@
"""Authors: Cody Baker."""
"""Authors: Saksham Sharda, Cody Baker."""
from pathlib import Path
from typing import Union, Tuple

import numpy as np
from typing import Union
from tqdm import tqdm

try:
import cv2
@@ -13,50 +15,125 @@
PathType = Union[str, Path]


def get_movie_timestamps(movie_file: PathType):
"""
Return numpy array of the timestamps for a movie file.

Parameters
----------
movie_file : PathType
"""
cap = cv2.VideoCapture(str(movie_file))
timestamps = [cap.get(cv2.CAP_PROP_POS_MSEC)]
success, frame = cap.read()
while success:
timestamps.append(cap.get(cv2.CAP_PROP_POS_MSEC))
success, frame = cap.read()
cap.release()
return np.array(timestamps)


def get_movie_fps(movie_file: PathType):
"""
Return the internal frames per second (fps) for a movie file.

Parameters
----------
movie_file : PathType
"""
cap = cv2.VideoCapture(str(movie_file))
if int((cv2.__version__).split(".")[0]) < 3:
fps = cap.get(cv2.cv.CV_CAP_PROP_FPS)
else:
fps = cap.get(cv2.CAP_PROP_FPS)
cap.release()
return fps


def get_frame_shape(movie_file: PathType):
"""
Return the shape of frames from a movie file.

Parameters
----------
movie_file : PathType
"""
cap = cv2.VideoCapture(str(movie_file))
success, frame = cap.read()
cap.release()
return frame.shape
class VideoCaptureContext(cv2.VideoCapture):
def __init__(self, *args, stub=False, **kwargs):
self._args = args
self._kwargs = kwargs
super().__init__(*args, **kwargs)
self.stub = stub
self._current_frame = 0
self.frame_count = self.get_movie_frame_count()
self.fps = self.get_movie_fps()
self.frame = self.get_movie_frame(0)
assert self.frame is not None, "unable to read the movie file provided"

def get_movie_timestamps(self):
"""
Return numpy array of the timestamps for a movie file.

"""
if not self.isOpened():
raise ValueError("movie file is not open")
ts = [self.get(cv2.CAP_PROP_POS_MSEC)]
for i in tqdm(range(1, self.get_movie_frame_count()), desc="retrieving video timestamps"):
self.current_frame = i
ts.append(self.get(cv2.CAP_PROP_POS_MSEC))
self.current_frame = 0
return np.array(ts)

def get_movie_fps(self):
"""
Return the internal frames per second (fps) for a movie file.

"""
if int(cv2.__version__.split(".")[0]) < 3:
return self.get(cv2.cv.CV_CAP_PROP_FPS)
return self.get(cv2.CAP_PROP_FPS)

def get_frame_shape(self) -> Tuple:
"""
Return the shape of frames from a movie file.
"""
return self.frame.shape

def get_movie_frame_count(self):
"""
Return the total number of frames for a movie file.

"""
if self.stub:
# if stub, assume a max frame count of 10
return 10
if int(cv2.__version__.split(".")[0]) < 3:
count = self.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT)
else:
count = self.get(cv2.CAP_PROP_FRAME_COUNT)
return int(count)

@property
def current_frame(self):
return self._current_frame

@current_frame.setter
def current_frame(self, frame_no):
if int(cv2.__version__.split(".")[0]) < 3:
set_arg = cv2.cv.CV_CAP_PROP_POS_FRAMES
else:
set_arg = cv2.CAP_PROP_POS_FRAMES
set_value = self.set(set_arg, frame_no)
if set_value:
self._current_frame = frame_no
else:
raise ValueError(f"could not set frame no {frame_no}")

def get_movie_frame(self, frame_no: int):
"""
Return the specific frame from a movie.
"""
if not self.isOpened():
raise ValueError("movie file is not open")
assert frame_no < self.get_movie_frame_count(), "frame number is greater than length of movie"
self.current_frame = frame_no
success, frame = self.read()
self.current_frame = 0
if success:
return frame
elif frame_no > 0:
return np.nan * np.ones(self.get_frame_shape())

def get_movie_frame_dtype(self):
"""
Return the dtype for frame in a movie file.
"""
return self.frame.dtype

def __iter__(self):
return self

def __next__(self):
if not self.isOpened():
raise StopIteration
try:
if self.current_frame < self.frame_count:
success, frame = self.read()
self.current_frame += 1
if success:
return frame
else:
return np.nan * np.ones(self.get_frame_shape())
else:
self.current_frame = 0
raise StopIteration
except Exception:
raise StopIteration

def __enter__(self):
if not self.isOpened():
super().__init__(*self._args, **self._kwargs)
return self

def __exit__(self, *args):
self.release()

def __del__(self):
self.release()
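The `__enter__`/`__exit__` pattern above is slightly unusual: because `cv2.VideoCapture` can be released and then re-initialized, the context saves its constructor arguments and re-opens itself on entry if the capture was closed. A minimal sketch of the same pattern, with a hypothetical `Resource` class standing in for `cv2.VideoCapture` so it runs without OpenCV:

```python
# Minimal sketch of the re-openable context-manager pattern used by
# VideoCaptureContext. "Resource" is a hypothetical stand-in for
# cv2.VideoCapture: it can be (re)initialized, queried, and released.
class Resource:
    def __init__(self, path):
        self.path = path
        self.opened = True

    def isOpened(self):
        return self.opened

    def release(self):
        self.opened = False


class ResourceContext(Resource):
    def __init__(self, *args, **kwargs):
        # remember constructor args so __enter__ can re-open after release()
        self._args = args
        self._kwargs = kwargs
        super().__init__(*args, **kwargs)

    def __enter__(self):
        if not self.isOpened():
            super().__init__(*self._args, **self._kwargs)
        return self

    def __exit__(self, *exc):
        self.release()


# the same object can safely enter the context multiple times:
res = ResourceContext("movie.avi")
with res:
    assert res.isOpened()
assert not res.isOpened()
with res:  # re-opened via the saved constructor args
    assert res.isOpened()
```

The real `VideoCaptureContext` additionally releases in `__del__`, so a capture left open is still closed when the object is garbage-collected.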
@@ -11,12 +11,10 @@
from hdmf.backends.hdf5.h5_utils import H5DataIO
from hdmf.data_utils import DataChunkIterator

from .movie_utils import get_movie_timestamps, get_movie_fps, get_frame_shape
from ....basedatainterface import BaseDataInterface
from ....utils.conversion_tools import check_regular_timestamps, get_module
from ....utils.json_schema import get_schema_from_hdmf_class, get_base_schema


try:
import cv2

@@ -83,6 +81,7 @@ def run_conversion(
chunk_data: bool = True,
module_name: Optional[str] = None,
module_description: Optional[str] = None,
compression: str = "gzip",
@CodyCBakerPhD (Member) commented on Feb 21, 2022:

Suggested change:

    compression: str = "gzip",

to:

    compression: Optional[str] = "gzip",
    compression_options: Optional[dict] = None,

Might as well include compression options as well. Also, compression should be optional (i.e., allow None) if the user so chooses.

@Saksham20 (Contributor, Author) replied:

Yes, thanks for that. Indeed, we can pass compression options through to H5DataIO: bb68d9c

@CodyCBakerPhD (Member) replied:

Of course, it will need aliasing in the actual H5DataIO call, since there it is called compression_opts (less descriptive, imo).
):
"""
Convert the movie data files to ImageSeries and write them in the NWBFile.
@@ -105,6 +104,8 @@
and may contain most keywords normally accepted by an ImageSeries
(https://pynwb.readthedocs.io/en/stable/pynwb.image.html#pynwb.image.ImageSeries).
The list for the 'Movies' key should correspond one to one to the movie files in the file_paths list.
If multiple movies need to go into the same ImageSeries, supply the same value for the "name" key.
Multiple movies in the same ImageSeries are only supported if 'external_mode'=True.
stub_test : bool, optional
If True, truncates the write operation for fast testing. The default is False.
external_mode : bool, optional
Expand All @@ -128,6 +129,8 @@ def run_conversion(
If the processing module specified by module_name does not exist, it will be created with this description.
The default description is the same as used by the conversion_tools.get_module function.
"""
from .movie_utils import VideoCaptureContext

file_paths = self.source_data["file_paths"]

if stub_test:
@@ -151,88 +154,81 @@
f"({len(image_series_kwargs_list)}) vs. file_paths ({len(self.source_data['file_paths'])})!"
)

for j, file in enumerate(file_paths):
timestamps = starting_times[j] + get_movie_timestamps(movie_file=file)

if len(starting_times) != len(file_paths):
starting_times.append(timestamps[-1])
# check for duplicates in image_series_kwargs list keys:
def _check_duplicates(image_series_kwargs_list):
image_series_kwargs_list_keys = [i["name"] for i in image_series_kwargs_list]
if len(set(image_series_kwargs_list_keys)) < len(image_series_kwargs_list_keys):
assert external_mode, "for multiple video files under the same ImageSeries name, use external_mode=True"
keys_set = []
image_series_kwargs_list_unique = []
for no, image_series_kwargs in enumerate(image_series_kwargs_list):
if image_series_kwargs["name"] not in keys_set:
keys_set.append(image_series_kwargs["name"])
image_series_kwargs_list_unique.append(dict(image_series_kwargs, data=[file_paths[no]]))
else:
idx = keys_set.index(image_series_kwargs["name"])
image_series_kwargs_list_unique[idx]["data"].append(file_paths[no])
return image_series_kwargs_list_unique

image_series_kwargs = dict(image_series_kwargs_list[j])
if check_regular_timestamps(ts=timestamps):
fps = get_movie_fps(movie_file=file)
image_series_kwargs.update(starting_time=starting_times[j], rate=fps)
else:
image_series_kwargs.update(timestamps=H5DataIO(timestamps, compression="gzip"))
image_series_kwargs_list_updated = _check_duplicates(image_series_kwargs_list)

for j, image_series_kwargs in enumerate(image_series_kwargs_list_updated):
file_list = image_series_kwargs.pop("data")
if external_mode:
image_series_kwargs.update(format="external", external_file=[file])
image_series_kwargs.update(format="external", external_file=file_list)
with VideoCaptureContext(str(file_list[0]), stub=stub_test) as vc:
fps = vc.get_movie_fps()
image_series_kwargs.update(starting_time=0.0, rate=fps) # TODO manage custom starting_times
else:
file = file_list[0]
uncompressed_estimate = Path(file).stat().st_size * 70
available_memory = psutil.virtual_memory().available
if not chunk_data and uncompressed_estimate >= available_memory:
if not chunk_data and not stub_test and uncompressed_estimate >= available_memory:
warn(
f"Not enough memory (estimated {round(uncompressed_estimate/1e9, 2)} GB) to load movie file as "
f"array ({round(available_memory/1e9, 2)} GB available)! Forcing chunk_data to True."
)
chunk_data = True

total_frames = len(timestamps)
frame_shape = get_frame_shape(movie_file=file)
maxshape = [total_frames]
maxshape.extend(frame_shape)
best_gzip_chunk = (1, frame_shape[0], frame_shape[1], 3)
tqdm_pos, tqdm_mininterval = (0, 10)
video_capture_ob = VideoCaptureContext(str(file), stub=stub_test)
total_frames = video_capture_ob.get_movie_frame_count()
if chunk_data:

def data_generator(file, count_max):
cap = cv2.VideoCapture(str(file))
for _ in range(min(count_max, total_frames)):
success, frame = cap.read()
yield frame
cap.release()

mov = DataChunkIterator(
data=tqdm(
iterable=data_generator(file=file, count_max=count_max),
desc=f"Copying movie data for {Path(file).name}",
position=tqdm_pos,
total=total_frames,
mininterval=tqdm_mininterval,
),
frame_shape = video_capture_ob.get_frame_shape()
maxshape = (total_frames, *frame_shape)
best_gzip_chunk = (1, frame_shape[0], frame_shape[1], 3)
iterable = DataChunkIterator.from_iterable(
data=video_capture_ob,
iter_axis=0, # nwb standard is time as zero axis
maxshape=tuple(maxshape),
dtype=video_capture_ob.get_movie_frame_dtype(),
)
image_series_kwargs.update(data=H5DataIO(mov, compression="gzip", chunks=best_gzip_chunk))
data = H5DataIO(iterable, compression=compression, chunks=best_gzip_chunk)
else:
cap = cv2.VideoCapture(str(file))
mov = []
iterable = []
tqdm_pos, tqdm_mininterval = (0, 10)
with tqdm(
desc=f"Reading movie data for {Path(file).name}",
position=tqdm_pos,
total=total_frames,
mininterval=tqdm_mininterval,
) as pbar:
for _ in range(min(count_max, total_frames)):
success, frame = cap.read()
mov.append(frame)
for frame in video_capture_ob:
iterable.append(frame)
pbar.update(1)
cap.release()
image_series_kwargs.update(
data=H5DataIO(
DataChunkIterator(
tqdm(
iterable=np.array(mov),
desc=f"Writing movie data for {Path(file).name}",
position=tqdm_pos,
mininterval=tqdm_mininterval,
),
iter_axis=0, # nwb standard is time as zero axis
maxshape=tuple(maxshape),
),
compression="gzip",
chunks=best_gzip_chunk,
)
)
iterable = np.array(iterable)

data = H5DataIO(iterable, compression=compression)

# capture data in kwargs:
image_series_kwargs.update(data=data)
timestamps = starting_times[j] + video_capture_ob.get_movie_timestamps()
if len(starting_times) != len(file_paths):
starting_times.append(timestamps[-1])
if check_regular_timestamps(ts=timestamps):
fps = video_capture_ob.get_movie_fps()
image_series_kwargs.update(starting_time=starting_times[j], rate=fps)
else:
image_series_kwargs.update(timestamps=timestamps)

if module_name is None:
nwbfile.add_acquisition(ImageSeries(**image_series_kwargs))
else:
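Per the review thread above, the interface-level `compression`/`compression_options` arguments need renaming when forwarded to `H5DataIO`, where the options argument is called `compression_opts`. A pure-Python sketch of that kwargs mapping (`build_h5dataio_kwargs` is a hypothetical helper; hdmf itself is not imported here):

```python
# Sketch of the aliasing discussed in the review thread: the interface
# exposes "compression_options", while hdmf's H5DataIO calls the same
# thing "compression_opts". A small helper keeps the rename in one place,
# and omits compression keys entirely when compression is None.
def build_h5dataio_kwargs(data, compression="gzip", compression_options=None):
    kwargs = {"data": data}
    if compression is not None:
        kwargs["compression"] = compression
        if compression_options is not None:
            kwargs["compression_opts"] = compression_options
    return kwargs


# e.g. H5DataIO(**build_h5dataio_kwargs(iterable, "gzip", 4))
assert build_h5dataio_kwargs([1, 2], "gzip", 4) == {
    "data": [1, 2],
    "compression": "gzip",
    "compression_opts": 4,
}
assert build_h5dataio_kwargs([1, 2], compression=None) == {"data": [1, 2]}
```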
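The `_check_duplicates` helper in the diff merges `image_series_kwargs` entries that share a `name`, collecting their file paths into a single `data` list (valid only with `external_mode=True`). The grouping logic can be sketched standalone; unlike the original, this version takes `file_paths` as an explicit argument rather than closing over `run_conversion`'s local variable:

```python
# Standalone sketch of the kwargs-deduplication logic: entries sharing a
# "name" are merged, and their corresponding file paths are collected
# under one "data" list, preserving first-seen order.
def deduplicate_by_name(kwargs_list, file_paths):
    seen = {}      # name -> merged dict
    ordered = []   # preserves first-seen order of names
    for kwargs, path in zip(kwargs_list, file_paths):
        name = kwargs["name"]
        if name not in seen:
            merged = dict(kwargs, data=[path])
            seen[name] = merged
            ordered.append(merged)
        else:
            seen[name]["data"].append(path)
    return ordered


result = deduplicate_by_name(
    [{"name": "Video A"}, {"name": "Video A"}, {"name": "Video B"}],
    ["a1.avi", "a2.avi", "b.avi"],
)
assert result == [
    {"name": "Video A", "data": ["a1.avi", "a2.avi"]},
    {"name": "Video B", "data": ["b.avi"]},
]
```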