From f4701823c83103d78f00cb2ba336c83e060a68f6 Mon Sep 17 00:00:00 2001 From: Lucas Gautheron Date: Sat, 11 Sep 2021 18:18:35 +0200 Subject: [PATCH] compute-durations should be usable even when not all recordings are locally available --- ChildProject/cmdline.py | 27 +++++++++++++----------- ChildProject/pipelines/samplers.py | 4 ++-- ChildProject/projects.py | 33 ++++++++++++++++-------------- ChildProject/utils.py | 4 ++-- 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py index 231af0f83..0b45fec9d 100755 --- a/ChildProject/cmdline.py +++ b/ChildProject/cmdline.py @@ -430,7 +430,7 @@ def overview(args): ) def compute_durations(args): """creates a 'duration' column into metadata/recordings""" - project = ChildProject(args.source) + project = ChildProject(args.source, primary_metadata_only=True) errors, warnings = project.validate() @@ -441,25 +441,28 @@ def compute_durations(args): ) print("trying to pursue anyway, but expect failures") - if "duration" in project.recordings.columns: + recordings = project.recordings.copy() + recordings.set_index("recording_filename", inplace=True) + columns = recordings.columns.copy() + + if "duration" in columns: if not args.force: print("duration exists, aborting") return + else: + recordings["duration"] = 0 + columns.append("duration") - project.recordings.drop(columns=["duration"], inplace=True) + durations = project.compute_recordings_duration(profile=args.profile).set_index( + "recording_filename" + ) - durations = project.compute_recordings_duration(profile=args.profile).dropna() + recordings.update(durations) - recordings = project.recordings.merge( - durations[durations["recording_filename"] != "NA"], - how="left", - left_on="recording_filename", - right_on="recording_filename", - ) recordings["duration"].fillna(0, inplace=True) recordings["duration"] = recordings["duration"].astype(int) - recordings.to_csv( - os.path.join(project.path, "metadata/recordings.csv"), index=False + recordings[columns].to_csv( + os.path.join(project.path, "metadata/recordings.csv") ) diff --git a/ChildProject/pipelines/samplers.py b/ChildProject/pipelines/samplers.py index 5bbeac9b5..3a2e9251d 100644 --- a/ChildProject/pipelines/samplers.py +++ b/ChildProject/pipelines/samplers.py @@ -239,8 +239,8 @@ def _sample(self): durations = self.project.compute_recordings_duration(self.profile).dropna() recordings = recordings.merge( - durations[durations["recording_filename"] != "NA"], - how="left", + durations, + how="inner", left_on="recording_filename", right_on="recording_filename", ) diff --git a/ChildProject/projects.py b/ChildProject/projects.py index c431d500c..04408ba8d 100644 --- a/ChildProject/projects.py +++ b/ChildProject/projects.py @@ -207,13 +207,15 @@ class ChildProject: PROJECT_FOLDERS = ["recordings", "annotations", "metadata", "doc", "scripts"] - def __init__(self, path: str): + def __init__(self, path: str, primary_metadata_only: bool = False): """Constructor :param path: path to the root of the dataset. :type path: str """ self.path = path + self.primary_metadata_only = primary_metadata_only + self.errors = [] self.warnings = [] self.children = None @@ -314,19 +316,20 @@ def read(self, verbose=False): self.recordings = self.rt.read() # accumulate additional metadata (optional) - self.ct.df = self.accumulate_metadata( - "children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose - ) - self.rt.df = self.accumulate_metadata( - "recordings", - self.recordings, - self.RECORDINGS_COLUMNS, - "recording_filename", - verbose, - ) + if not self.primary_metadata_only: + self.ct.df = self.accumulate_metadata( + "children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose + ) + self.rt.df = self.accumulate_metadata( + "recordings", + self.recordings, + self.RECORDINGS_COLUMNS, + "recording_filename", + verbose, + ) - self.children = self.ct.df - self.recordings = self.rt.df + self.children = self.ct.df + self.recordings = self.rt.df def validate(self, ignore_files: bool = False) -> tuple: """Validate a dataset, returning all errors and warnings. @@ -536,7 +539,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame: :param profile: name of the profile of recordings to compute the duration from. If None, raw recordings are used. defaults to None :type profile: str, optional - :return: dataframe of the recordings, with an additional/updated duration columns. + :return: dataframe of the recordings, with an additional/updated duration columns. drops recordings for which the duration could not be retrieved. :rtype: pd.DataFrame """ recordings = self.recordings[["recording_filename"]] @@ -546,7 +549,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame: lambda f: get_audio_duration(self.get_recording_path(f, profile)) ) ) - recordings["duration"].fillna(0, inplace=True) + recordings.dropna(inplace=True) recordings["duration"] = (recordings["duration"] * 1000).astype(int) return recordings diff --git a/ChildProject/utils.py b/ChildProject/utils.py index 9b764433c..baaafff0b 100644 --- a/ChildProject/utils.py +++ b/ChildProject/utils.py @@ -50,9 +50,9 @@ def get_audio_duration(filename): import sox if not os.path.exists(filename): - return 0 + return None - duration = 0 + duration = None try: duration = sox.file_info.duration(filename) except: