LAAC-LSCP · lucasgautheron · Sep 11, 2021 · Jul 25, 2022
diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py
@@ -482,29 +482,32 @@ def explain(args):
 )
 def compute_durations(args):
     """creates a 'duration' column into metadata/recordings. duration is in ms"""
-    project = ChildProject(args.source)
+    project = ChildProject(args.source, primary_metadata_only=True)  # skip accumulated extra metadata
 
     perform_validation(project, require_success=True, ignore_recordings=True)
 
-    if "duration" in project.recordings.columns:
+    recordings = project.recordings.copy()  # work on a copy, leave project.recordings untouched
+    recordings.set_index("recording_filename", inplace=True)
+    columns = list(recordings.columns)  # plain list: Index.append() is not in-place and rejects str
+
+    if "duration" in columns:
         if not args.force:
             print("duration exists, aborting")
             return
+    else:
+        recordings["duration"] = 0  # default for recordings whose duration cannot be computed
+        columns.append("duration")
 
-        project.recordings.drop(columns=["duration"], inplace=True)
+    durations = project.compute_recordings_duration(profile=args.profile).set_index(
+        "recording_filename"
+    )
 
-    durations = project.compute_recordings_duration(profile=args.profile).dropna()
+    recordings.update(durations)  # aligned on the recording_filename index
 
-    recordings = project.recordings.merge(
-        durations[durations["recording_filename"] != "NA"],
-        how="left",
-        left_on="recording_filename",
-        right_on="recording_filename",
-    )
     recordings["duration"].fillna(0, inplace=True)
     recordings["duration"] = recordings["duration"].astype(int)
-    recordings.to_csv(
-        os.path.join(project.path, "metadata/recordings.csv"), index=False
+    recordings[columns].to_csv(
+        os.path.join(project.path, "metadata/recordings.csv")
     )
 
 @subcommand(

diff --git a/ChildProject/pipelines/samplers.py b/ChildProject/pipelines/samplers.py
@@ -236,8 +236,8 @@ def _sample(self):
 
             durations = self.project.compute_recordings_duration(self.profile).dropna()
             recordings = recordings.merge(
-                durations[durations["recording_filename"] != "NA"],
-                how="left",
+                durations,
+                how="inner",
                 left_on="recording_filename",
                 right_on="recording_filename",
             )

diff --git a/ChildProject/projects.py b/ChildProject/projects.py
@@ -266,11 +266,17 @@ class ChildProject:
     PROJECT_FOLDERS = ["recordings", "annotations", "metadata", "doc", "scripts"]
 
     def __init__(
-        self, path: str, enforce_dtypes: bool = False, ignore_discarded: bool = False
+        self, path: str, enforce_dtypes: bool = False, ignore_discarded: bool = False, primary_metadata_only: bool = False
     ):
+        """Constructor
+
+        :param path: path to the root of the dataset.
+        :type path: str
+        """
         self.path = path
         self.enforce_dtypes = enforce_dtypes
         self.ignore_discarded = ignore_discarded
+        self.primary_metadata_only = primary_metadata_only
 
         self.errors = []
         self.warnings = []
@@ -374,25 +380,26 @@ def read(self, verbose=False):
         self.recordings = self.rt.read()
 
         # accumulate additional metadata (optional)
-        self.ct.df = self.accumulate_metadata(
-            "children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose
-        )
-        self.rt.df = self.accumulate_metadata(
-            "recordings",
-            self.recordings,
-            self.RECORDINGS_COLUMNS,
-            "recording_filename",
-            verbose,
-        )
+        if not self.primary_metadata_only:
+            self.ct.df = self.accumulate_metadata(
+                "children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose
+            )
+            self.rt.df = self.accumulate_metadata(
+                "recordings",
+                self.recordings,
+                self.RECORDINGS_COLUMNS,
+                "recording_filename",
+                verbose,
+            )
 
-        if self.ignore_discarded and "discard" in self.ct.df:
-            self.ct.df = self.ct.df[self.ct.df["discard"].astype(str) == "1"]
+            if self.ignore_discarded and "discard" in self.ct.df:
+                self.ct.df = self.ct.df[self.ct.df["discard"].astype(str) == "1"]
 
-        if self.ignore_discarded and "discard" in self.rt.df:
-            self.rt.df = self.rt.df[self.rt.df["discard"].astype(str) == "1"]
+            if self.ignore_discarded and "discard" in self.rt.df:
+                self.rt.df = self.rt.df[self.rt.df["discard"].astype(str) == "1"]
 
-        self.children = self.ct.df
-        self.recordings = self.rt.df
+            self.children = self.ct.df
+            self.recordings = self.rt.df
 
     def validate(self, ignore_recordings: bool = False, profile: str = None) -> tuple:
         """Validate a dataset, returning all errors and warnings.
@@ -651,7 +658,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:
 
         :param profile: name of the profile of recordings to compute the duration from. If None, raw recordings are used. defaults to None
         :type profile: str, optional
-        :return: dataframe of the recordings, with an additional/updated duration columns.
+        :return: dataframe of the recordings, with an additional/updated duration column. Recordings for which the duration could not be retrieved are dropped.
         :rtype: pd.DataFrame
         """
         recordings = self.recordings[["recording_filename"]]
@@ -661,7 +668,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:
                 lambda f: get_audio_duration(self.get_recording_path(f, profile))
             )
         )
-        recordings["duration"].fillna(0, inplace=True)
+        recordings.dropna(inplace=True)
         recordings["duration"] = (recordings["duration"] * 1000).astype(int)
 
         return recordings

diff --git a/ChildProject/utils.py b/ChildProject/utils.py
@@ -99,9 +99,9 @@ def get_audio_duration(filename):
     import sox
 
     if not os.path.exists(filename):
-        return 0
+        return None
 
-    duration = 0
+    duration = None
     try:
         duration = sox.file_info.duration(filename)
     except: