From f4701823c83103d78f00cb2ba336c83e060a68f6 Mon Sep 17 00:00:00 2001
From: Lucas Gautheron <lucas.gautheron@gmail.com>
Date: Sat, 11 Sep 2021 18:18:35 +0200
Subject: [PATCH] compute-durations should be usable even when not all
 recordings are locally available

---
 ChildProject/cmdline.py            | 27 +++++++++++++-----------
 ChildProject/pipelines/samplers.py |  4 ++--
 ChildProject/projects.py           | 33 ++++++++++++++++--------------
 ChildProject/utils.py              |  4 ++--
 4 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py
index 231af0f83..0b45fec9d 100755
--- a/ChildProject/cmdline.py
+++ b/ChildProject/cmdline.py
@@ -430,7 +430,7 @@ def overview(args):
 )
 def compute_durations(args):
     """creates a 'duration' column into metadata/recordings"""
-    project = ChildProject(args.source)
+    project = ChildProject(args.source, primary_metadata_only=True)
 
     errors, warnings = project.validate()
 
@@ -441,25 +441,28 @@ def compute_durations(args):
         )
         print("trying to pursue anyway, but expect failures")
 
-    if "duration" in project.recordings.columns:
+    recordings = project.recordings.copy()
+    recordings.set_index("recording_filename", inplace=True)
+    columns = recordings.columns.tolist()  # list, not Index: Index.append is not in-place and rejects str
+
+    if "duration" in columns:
         if not args.force:
             print("duration exists, aborting")
             return
+    else:
+        recordings["duration"] = 0
+        columns.append("duration")
 
-        project.recordings.drop(columns=["duration"], inplace=True)
+    durations = project.compute_recordings_duration(profile=args.profile).set_index(
+        "recording_filename"
+    )
 
-    durations = project.compute_recordings_duration(profile=args.profile).dropna()
+    recordings.update(durations)
 
-    recordings = project.recordings.merge(
-        durations[durations["recording_filename"] != "NA"],
-        how="left",
-        left_on="recording_filename",
-        right_on="recording_filename",
-    )
     recordings["duration"].fillna(0, inplace=True)
     recordings["duration"] = recordings["duration"].astype(int)
-    recordings.to_csv(
-        os.path.join(project.path, "metadata/recordings.csv"), index=False
+    recordings[columns].to_csv(
+        os.path.join(project.path, "metadata/recordings.csv")
     )
 
 
diff --git a/ChildProject/pipelines/samplers.py b/ChildProject/pipelines/samplers.py
index 5bbeac9b5..3a2e9251d 100644
--- a/ChildProject/pipelines/samplers.py
+++ b/ChildProject/pipelines/samplers.py
@@ -239,8 +239,8 @@ def _sample(self):
 
             durations = self.project.compute_recordings_duration(self.profile).dropna()
             recordings = recordings.merge(
-                durations[durations["recording_filename"] != "NA"],
-                how="left",
+                durations,
+                how="inner",
                 left_on="recording_filename",
                 right_on="recording_filename",
             )
diff --git a/ChildProject/projects.py b/ChildProject/projects.py
index c431d500c..04408ba8d 100644
--- a/ChildProject/projects.py
+++ b/ChildProject/projects.py
@@ -207,13 +207,15 @@ class ChildProject:
 
     PROJECT_FOLDERS = ["recordings", "annotations", "metadata", "doc", "scripts"]
 
-    def __init__(self, path: str):
+    def __init__(self, path: str, primary_metadata_only: bool = False):
         """Constructor
 
         :param path: path to the root of the dataset.
         :type path: str
         """
         self.path = path
+        self.primary_metadata_only = primary_metadata_only
+
         self.errors = []
         self.warnings = []
         self.children = None
@@ -314,19 +316,20 @@ def read(self, verbose=False):
         self.recordings = self.rt.read()
 
         # accumulate additional metadata (optional)
-        self.ct.df = self.accumulate_metadata(
-            "children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose
-        )
-        self.rt.df = self.accumulate_metadata(
-            "recordings",
-            self.recordings,
-            self.RECORDINGS_COLUMNS,
-            "recording_filename",
-            verbose,
-        )
+        if not self.primary_metadata_only:
+            self.ct.df = self.accumulate_metadata(
+                "children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose
+            )
+            self.rt.df = self.accumulate_metadata(
+                "recordings",
+                self.recordings,
+                self.RECORDINGS_COLUMNS,
+                "recording_filename",
+                verbose,
+            )
 
-        self.children = self.ct.df
-        self.recordings = self.rt.df
+            self.children = self.ct.df
+            self.recordings = self.rt.df
 
     def validate(self, ignore_files: bool = False) -> tuple:
         """Validate a dataset, returning all errors and warnings.
@@ -536,7 +539,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:
 
         :param profile: name of the profile of recordings to compute the duration from. If None, raw recordings are used. defaults to None
         :type profile: str, optional
-        :return: dataframe of the recordings, with an additional/updated duration columns.
+        :return: dataframe of the recordings, with an additional/updated duration column. Recordings for which the duration could not be retrieved are dropped.
         :rtype: pd.DataFrame
         """
         recordings = self.recordings[["recording_filename"]]
@@ -546,7 +549,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:
                 lambda f: get_audio_duration(self.get_recording_path(f, profile))
             )
         )
-        recordings["duration"].fillna(0, inplace=True)
+        recordings.dropna(inplace=True)
         recordings["duration"] = (recordings["duration"] * 1000).astype(int)
 
         return recordings
diff --git a/ChildProject/utils.py b/ChildProject/utils.py
index 9b764433c..baaafff0b 100644
--- a/ChildProject/utils.py
+++ b/ChildProject/utils.py
@@ -50,9 +50,9 @@ def get_audio_duration(filename):
     import sox
 
     if not os.path.exists(filename):
-        return 0
+        return None
 
-    duration = 0
+    duration = None
     try:
         duration = sox.file_info.duration(filename)
     except: