Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compute-durations update #280

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions ChildProject/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,29 +482,32 @@ def explain(args):
)
def compute_durations(args):
"""creates a 'duration' column into metadata/recordings. duration is in ms"""
project = ChildProject(args.source)
project = ChildProject(args.source, primary_metadata_only=True)

perform_validation(project, require_success=True, ignore_recordings=True)

if "duration" in project.recordings.columns:
recordings = project.recordings.copy()
recordings.set_index("recording_filename", inplace=True)
columns = recordings.columns.copy()

if "duration" in columns:
if not args.force:
print("duration exists, aborting")
return
else:
recordings["duration"] = 0
columns.append("duration")

project.recordings.drop(columns=["duration"], inplace=True)
durations = project.compute_recordings_duration(profile=args.profile).set_index(
"recording_filename"
)

durations = project.compute_recordings_duration(profile=args.profile).dropna()
recordings.update(durations)

recordings = project.recordings.merge(
durations[durations["recording_filename"] != "NA"],
how="left",
left_on="recording_filename",
right_on="recording_filename",
)
recordings["duration"].fillna(0, inplace=True)
recordings["duration"] = recordings["duration"].astype(int)
recordings.to_csv(
os.path.join(project.path, "metadata/recordings.csv"), index=False
recordings[columns].to_csv(
os.path.join(project.path, "metadata/recordings.csv")
)

@subcommand(
Expand Down
4 changes: 2 additions & 2 deletions ChildProject/pipelines/samplers.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ def _sample(self):

durations = self.project.compute_recordings_duration(self.profile).dropna()
recordings = recordings.merge(
durations[durations["recording_filename"] != "NA"],
how="left",
durations,
how="inner",
left_on="recording_filename",
right_on="recording_filename",
)
Expand Down
45 changes: 26 additions & 19 deletions ChildProject/projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,17 @@ class ChildProject:
PROJECT_FOLDERS = ["recordings", "annotations", "metadata", "doc", "scripts"]

def __init__(
self, path: str, enforce_dtypes: bool = False, ignore_discarded: bool = False
self, path: str, enforce_dtypes: bool = False, ignore_discarded: bool = False, primary_metadata_only: bool = False
):
"""Constructor

:param path: path to the root of the dataset.
:type path: str
"""
self.path = path
self.enforce_dtypes = enforce_dtypes
self.ignore_discarded = ignore_discarded
self.primary_metadata_only = primary_metadata_only

self.errors = []
self.warnings = []
Expand Down Expand Up @@ -374,25 +380,26 @@ def read(self, verbose=False):
self.recordings = self.rt.read()

# accumulate additional metadata (optional)
self.ct.df = self.accumulate_metadata(
"children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose
)
self.rt.df = self.accumulate_metadata(
"recordings",
self.recordings,
self.RECORDINGS_COLUMNS,
"recording_filename",
verbose,
)
if not self.primary_metadata_only:
self.ct.df = self.accumulate_metadata(
"children", self.children, self.CHILDREN_COLUMNS, "child_id", verbose
)
self.rt.df = self.accumulate_metadata(
"recordings",
self.recordings,
self.RECORDINGS_COLUMNS,
"recording_filename",
verbose,
)

if self.ignore_discarded and "discard" in self.ct.df:
self.ct.df = self.ct.df[self.ct.df["discard"].astype(str) == "1"]
if self.ignore_discarded and "discard" in self.ct.df:
self.ct.df = self.ct.df[self.ct.df["discard"].astype(str) == "1"]

if self.ignore_discarded and "discard" in self.rt.df:
self.rt.df = self.rt.df[self.rt.df["discard"].astype(str) == "1"]
if self.ignore_discarded and "discard" in self.rt.df:
self.rt.df = self.rt.df[self.rt.df["discard"].astype(str) == "1"]

self.children = self.ct.df
self.recordings = self.rt.df
self.children = self.ct.df
self.recordings = self.rt.df

def validate(self, ignore_recordings: bool = False, profile: str = None) -> tuple:
"""Validate a dataset, returning all errors and warnings.
Expand Down Expand Up @@ -651,7 +658,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:

:param profile: name of the profile of recordings to compute the duration from. If None, raw recordings are used. defaults to None
:type profile: str, optional
:return: dataframe of the recordings, with an additional/updated duration columns.
:return: dataframe of the recordings, with an additional/updated duration column. Recordings for which the duration could not be retrieved are dropped.
:rtype: pd.DataFrame
"""
recordings = self.recordings[["recording_filename"]]
Expand All @@ -661,7 +668,7 @@ def compute_recordings_duration(self, profile: str = None) -> pd.DataFrame:
lambda f: get_audio_duration(self.get_recording_path(f, profile))
)
)
recordings["duration"].fillna(0, inplace=True)
recordings.dropna(inplace=True)
recordings["duration"] = (recordings["duration"] * 1000).astype(int)

return recordings
Expand Down
4 changes: 2 additions & 2 deletions ChildProject/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ def get_audio_duration(filename):
import sox

if not os.path.exists(filename):
return 0
return None

duration = 0
duration = None
try:
duration = sox.file_info.duration(filename)
except:
Expand Down