From 90946b7eab0ea54b9af80b0a35602c8f15b4ef3d Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Thu, 5 Sep 2024 16:06:03 -0400
Subject: [PATCH 1/4] organize: do not parallelize for a single path

---
 dandi/organize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dandi/organize.py b/dandi/organize.py
index c7217437c..5d7f09b6c 100644
--- a/dandi/organize.py
+++ b/dandi/organize.py
@@ -859,7 +859,7 @@ def _get_metadata(path):
         return meta
 
     if (
-        not devel_debug and jobs != 1
+        not devel_debug and jobs != 1 and not len(paths) == 1
     ):  # Do not use joblib at all if number_of_jobs=1
         # Note: It is Python (pynwb) intensive, not IO, so ATM there is little
         # to no benefit from Parallel without using multiproc! But that would

From 6b579c04e872f20aaae671a669b32ed2449fed37 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Thu, 5 Sep 2024 16:31:20 -0400
Subject: [PATCH 2/4] Return and log exception information while loading
 metadata for organize

joblib does not automagically set up any logging for Parallel workers.
A dedicated issue was filed to possibly see that implemented:
https://github.com/dandi/dandi-cli/issues/1495

For the current use case (e.g. troubleshooting
https://github.com/dandi/dandi-cli/issues/1494) it should largely
suffice to return and log information about the exception raised while
loading metadata. That is what this PR does; with a buggy hdmf
installed we now get useful logging in the log file at the DEBUG level.

No attempt was made to deduplicate log messages, since the per-file
metadata would make each one unique anyway.
---
 dandi/organize.py | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/dandi/organize.py b/dandi/organize.py
index 5d7f09b6c..1f455a3b8 100644
--- a/dandi/organize.py
+++ b/dandi/organize.py
@@ -13,6 +13,7 @@
 import os.path as op
 from pathlib import Path, PurePosixPath
 import re
+import traceback
 import uuid
 
 import ruamel.yaml
@@ -841,22 +842,25 @@ def act(func, *args, **kwargs):
     # react to those
     # Doesn't play nice with Parallel
     # with tqdm.tqdm(desc="Files", total=len(paths), unit="file", unit_scale=False) as pbar:
-    failed = []
 
     def _get_metadata(path):
         # Avoid heavy import by importing within function:
         from .metadata.nwb import get_metadata
 
+        meta, exc = {}, None
         try:
             meta = get_metadata(path)
-        except Exception as exc:
-            meta = {}
-            failed.append(path)
-            # pbar.desc = "Files (%d failed)" % len(failed)
-            lgr.debug("Failed to get metadata for %s: %s", path, exc)
+        except Exception as e:
+            exc = (
+                e.__class__,
+                str(e),
+                traceback.TracebackException.from_exception(e),
+            )
+            # meta = {}
+            # lgr.debug("Failed to get metadata for %s: %s", path, exc)
         # pbar.update(1)
         meta["path"] = path
-        return meta
+        return meta, exc
 
     if (
         not devel_debug and jobs != 1 and not len(paths) == 1
@@ -864,21 +868,33 @@ def _get_metadata(path):
         # Note: It is Python (pynwb) intensive, not IO, so ATM there is little
         # to no benefit from Parallel without using multiproc! But that would
         # complicate progress bar indication... TODO
-        metadata = list(
+        metadata_excs = list(
             Parallel(n_jobs=jobs, verbose=10)(
                 delayed(_get_metadata)(path) for path in paths
             )
         )
     else:
-        metadata = list(map(_get_metadata, paths))
-    if failed:
+        metadata_excs = list(map(_get_metadata, paths))
+    exceptions = [e for _, e in metadata_excs if e]
+    if exceptions:
         lgr.warning(
-            "Failed to load metadata for %d out of %d files",
-            len(failed),
+            "Failed to load metadata for %d out of %d files "
+            "due to following types of exceptions: %s. "
+            "Details of the exceptions will be shown at DEBUG level",
+            len(exceptions),
             len(paths),
+            ", ".join(e[0].__name__ for e in exceptions),
         )
+    for m, e in metadata_excs:
+        if not e:
+            continue
+        lgr.debug(
+            "Loading metadata for path %s resulted in following exception:\n%s",
+            m["path"],
+            "\n".join(e[-1].format()),
+        )
 
-    metadata, skip_invalid = filter_invalid_metadata_rows(metadata)
+    metadata, skip_invalid = filter_invalid_metadata_rows([m for m, e in metadata_excs])
     if skip_invalid:
         msg = (
             "%d out of %d files were found not containing all necessary "
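A note on the pattern patch 2 adopts: each Parallel worker returns a (result, exception-info) tuple, and all logging happens in the parent process, since joblib does not forward log records from its workers. Below is a minimal, self-contained sketch of the same pattern; the _load helper, the sample paths, and the simulated failure are illustrative and not code from this PR.

import logging
import traceback

from joblib import Parallel, delayed

lgr = logging.getLogger("demo")


def _load(path):
    # Hypothetical stand-in for dandi's _get_metadata(): do the work and
    # capture any exception as a picklable summary instead of logging
    # inside the worker, where log records would be lost.
    result, exc = {}, None
    try:
        if path.endswith("bad.nwb"):  # simulate a failing reader
            raise ValueError("unreadable file")
        result = {"nbytes": 42}
    except Exception as e:
        exc = (
            e.__class__,
            str(e),
            traceback.TracebackException.from_exception(e),
        )
    result["path"] = path
    return result, exc


results_excs = Parallel(n_jobs=2)(
    delayed(_load)(p) for p in ["good.nwb", "bad.nwb"]
)
# Back in the parent process, logging works as usual.
for res, exc in results_excs:
    if exc:
        # TracebackException.format() yields newline-terminated lines.
        lgr.debug("Failed on %s:\n%s", res["path"], "".join(exc[-1].format()))

traceback.TracebackException.from_exception(e) captures the rendered traceback without holding references to live stack frames, so the captured triple stays small and picklable on its way back from joblib's worker processes.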
From 78b9beeb031595e446f83cda7bc1d63e9fefa076 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Thu, 5 Sep 2024 16:34:04 -0400
Subject: [PATCH 3/4] Do not allow HDMF 3.14.4, which has a regression

---
 setup.cfg | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 773264b1b..174b8632e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,8 @@ install_requires =
     etelemetry >= 0.2.2
     fasteners
     fscacher >= 0.3.0
-    hdmf != 3.5.0
+    # 3.14.4: https://github.com/hdmf-dev/hdmf/issues/1186
+    hdmf != 3.5.0,!=3.14.4
     humanize
     interleave ~= 0.1
     joblib

From 1aa3df8b7fbe19f6edcaa1f36524598f720ffef0 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Thu, 5 Sep 2024 21:28:50 -0400
Subject: [PATCH 4/4] Apply suggestions from code review: kill commented-out
 code + use _ for unused var

Co-authored-by: Austin Macdonald
---
 dandi/organize.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/dandi/organize.py b/dandi/organize.py
index 1f455a3b8..573b709d3 100644
--- a/dandi/organize.py
+++ b/dandi/organize.py
@@ -856,8 +856,6 @@ def _get_metadata(path):
                 str(e),
                 traceback.TracebackException.from_exception(e),
             )
-            # meta = {}
-            # lgr.debug("Failed to get metadata for %s: %s", path, exc)
         # pbar.update(1)
         meta["path"] = path
         return meta, exc
@@ -894,7 +892,7 @@ def _get_metadata(path):
             "\n".join(e[-1].format()),
         )
 
-    metadata, skip_invalid = filter_invalid_metadata_rows([m for m, e in metadata_excs])
+    metadata, skip_invalid = filter_invalid_metadata_rows([m for m, _ in metadata_excs])
     if skip_invalid:
         msg = (
             "%d out of %d files were found not containing all necessary "
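A note on the specifier patch 3 edits: comma-separated version specifiers in setup.cfg are ANDed together, so the new line excludes both 3.5.0 and 3.14.4 while allowing every other release. A quick, illustrative check with the packaging library (not part of the PR):

from packaging.specifiers import SpecifierSet

# The hdmf specifier from setup.cfg after patch 3; the comma means "and".
spec = SpecifierSet("!=3.5.0,!=3.14.4")

print("3.14.3" in spec)  # True: earlier releases remain allowed
print("3.14.4" in spec)  # False: the release with the regression is blocked
print("3.5.0" in spec)   # False: the previously excluded release stays blocked
print("3.14.5" in spec)  # True: no upper bound, so later fixes are picked up

Excluding the single bad release with != rather than capping with an upper bound means users pick up a fixed hdmf as soon as it is published, without waiting for another dandi-cli release.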