enh: Add con-duct ls
Fixes: #185

Note: Ignore pyout typing; +1 upstream, but fix too big
  pyout/pyout#142
  pyout/pyout#151
asmacdo committed Feb 6, 2025
1 parent 50a711e commit 896972a
Showing 6 changed files with 223 additions and 7 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -139,12 +139,14 @@ usage: con-duct <command> [options]
A suite of commands to manage or manipulate con-duct logs.

positional arguments:
-  {pp,plot}     Available subcommands
-    pp          Pretty print a JSON log.
-    plot        Plot resource usage for an execution.
+  {pp,plot,ls}  Available subcommands
+    pp          Pretty print a JSON log.
+    plot        Plot resource usage for an execution.
+    ls          Print execution information for all runs matching
+                DUCT_OUTPUT_PREFIX.

options:
-  -h, --help  show this help message and exit
+  -h, --help    show this help message and exit
```
<!-- END EXTRAS HELP -->
2 changes: 2 additions & 0 deletions setup.cfg
@@ -58,6 +58,8 @@ where = src
[options.extras_require]
all =
    matplotlib
+    PyYAML
+    pyout


[options.entry_points]
7 changes: 4 additions & 3 deletions src/con_duct/__main__.py
@@ -28,6 +28,9 @@
lgr = logging.getLogger("con-duct")
DEFAULT_LOG_LEVEL = os.environ.get("DUCT_LOG_LEVEL", "INFO").upper()

+DUCT_OUTPUT_PREFIX = os.getenv(
+    "DUCT_OUTPUT_PREFIX", ".duct/logs/{datetime_filesafe}-{pid}_"
+)
ENV_PREFIXES = ("PBS_", "SLURM_", "OSG")
SUFFIXES = {
    "stdout": "stdout",
@@ -712,9 +715,7 @@ def from_argv(
            "-p",
            "--output-prefix",
            type=str,
-            default=os.getenv(
-                "DUCT_OUTPUT_PREFIX", ".duct/logs/{datetime_filesafe}-{pid}_"
-            ),
+            default=DUCT_OUTPUT_PREFIX,
            help="File string format to be used as a prefix for the files -- the captured "
            "stdout and stderr and the resource usage logs. The understood variables are "
            "{datetime}, {datetime_filesafe}, and {pid}. "
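The prefix template becomes a module-level constant so the recorder and the new `ls` subcommand share a single default. A minimal sketch (not part of the commit) of how the template expands; the datetime and pid values below are made up for illustration:

```python
# Illustrative only: expand the shared prefix template the way duct does when it
# names its output files; `ls` then globs for "<prefix>info.json" (see suite/ls.py).
DUCT_OUTPUT_PREFIX = ".duct/logs/{datetime_filesafe}-{pid}_"

prefix = DUCT_OUTPUT_PREFIX.format(datetime_filesafe="2025.02.06T12.00.00", pid=4321)
print(prefix)                # .duct/logs/2025.02.06T12.00.00-4321_
print(prefix + "info.json")  # the run-info file `con-duct ls` looks for
```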
167 changes: 167 additions & 0 deletions src/con_duct/suite/ls.py
@@ -0,0 +1,167 @@
import argparse
from collections import OrderedDict
import glob
import json
import logging
from typing import List
from packaging.version import Version

try:
    import pyout  # type: ignore
except ImportError:
    pyout = None

import yaml
from con_duct.__main__ import SummaryFormatter

lgr = logging.getLogger(__name__)

VALUE_TRANSFORMATION_MAP = {
    "exit_code": "{value!E}",
    "wall_clock_time": "{value:.3f} sec",
    "peak_rss": "{value!S}",
    "memory_total": "{value!S}",
    "average_rss": "{value!S}",
    "peak_vsz": "{value!S}",
    "average_vsz": "{value!S}",
    "peak_pmem": "{value:.2f!N}%",
    "average_pmem": "{value:.2f!N}%",
    "peak_pcpu": "{value:.2f!N}%",
    "average_pcpu": "{value:.2f!N}%",
    "start_time": "{value:.2f!N}",
    "end_time": "{value:.2f!N}",
}
NON_TRANSFORMED_FIELDS = [
    "hostname",
    "uid",
    "user",
    "gpu",
    "duct_version",
    "schema_version",
    "command",
    "prefix",
    "num_samples",
    "num_reports",
    "stderr",
    "usage",
    "info",
    "prefix",
]
LS_FIELD_CHOICES = list(VALUE_TRANSFORMATION_MAP.keys()) + NON_TRANSFORMED_FIELDS
MINIMUM_SCHEMA_VERSION = "0.2.0"


def load_duct_runs(info_files: List[str]) -> List[dict]:
    loaded = []
    for info_file in info_files:
        with open(info_file) as file:
            try:
                this = json.load(file)
                # this["prefix"] is the path at execution time, could have moved
                this["prefix"] = info_file.split("info.json")[0]
                if Version(this["schema_version"]) >= Version(MINIMUM_SCHEMA_VERSION):
                    loaded.append(this)
                else:
                    # TODO lower log level once --log-level is respected
                    lgr.warning(
                        f"Skipping {this['prefix']}, schema version {this['schema_version']} "
                        f"is below minimum schema version {MINIMUM_SCHEMA_VERSION}."
                    )
            except Exception as exc:
                lgr.warning("Failed to load file %s: %s", file, exc)
    return loaded


def process_run_data(
    run_data_list: List[str], fields: List[str], formatter
) -> List[OrderedDict]:
    output_rows = []
    for row in run_data_list:
        flattened = _flatten_dict(row)
        try:
            restricted = _restrict_row(fields, flattened)
        except KeyError:
            lgr.warning(
                "Failed to pick fields of interest from a record, skipping. Record was: %s",
                list(flattened),
            )
            continue
        formatted = _format_row(restricted, formatter)
        output_rows.append(formatted)
    return output_rows


def _flatten_dict(d):
    items = []
    for k, v in d.items():
        if isinstance(v, dict):
            items.extend(_flatten_dict(v).items())
        else:
            items.append((k, v))
    return dict(items)


def _restrict_row(field_list, row):
    restricted = OrderedDict()
    # prefix is the "primary key", its the only field guaranteed to be unique.
    restricted["prefix"] = row["prefix"]
    for field in field_list:
        if field != "prefix" and field in row:
            restricted[field.split(".")[-1]] = row[field]
    return restricted


def _format_row(row, formatter):
    transformed = OrderedDict()
    for col, value in row.items():
        if transformation := VALUE_TRANSFORMATION_MAP.get(col):
            value = formatter.format(transformation, value=value)
        transformed[col] = value
    return transformed


def pyout_ls(run_data_list):
    # Generate Tabular table to output
    with pyout.Tabular(
        style=dict(
            header_=dict(bold=True, transform=str.upper),
        ),
        mode="final",
    ) as table:
        for row in run_data_list:
            table(row)


def ls(args: argparse.Namespace) -> int:
    info_files = []
    for each in args.pattern:
        matches = glob.glob(f"{each}_info.json")
        info_files.extend(matches)
    run_data_raw = load_duct_runs(info_files)
    formatter = SummaryFormatter(enable_colors=args.colors)
    output_rows = process_run_data(run_data_raw, args.fields, formatter)

    if args.format == "auto":
        args.format = "summaries" if pyout is None else "pyout"

    if args.format == "summaries":
        for row in output_rows:
            for col, value in row.items():
                if not col == "prefix":
                    col = f"\t{col}"
                print(f"{col.replace('_', ' ').title()}: {value}")
    elif args.format == "pyout":
        if pyout is None:
            raise RuntimeError("Install pyout for pyout output")
        pyout_ls(output_rows)
    elif args.format == "json":
        print(json.dumps(output_rows))
    elif args.format == "json_pp":
        print(json.dumps(output_rows, indent=True))
    elif args.format == "yaml":
        print(yaml.dump(output_rows, default_flow_style=False))
    else:
        raise RuntimeError(
            f"Unexpected format encountered: {args.format}. This should have been caught by argparse.",
        )
    return 0
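For orientation, a small sketch (not part of the commit) of what `ls` does with a single run: load the `*_info.json` file, flatten the nested record, keep the requested fields, and apply the per-field transformations via `SummaryFormatter`. The log path and its contents are hypothetical, and the snippet assumes `con-duct` is importable:

```python
# Sketch: drive the helpers above directly for one (hypothetical) run.
from con_duct.__main__ import SummaryFormatter
from con_duct.suite.ls import load_duct_runs, process_run_data

info_files = [".duct/logs/2025.02.06T12.00.00-4321_info.json"]  # hypothetical path
runs = load_duct_runs(info_files)  # parses JSON, filters on schema_version, resets "prefix"
formatter = SummaryFormatter(enable_colors=False)
rows = process_run_data(runs, ["command", "exit_code", "wall_clock_time"], formatter)
for row in rows:  # OrderedDicts with "prefix" always first
    print(dict(row))
```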
43 changes: 43 additions & 0 deletions src/con_duct/suite/main.py
@@ -1,6 +1,9 @@
import argparse
+import os
import sys
from typing import List, Optional
+from con_duct.__main__ import DUCT_OUTPUT_PREFIX
+from con_duct.suite.ls import LS_FIELD_CHOICES, ls
from con_duct.suite.plot import matplotlib_plot
from con_duct.suite.pprint_json import pprint_json

@@ -46,6 +49,46 @@ def main(argv: Optional[List[str]] = None) -> None:
    # )
    parser_plot.set_defaults(func=matplotlib_plot)

+    parser_ls = subparsers.add_parser(
+        "ls",
+        help="Print execution information for all runs matching DUCT_OUTPUT_PREFIX.",
+    )
+    parser_ls.add_argument(
+        "-f",
+        "--format",
+        choices=("auto", "pyout", "summaries", "json", "json_pp", "yaml"),
+        default="auto",
+        help="Output format. 'auto' chooses 'pyout' if pyout library is installed, 'summaries' otherwise.",
+    )
+    parser_ls.add_argument(
+        "-F",
+        "--fields",
+        nargs="+",
+        metavar="FIELD",
+        help=f"List of fields to show. Prefix is always included implicitly as the first field. "
+        f"Available choices: {', '.join(LS_FIELD_CHOICES)}.",
+        choices=LS_FIELD_CHOICES,
+        default=[
+            "command",
+            "exit_code",
+            "wall_clock_time",
+            "peak_rss",
+        ],
+    )
+    parser_ls.add_argument(
+        "--colors",
+        action="store_true",
+        default=os.getenv("DUCT_COLORS", False),
+        help="Use colors in duct output.",
+    )
+    parser_ls.add_argument(
+        "pattern",
+        nargs="*",
+        default=[f"{DUCT_OUTPUT_PREFIX[:DUCT_OUTPUT_PREFIX.index('{')]}*"],
+        help="Path(s) to list, supports globbing (defaults to the non-dynamic portion of DUCT_OUTPUT_PREFIX",
+    )
+    parser_ls.set_defaults(func=ls)
+
    args = parser.parse_args(argv)

    if args.command is None:
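With the parser wired up, the subcommand can also be exercised through the suite entry point; a hedged sketch (what it prints depends on which logs exist under the given prefix, and `main()` may exit the interpreter when it finishes, as argparse drivers commonly do):

```python
# Programmatic equivalent of:
#   con-duct ls --format json_pp --fields command exit_code wall_clock_time
from con_duct.suite.main import main

main(["ls", "--format", "json_pp", "--fields", "command", "exit_code", "wall_clock_time"])
```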
1 change: 1 addition & 0 deletions tox.ini
@@ -26,6 +26,7 @@ commands =
deps =
    mypy
    data-science-types # TODO replace archived, https://github.com/wearepal/data-science-types
+    types-PyYAML
    {[testenv]deps}
commands =
    mypy src test
