From d85a351023f77310460c06c0051499e25839890e Mon Sep 17 00:00:00 2001
From: Joosep Pata <joosep.pata@gmail.com>
Date: Fri, 27 Dec 2024 14:33:10 +0200
Subject: [PATCH] Compute and track detailed evaluation metrics on each epoch
 (#385)

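* save jet metrics (median, IQR, matched fraction) to tensorboard on each epoch

* apply code formatting

* fix Ray training: pass config to save_HPs and train_all_epochs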
---
 mlpf/model/PFDataset.py                    |   7 +-
 mlpf/model/distributed_ray.py              |   3 +-
 mlpf/model/inference.py                    |  12 +-
 mlpf/model/mlpf.py                         |   2 +-
 mlpf/model/training.py                     | 230 ++++++++++++---------
 mlpf/model/utils.py                        |   6 +-
 mlpf/pipeline.py                           |  11 +-
 mlpf/plotting/plot_utils.py                |  27 +++
 parameters/pytorch/pyg-clic.yaml           |   6 +-
 parameters/pytorch/pyg-cms-ttbar-nopu.yaml |   9 +-
 parameters/pytorch/pyg-cms.yaml            |   6 +-
 11 files changed, 195 insertions(+), 124 deletions(-)

diff --git a/mlpf/model/PFDataset.py b/mlpf/model/PFDataset.py
index d1934cee9..c0ce21527 100644
--- a/mlpf/model/PFDataset.py
+++ b/mlpf/model/PFDataset.py
@@ -82,13 +82,16 @@ def __getitem__(self, item):
             ret["X"][:, 1][msk_ho] = np.sqrt(e**2 - (np.tanh(eta) * e) ** 2)
 
         # transform pt -> log(pt / elem pt), same for energy
-        target_pt = np.log(ret["ytarget"][:, 2] / ret["X"][:, 1])
+        # where target does not exist, set to 0
+        with np.errstate(divide="ignore"):
+            target_pt = np.log(ret["ytarget"][:, 2] / ret["X"][:, 1])
         target_pt[np.isnan(target_pt)] = 0
         target_pt[np.isinf(target_pt)] = 0
         ret["ytarget_pt_orig"] = ret["ytarget"][:, 2].copy()
         ret["ytarget"][:, 2] = target_pt
 
-        target_e = np.log(ret["ytarget"][:, 6] / ret["X"][:, 5])
+        with np.errstate(divide="ignore"):
+            target_e = np.log(ret["ytarget"][:, 6] / ret["X"][:, 5])
         target_e[ret["ytarget"][:, 0] == 0] = 0
         target_e[np.isnan(target_e)] = 0
         target_e[np.isinf(target_e)] = 0
diff --git a/mlpf/model/distributed_ray.py b/mlpf/model/distributed_ray.py
index 56e86ba49..d6202e75c 100644
--- a/mlpf/model/distributed_ray.py
+++ b/mlpf/model/distributed_ray.py
@@ -252,7 +252,7 @@ def train_ray_trial(config, args, outdir=None):
         _logger.info(table)
 
     if (rank == 0) or (rank == "cpu"):
-        save_HPs(args, model, model_kwargs, outdir)  # save model_kwargs and hyperparameters
+        save_HPs(config, model, model_kwargs, outdir)  # save model_kwargs and hyperparameters
         _logger.info("Creating experiment dir {}".format(outdir))
         _logger.info(f"Model directory {outdir}", color="bold")
 
@@ -312,6 +312,7 @@ def train_ray_trial(config, args, outdir=None):
         config["num_epochs"],
         config["patience"],
         outdir,
+        config,
         trainable=config["model"]["trainable"],
         start_epoch=start_epoch,
         lr_schedule=lr_schedule,
diff --git a/mlpf/model/inference.py b/mlpf/model/inference.py
index 0d5d06d23..039769f7a 100644
--- a/mlpf/model/inference.py
+++ b/mlpf/model/inference.py
@@ -155,13 +155,15 @@ def run_predictions(world_size, rank, model, loader, sample, outpath, jetdef, je
     ti = time.time()
     for i, batch in iterator:
         predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_match_dr, outpath, dir_name, sample)
+    tf = time.time()
+    time_total_min = (tf - ti) / 60.0
 
-    _logger.info(f"Time taken to make predictions on device {rank} is: {((time.time() - ti) / 60):.2f} min")
+    _logger.info(f"Time taken to make predictions on device {rank} is: {time_total_min:.2f} min")
 
 
 def make_plots(outpath, sample, dataset, dir_name="", ntest_files=-1):
-    """Uses the predictions stored as .parquet files (see above) to make plots."""
-
+    """Use the .parquet predictions written by run_predictions to make plots; return a dict of jet-ratio metrics."""
+    ret_dict = {}
     mplhep.style.use(mplhep.styles.CMS)
     class_names = get_class_names(sample)
     os.system(f"mkdir -p {outpath}/plots{dir_name}/{sample}")
@@ -181,7 +183,7 @@ def make_plots(outpath, sample, dataset, dir_name="", ntest_files=-1):
         dataset=dataset,
         sample=sample,
     )
-    plot_jet_ratio(
+    ret_dict["jet_ratio"] = plot_jet_ratio(
         yvals,
         cp_dir=plots_path,
         bins=np.linspace(0, 5, 500),
@@ -230,3 +232,5 @@ def make_plots(outpath, sample, dataset, dir_name="", ntest_files=-1):
     plot_particles(yvals, cp_dir=plots_path, dataset=dataset, sample=sample)
     plot_particle_ratio(yvals, class_names, cp_dir=plots_path, dataset=dataset, sample=sample)
     plot_particle_response(X, yvals, class_names, cp_dir=plots_path, dataset=dataset, sample=sample)
+
+    return ret_dict
diff --git a/mlpf/model/mlpf.py b/mlpf/model/mlpf.py
index 8722f9457..c02d039dd 100644
--- a/mlpf/model/mlpf.py
+++ b/mlpf/model/mlpf.py
@@ -98,7 +98,7 @@ def __init__(
         self.norm1 = torch.nn.LayerNorm(embedding_dim)
         self.seq = torch.nn.Sequential(nn.Linear(embedding_dim, width), self.act(), nn.Linear(width, embedding_dim), self.act())
         self.dropout = torch.nn.Dropout(dropout_ff)
-        _logger.info("using attention_type={}".format(attention_type))
+        _logger.info("layer {} using attention_type={}".format(self.name, attention_type))
         # params for torch sdp_kernel
         if self.enable_ctx_manager:
             self.attn_params = {
diff --git a/mlpf/model/training.py b/mlpf/model/training.py
index 08de1b591..4c7c4f3ba 100644
--- a/mlpf/model/training.py
+++ b/mlpf/model/training.py
@@ -44,6 +44,8 @@
 
 
 def configure_model_trainable(model: MLPF, trainable: Union[str, List[str]], is_training: bool):
+    """Set only the given layers as trainable in the model"""
+
     if isinstance(model, torch.nn.parallel.DistributedDataParallel):
         raise Exception("configure trainability before distributing the model")
     if is_training:
@@ -76,7 +78,7 @@ def train_step(batch, model, optimizer, lr_schedule, loss_fn):
         loss_fn: Loss function to use
 
     Returns:
-        dict: Dictionary containing all computed losses
+        dict: Dictionary containing all computed losses with gradient detached
     """
     ypred_raw = model(batch.X, batch.mask)
     ypred = unpack_predictions(ypred_raw)
@@ -320,6 +322,7 @@ def train_all_epochs(
     num_epochs,
     patience,
     outdir,
+    config,
     trainable="all",
     dtype=torch.float32,
     start_epoch=1,
@@ -356,6 +359,15 @@ def train_all_epochs(
         save_attention: Whether to save attention weights
         checkpoint_dir: Directory to save checkpoints
     """
+
+    # run per-worker setup here so all processes / threads get configured.
+    # Ignore numpy divide-by-zero and invalid-value floating-point errors
+    np.seterr(divide="ignore", invalid="ignore")
+    # disable GUI
+    import matplotlib
+
+    matplotlib.use("agg")
+
     # Setup tensorboard writers
     if (rank == 0) or (rank == "cpu"):
         tensorboard_writer_train = SummaryWriter(f"{outdir}/runs/train")
@@ -418,6 +430,21 @@ def train_all_epochs(
 
         # Handle checkpointing and early stopping on rank 0
         if (rank == 0) or (rank == "cpu"):
+
+            # evaluate the model at this epoch on test datasets, make plots, track metrics
+            testdir_name = f"_epoch_{epoch}"
+            for sample in config["test_dataset"]:
+                run_test(rank, world_size, config, outdir, model, sample, testdir_name, dtype)
+                plot_metrics = make_plots(outdir, sample, config["dataset"], testdir_name, config["ntest"])
+
+                # track the following jet metrics in tensorboard
+                for k in ["med", "iqr", "match_frac"]:
+                    tensorboard_writer_valid.add_scalar(
+                        "epoch/{}/jet_ratio/jet_ratio_target_to_pred_pt/{}".format(sample, k),
+                        plot_metrics["jet_ratio"]["jet_ratio_target_to_pred_pt"][k],
+                        epoch,
+                    )
+
             # Log learning rate
             tensorboard_writer_train.add_scalar("epoch/learning_rate", lr_schedule.get_last_lr()[0], epoch)
 
@@ -432,7 +459,7 @@ def train_all_epochs(
             else:
                 stale_epochs += 1
 
-            # Periodic checkpointing
+            # Periodic epoch checkpointing
             if checkpoint_freq and (epoch % checkpoint_freq == 0):
                 checkpoint_path = f"{checkpoint_dir}/checkpoint-{epoch:02d}-{losses_valid['Total']:.6f}.pth"
                 save_checkpoint(checkpoint_path, model, optimizer, extra_state)
@@ -508,8 +535,7 @@ def train_all_epochs(
         # Synchronize processes
         if world_size > 1:
             dist.barrier()
-
-    # Training completed
+    # End loop over epochs, training completed
     _logger.info(f"Training completed. Total time on device {rank}: {(time.time() - t0_initial)/60:.3f}min")
 
     # Clean up
@@ -518,7 +544,83 @@ def train_all_epochs(
         tensorboard_writer_valid.close()
 
 
-def run(rank, world_size, config, args, outdir, logfile):
+def run_test(rank, world_size, config, outdir, model, sample, testdir_name, dtype):
+    """Run predictions of `model` on one test sample.
+
+    Args:
+        rank: device index or "cpu"; only rank 0 / "cpu" logs the dataset size and creates the output directory.
+        world_size: if > 1, a DistributedSampler and a final dist.barrier() are used.
+        config: configuration dict; config["test_dataset"][sample] provides the version and splits to load.
+        outdir: output path handed to run_predictions; "preds{testdir_name}/{sample}" is created below it.
+        model: model handed to run_predictions.
+        sample: key of config["test_dataset"] selecting what to evaluate.
+        testdir_name: suffix of the "preds" directory name, also passed to run_predictions as dir_name.
+        dtype: dtype for torch.autocast, which is only enabled when rank is an int.
+
+    Raises:
+        Exception: if config["dataset"] is neither "clic" nor "cms".
+    """
+    batch_size = config["gpu_batch_multiplier"]
+    sample_config = config["test_dataset"][sample]
+    version = sample_config["version"]
+    split_configs = sample_config["splits"]
+    _logger.info("split_configs={}".format(split_configs))
+
+    # split the requested config["ntest"] evenly across the splits (integer division); None is passed through
+    ntest = None if config["ntest"] is None else config["ntest"] // len(split_configs)
+    dataset = [
+        PFDataset(config["data_dir"], f"{sample}/{split_config}:{version}", "test", num_samples=ntest).ds
+        for split_config in split_configs
+    ]
+    ds = torch.utils.data.ConcatDataset(dataset)
+
+    if (rank == 0) or (rank == "cpu"):
+        _logger.info(f"test_dataset: {sample}, {len(ds)}", color="blue")
+
+    if world_size > 1:
+        sampler = torch.utils.data.distributed.DistributedSampler(ds)
+    else:
+        sampler = torch.utils.data.RandomSampler(ds)
+
+    test_loader = torch.utils.data.DataLoader(
+        ds,
+        batch_size=batch_size,
+        collate_fn=Collater(["X", "ytarget", "ytarget_pt_orig", "ytarget_e_orig", "ycand", "genjets", "targetjets"], ["genmet"]),
+        sampler=sampler,
+        num_workers=config["num_workers"],
+        prefetch_factor=config["prefetch_factor"],
+    )
+
+    # create the prediction directory without a shell, so paths with spaces or metacharacters are handled safely
+    if (rank == 0) or (rank == "cpu"):
+        os.makedirs(f"{outdir}/preds{testdir_name}/{sample}", exist_ok=True)
+
+    _logger.info(f"Running predictions on {sample}")
+    torch.cuda.empty_cache()
+
+    # FIXME: import this from a central place
+    if config["dataset"] not in ("clic", "cms"):
+        raise Exception("not implemented")
+    import fastjet
+
+    if config["dataset"] == "clic":
+        jetdef = fastjet.JetDefinition(fastjet.ee_genkt_algorithm, 0.4, -1.0)
+        jet_ptcut = 5
+    else:
+        jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)
+        jet_ptcut = 3
+
+    device_type = "cuda" if isinstance(rank, int) else "cpu"
+    with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
+        run_predictions(
+            world_size, rank, model, test_loader, sample, outdir, jetdef,
+            jet_ptcut=jet_ptcut, jet_match_dr=0.1, dir_name=testdir_name,
+        )
+    if world_size > 1:
+        dist.barrier()  # block until all workers finished executing run_predictions()
+
+
+def run(rank, world_size, config, outdir, logfile):
     if (rank == 0) or (rank == "cpu"):  # keep writing the logs
         _configLogger("mlpf", filename=logfile)
 
@@ -566,7 +668,7 @@ def run(rank, world_size, config, args, outdir, logfile):
 
         if len(missing_keys) > 0:
             _logger.warning(f"The following parameters are missing in the checkpoint file {missing_keys}", color="red")
-            if args.relaxed_load:
+            if config["relaxed_load"]:
                 _logger.warning("Optimizer checkpoint will not be loaded", color="bold")
                 strict = False
             else:
@@ -612,13 +714,13 @@ def run(rank, world_size, config, args, outdir, logfile):
         _logger.info(f"Total parameters: {trainable_params + nontrainable_params}")
         _logger.info(table.to_string(index=False))
 
-    if args.train:
+    if config["train"]:
         if (rank == 0) or (rank == "cpu"):
-            save_HPs(args, model, model_kwargs, outdir)  # save model_kwargs and hyperparameters
+            save_HPs(config, model, model_kwargs, outdir)  # save model_kwargs and hyperparameters
             _logger.info("Creating experiment dir {}".format(outdir))
             _logger.info(f"Model directory {outdir}", color="bold")
 
-        if args.comet:
+        if config["comet"]:
             comet_experiment = create_comet_experiment(config["comet_name"], comet_offline=config["comet_offline"], outdir=outdir)
             comet_experiment.set_name(f"rank_{rank}_{Path(outdir).name}")
             comet_experiment.log_parameter("run_id", Path(outdir).name)
@@ -662,6 +764,7 @@ def run(rank, world_size, config, args, outdir, logfile):
             config["num_epochs"],
             config["patience"],
             outdir,
+            config,
             trainable=config["model"]["trainable"],
             dtype=dtype,
             start_epoch=start_epoch,
@@ -683,88 +786,15 @@ def run(rank, world_size, config, args, outdir, logfile):
     else:
         testdir_name = "_best_weights"
 
-    if args.test:
-        for sample in args.test_datasets:
-            batch_size = config["gpu_batch_multiplier"]
-            version = config["test_dataset"][sample]["version"]
-
-            split_configs = config["test_dataset"][sample]["splits"]
-            print("split_configs", split_configs)
-
-            dataset = []
-
-            ntest = None
-            if not (config["ntest"] is None):
-                ntest = config["ntest"] // len(split_configs)
-
-            for split_config in split_configs:
-                ds = PFDataset(config["data_dir"], f"{sample}/{split_config}:{version}", "test", num_samples=ntest).ds
-                dataset.append(ds)
-            ds = torch.utils.data.ConcatDataset(dataset)
-
-            if (rank == 0) or (rank == "cpu"):
-                _logger.info(f"test_dataset: {sample}, {len(ds)}", color="blue")
-
-            if world_size > 1:
-                sampler = torch.utils.data.distributed.DistributedSampler(ds)
-            else:
-                sampler = torch.utils.data.RandomSampler(ds)
-
-            test_loader = torch.utils.data.DataLoader(
-                ds,
-                batch_size=batch_size,
-                collate_fn=Collater(["X", "ytarget", "ytarget_pt_orig", "ytarget_e_orig", "ycand", "genjets", "targetjets"], ["genmet"]),
-                sampler=sampler,
-                num_workers=config["num_workers"],
-                prefetch_factor=config["prefetch_factor"],
-                # pin_memory=use_cuda,
-                # pin_memory_device="cuda:{}".format(rank) if use_cuda else "",
-            )
-
-            if not osp.isdir(f"{outdir}/preds{testdir_name}/{sample}"):
-                if (rank == 0) or (rank == "cpu"):
-                    os.system(f"mkdir -p {outdir}/preds{testdir_name}/{sample}")
-
-            _logger.info(f"Running predictions on {sample}")
-            torch.cuda.empty_cache()
-
-            # FIXME: import this from a central place
-            if config["dataset"] == "clic":
-                import fastjet
-
-                jetdef = fastjet.JetDefinition(fastjet.ee_genkt_algorithm, 0.4, -1.0)
-                jet_ptcut = 5
-            elif config["dataset"] == "cms":
-                import fastjet
-
-                jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)
-                jet_ptcut = 3
-            else:
-                raise Exception("not implemented")
-
-            device_type = "cuda" if isinstance(rank, int) else "cpu"
-            with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
-                run_predictions(
-                    world_size,
-                    rank,
-                    model,
-                    test_loader,
-                    sample,
-                    outdir,
-                    jetdef,
-                    jet_ptcut=jet_ptcut,
-                    jet_match_dr=0.1,
-                    dir_name=testdir_name,
-                )
-            if world_size > 1:
-                dist.barrier()  # block until all workers finished executing run_predictions()
-
-    if (rank == 0) or (rank == "cpu"):  # make plots only on a single machine
-        if args.make_plots:
+    if config["test"]:
+        for sample in config["test_dataset"]:
+            run_test(rank, world_size, config, outdir, model, sample, testdir_name, dtype)
 
+    # make plots only on a single machine
+    if (rank == 0) or (rank == "cpu"):
+        if config["make_plots"]:
             ntest_files = -1
-            # ntest_files = 1000
-            for sample in args.test_datasets:
+            for sample in config["test_dataset"]:
                 _logger.info(f"Plotting distributions for {sample}")
                 make_plots(outdir, sample, config["dataset"], testdir_name, ntest_files)
 
@@ -772,11 +802,12 @@ def run(rank, world_size, config, args, outdir, logfile):
         dist.destroy_process_group()
 
 
-def override_config(config, args):
-    """override config with values from argparse Namespace"""
+def override_config(config: dict, args):
+    """Override keys already present in the config dictionary with non-None values from the argparse Namespace"""
     for arg in vars(args):
         arg_value = getattr(args, arg)
-        if arg_value is not None:
+        if (arg_value is not None) and (arg in config):
+            _logger.info("overriding config item {}={} with {} from cmdline".format(arg, config[arg], arg_value))
             config[arg] = arg_value
 
     if not (args.attention_type is None):
@@ -786,14 +817,15 @@ def override_config(config, args):
         for model in ["gnn_lsh", "attention", "attention", "mamba"]:
             config["model"][model]["num_convs"] = args.num_convs
 
-    if len(args.test_datasets) == 0:
-        args.test_datasets = config["test_dataset"]
+    if len(args.test_datasets) != 0:
+        config["test_dataset"] = args.test_datasets
 
     return config
 
 
-def device_agnostic_run(config, args, world_size, outdir):
-    if args.train:
+# Run either on CPU, single GPU or multi-GPU using pytorch
+def device_agnostic_run(config, world_size, outdir):
+    if config["train"]:
         logfile = f"{outdir}/train.log"
     else:
         logfile = f"{outdir}/test.log"
@@ -812,16 +844,16 @@ def device_agnostic_run(config, args, world_size, outdir):
 
             mp.spawn(
                 run,
-                args=(world_size, config, args, outdir, logfile),
+                args=(world_size, config, outdir, logfile),
                 nprocs=world_size,
                 join=True,
             )
         elif world_size == 1:
             rank = 0
             _logger.info(f"Will use single-gpu: {torch.cuda.get_device_name(rank)}", color="purple")
-            run(rank, world_size, config, args, outdir, logfile)
+            run(rank, world_size, config, outdir, logfile)
 
     else:
         rank = "cpu"
         _logger.info("Will use cpu", color="purple")
-        run(rank, world_size, config, args, outdir, logfile)
+        run(rank, world_size, config, outdir, logfile)
diff --git a/mlpf/model/utils.py b/mlpf/model/utils.py
index 3e7f3d8de..cc0567458 100644
--- a/mlpf/model/utils.py
+++ b/mlpf/model/utils.py
@@ -215,7 +215,7 @@ def unpack_predictions(preds):
     return ret
 
 
-def save_HPs(args, mlpf, model_kwargs, outdir):
+def save_HPs(config, mlpf, model_kwargs, outdir):
     """Simple function to store the model parameters and training hyperparameters."""
 
     with open(f"{outdir}/model_kwargs.pkl", "wb") as f:  # dump model architecture
@@ -224,7 +224,9 @@ def save_HPs(args, mlpf, model_kwargs, outdir):
     num_mlpf_parameters = sum(p.numel() for p in mlpf.parameters() if p.requires_grad)
 
     with open(f"{outdir}/hyperparameters.json", "w") as fp:  # dump hyperparameters
-        json.dump({**{"Num of mlpf parameters": num_mlpf_parameters}, **vars(args)}, fp)
+        outdict = {"num_mlpf_params": num_mlpf_parameters}
+        outdict.update(config)
+        json.dump(outdict, fp)
 
 
 def get_model_state_dict(model):
diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py
index 08a02b714..0f3d2117d 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -1,6 +1,5 @@
 """
-Developing a PyTorch Geometric supervised training of MLPF using DistributedDataParallel.
-
+PyTorch supervised training of MLPF using DistributedDataParallel or Ray Train.
 Authors: Farouk Mokhtar, Joosep Pata, Eric Wulff
 """
 
@@ -8,8 +7,6 @@
 import logging
 import os
 from pathlib import Path
-import matplotlib
-import numpy as np
 
 # comet needs to be imported before torch
 from comet_ml import OfflineExperiment, Experiment  # noqa: F401, isort:skip
@@ -110,10 +107,6 @@ def get_outdir(resume_training, load):
 
 
 def main():
-    # Ignore divide by 0 errors
-    np.seterr(divide="ignore", invalid="ignore")
-    matplotlib.use("agg")
-
     # https://github.com/pytorch/pytorch/issues/11201#issuecomment-895047235
     import torch
 
@@ -179,7 +172,7 @@ def main():
         if args.ray_train:
             run_ray_training(config, args, outdir)
         else:
-            device_agnostic_run(config, args, world_size, outdir)
+            device_agnostic_run(config, world_size, outdir)
 
 
 if __name__ == "__main__":
diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py
index 188eede05..8a79cf5e3 100644
--- a/mlpf/plotting/plot_utils.py
+++ b/mlpf/plotting/plot_utils.py
@@ -617,7 +617,13 @@ def plot_jet_ratio(
     if bins is None:
         bins = np.linspace(0, 5, 500)
 
+    ret_dict = {}
     p = med_iqr(yvals["jet_ratio_gen_to_target_pt"])
+    ret_dict["jet_ratio_gen_to_target_pt"] = {
+        "med": p[0],
+        "iqr": p[1],
+        "match_frac": awkward.count(yvals["jet_ratio_gen_to_target_pt"]) / awkward.count(yvals["jets_gen_pt"]),
+    }
     plt.hist(
         yvals["jet_ratio_gen_to_target_pt"],
         bins=bins,
@@ -627,6 +633,11 @@ def plot_jet_ratio(
     )
 
     p = med_iqr(yvals["jet_ratio_gen_to_cand_pt"])
+    ret_dict["jet_ratio_gen_to_cand_pt"] = {
+        "med": p[0],
+        "iqr": p[1],
+        "match_frac": awkward.count(yvals["jet_ratio_gen_to_cand_pt"]) / awkward.count(yvals["jets_gen_pt"]),
+    }
     plt.hist(
         yvals["jet_ratio_gen_to_cand_pt"],
         bins=bins,
@@ -636,6 +647,11 @@ def plot_jet_ratio(
     )
 
     p = med_iqr(yvals["jet_ratio_gen_to_pred_pt"])
+    ret_dict["jet_ratio_gen_to_pred_pt"] = {
+        "med": p[0],
+        "iqr": p[1],
+        "match_frac": awkward.count(yvals["jet_ratio_gen_to_pred_pt"]) / awkward.count(yvals["jets_gen_pt"]),
+    }
     plt.hist(
         yvals["jet_ratio_gen_to_pred_pt"],
         bins=bins,
@@ -671,6 +687,11 @@ def plot_jet_ratio(
     ax = plt.axes()
 
     p = med_iqr(yvals["jet_ratio_target_to_cand_pt"])
+    ret_dict["jet_ratio_target_to_cand_pt"] = {
+        "med": p[0],
+        "iqr": p[1],
+        "match_frac": awkward.count(yvals["jet_ratio_target_to_cand_pt"]) / awkward.count(yvals["jets_target_pt"]),
+    }
     plt.plot([], [])
     plt.hist(
         yvals["jet_ratio_target_to_cand_pt"],
@@ -680,6 +701,11 @@ def plot_jet_ratio(
         label="PF $({:.2f}\pm{:.2f})$".format(p[0], p[1]),
     )
     p = med_iqr(yvals["jet_ratio_target_to_pred_pt"])
+    ret_dict["jet_ratio_target_to_pred_pt"] = {
+        "med": p[0],
+        "iqr": p[1],
+        "match_frac": awkward.count(yvals["jet_ratio_target_to_pred_pt"]) / awkward.count(yvals["jets_target_pt"]),
+    }
     plt.hist(
         yvals["jet_ratio_target_to_pred_pt"],
         bins=bins,
@@ -698,6 +724,7 @@ def plot_jet_ratio(
         cp_dir=cp_dir,
         comet_experiment=comet_experiment,
     )
+    return ret_dict
 
 
 def plot_met(met_ratio, epoch=None, cp_dir=None, comet_experiment=None, title=None, sample=None, dataset=None):
diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml
index 0ebc7ab62..a8416ecf3 100644
--- a/parameters/pytorch/pyg-clic.yaml
+++ b/parameters/pytorch/pyg-clic.yaml
@@ -1,5 +1,7 @@
-backend: pytorch
-
+train: yes
+test: yes
+make_plots: yes
+comet: yes
 save_attention: yes
 dataset: clic
 sort_data: no
diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
index 030ffad66..029281d67 100644
--- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
+++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
@@ -1,5 +1,7 @@
-backend: pytorch
-
+train: yes
+test: yes
+make_plots: yes
+comet: no
 save_attention: no
 dataset: cms
 sort_data: yes
@@ -126,3 +128,6 @@ test_dataset:
   cms_pf_ttbar_nopu:
     version: 2.5.0
     splits: [1]
+  cms_pf_qcd_nopu:
+    version: 2.5.0
+    splits: [1]
diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml
index 30b3931a3..9b5e2cd9e 100644
--- a/parameters/pytorch/pyg-cms.yaml
+++ b/parameters/pytorch/pyg-cms.yaml
@@ -1,5 +1,7 @@
-backend: pytorch
-
+train: yes
+test: yes
+make_plots: yes
+comet: yes
 save_attention: no
 dataset: cms
 sort_data: yes