From d1fdba3fb1b04bd5bb69de14ffebdc60edfe723f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 14 May 2021 19:13:53 -0700 Subject: [PATCH 001/177] WIP - Add class MetatlasDataset --- docker/Dockerfile | 2 +- docker/local_jupyter.sh | 2 +- docker/metatlas_env.yaml | 1 + metatlas/datastructures/metatlas_dataset.py | 478 +++++++++++++++ metatlas/datastructures/object_helpers.py | 6 +- metatlas/plots/chromplotplus.py | 6 +- metatlas/plots/dill2plots.py | 561 +++++++++++------- metatlas/tools/logging.py | 102 ++++ notebooks/reference/Targeted.ipynb | 540 +++-------------- noxfile.py | 1 + tests/fixtures/metatlas_dataset_fixtures.py | 21 +- tests/fixtures/metatlas_object_fixtures.py | 8 +- tests/system/test_targeted.py | 91 +-- tests/unit/conftest.py | 4 +- tests/unit/test_dill2plot.py | 22 +- .../unit/test_metatlas_get_data_helper_fun.py | 17 +- 16 files changed, 1051 insertions(+), 811 deletions(-) create mode 100644 metatlas/datastructures/metatlas_dataset.py create mode 100644 metatlas/tools/logging.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 800b9416..cd3ff381 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM mambaorg/micromamba:0.12.2 +FROM mambaorg/micromamba:0.13.0 ARG BASE_DATA_URL=https://portal.nersc.gov/cfs/m2650/metatlas/test_data/ci01 ARG REFS_DIR=/global/project/projectdirs/metatlas/projects/spectral_libraries diff --git a/docker/local_jupyter.sh b/docker/local_jupyter.sh index c7915416..23a29730 100755 --- a/docker/local_jupyter.sh +++ b/docker/local_jupyter.sh @@ -3,7 +3,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" REPO_DIR=$(dirname "$SCRIPT_DIR") OUT_DIR="$(pwd)/out" -IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.1.0' +IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.2.0' PORT=8888 while [[ "$#" -gt 0 ]]; do diff --git a/docker/metatlas_env.yaml b/docker/metatlas_env.yaml index fb8fdcbf..a934e16b 100644 --- a/docker/metatlas_env.yaml +++ b/docker/metatlas_env.yaml @@ -5,6 +5,7 @@ channels: dependencies: - alembic=1.5.8 - banal=1.0.6 + - colorama=0.4.4 - dill=0.3.3 - gspread=3.7.0 - hdf5=1.10.6 diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py new file mode 100644 index 00000000..68465684 --- /dev/null +++ b/metatlas/datastructures/metatlas_dataset.py @@ -0,0 +1,478 @@ +""" object oriented interface to metatlas_dataset """ +import datetime +import getpass +import logging +import multiprocessing + +import humanize +import pandas as pd + +from metatlas.datastructures import metatlas_objects as metob +from metatlas.io import metatlas_get_data_helper_fun as ma_data +from metatlas.plots import dill2plots as dp + +MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" +logger = logging.getLogger(__name__) + + +class MetatlasDataset: + """ + Like the non-object oriented metatlas_dataset, you can index into this class by file_idx and compound_idx: + metatlas_dataset = MetatlasDataset(atlas, groups) + metatlas_dataset[0][0]['identification'].compound[0].inchi_key + + But MetatlasDataset adds additional functionality, such as: + metatlas_dataset.hits returns the msms hits dataframe + metatlas_dataset.atlas returns the atlas + metatlas_dataset.atlas_df returns the atlas dataframe + + If you change a class property that another property is dependent on, then the second property + automatically is re-calculated the next time you access the second property. 
For example: + metatlas_dataset.extra_time = 0.5 # this invalidates the current hits property + metatlas_dataset.hits # this re-generates the hits before returning them + """ + + # pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-public-methods + def __init__( + self, + atlas, + groups, + extra_time=0.75, + extra_mz=0, + keep_nonmatches=True, + frag_mz_tolerance=0.01, + msms_refs_loc=MSMS_REFS_PATH, + max_cpus=1, + ): + self._atlas = atlas + self._atlas_df = None + self._atlas_df_valid = False + self._data = None + self._data_valid = False + self._hits = None + self._hits_valid = False + self._groups = groups + self._extra_time = extra_time + self._extra_mz = extra_mz + self._keep_nonmatches = keep_nonmatches + self._frag_mz_tolerance = frag_mz_tolerance + self._msms_refs_loc = msms_refs_loc + self.max_cpus = max_cpus + + def _build(self): + """Populate self._data from database and h5 files.""" + start_time = datetime.datetime.now() + files = [] + for group in self._groups: + for h5_file in group.items: + files.append( + ( + h5_file, + group, + self.atlas_df, + self.atlas, + self.extra_time, + self.extra_mz, + ) + ) + if self.max_cpus > 1: + with multiprocessing.Pool(processes=min(self.max_cpus, len(files))) as pool: + samples = pool.map(ma_data.get_data_for_atlas_df_and_file, files) + else: # skip multiprocessing as this makes for easier debugging + samples = [ma_data.get_data_for_atlas_df_and_file(i) for i in files] + self._data = [MetatlasSample(x) for x in samples] + logger.info( + "MetatlasDataset with %d files built in %s.", + len(files), + _duration_since(start_time), + ) + + def filter_compounds(self, keep_idxs=None, remove_idxs=None, name=None): + """ + inputs: + keep_idxs: the indexes of compounds to keep + remove_idxs: the indexes of compounds to remove + Exactly one of keep_idxs or remove_idxs must be None + name: the name for the new atlas, defaults to current name + '_compound_filtered' + output: + If keep_idxs is not None then update self.atlas to contain only the compound_identifications at + keep_idxs. If remove_idxs is not None then update self.atlas to contain only the compound + identifications not at remove_idxs. Raises ValueError if both keep_idxs and remove_idxs are None. 
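+
+        Illustrative example (hypothetical index values):
+            metatlas_dataset.filter_compounds(remove_idxs=[0, 5])
+            metatlas_dataset.filter_compounds(keep_idxs=[1, 2], name='my_kept_subset')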
+
+        There is an additional side effect that all mz_tolerances in the returned atlas
+        get their value from self.atlas.compound_identifications[0].mz_references[0].mz_tolerance
+
+        Does not invalidate _data or _hits
+        """
+        if (keep_idxs is None) == (remove_idxs is None):
+            raise ValueError("Exactly one of keep_idxs and remove_idxs should be None")
+        start_len = len(self.atlas_df)
+        keep_idxs = keep_idxs if remove_idxs is None else self.atlas_df.index.difference(remove_idxs)
+        self._atlas_df = self.atlas_df.iloc[keep_idxs].copy().reset_index(drop=True)
+        self._atlas_df_valid = True
+        name = f"{self.atlas.name}_compound_filtered" if name is None else name
+        mz_tolerance = self.atlas.compound_identifications[0].mz_references[0].mz_tolerance
+        if self._data_valid:
+            self._data = [
+                [compound for idx, compound in enumerate(sample) if idx in keep_idxs] for sample in self._data
+            ]
+        self._atlas = dp.make_atlas_from_spreadsheet(
+            self.atlas_df,
+            name,
+            filetype="dataframe",
+            polarity=self.polarity,
+            store=False,
+            mz_tolerance=mz_tolerance,
+        )
+        logger.info(
+            "Filtering reduced atlas from %d to %d compounds (%d removed).",
+            start_len,
+            len(self.atlas_df),
+            start_len - len(self.atlas_df),
+        )
+        if self._hits_valid:
+            self.filter_hits_by_atlas()
+
+    def filter_hits_by_atlas(self):
+        """Remove any hits that do not have a corresponding inchi_key-adduct pair in atlas_df"""
+        start_len = len(self.hits)
+        keep_adducts = self.atlas_df.loc[:, ["inchi_key", "adduct"]].drop_duplicates()
+        logger.info("Number of inchi_key-adduct pairs is %d.", len(keep_adducts))
+        hits_plus = self.hits.copy()
+        hits_plus["copy_index"] = hits_plus.index
+        new_hits = hits_plus.merge(keep_adducts, on=["inchi_key", "adduct"], how="inner")
+        logger.info("Number of rows in new_hits is %d.", len(new_hits))
+        new_hits.index = pd.MultiIndex.from_tuples(new_hits["copy_index"], names=self.hits.index.names)
+        new_hits = new_hits.drop(["copy_index"], axis=1)
+        self._hits = new_hits
+        logger.info(
+            "Filtering reduced number of MSMS hits from %d to %d (%d removed).",
+            start_len,
+            len(self.hits),
+            start_len - len(self.hits),
+        )
+
+    def filter_compounds_ms1_notes_remove(self, name=None):
+        """
+        inputs:
+            name: the name for the new atlas, defaults to current name + '_kept'
+        output:
+            updates self.atlas to contain only the compound_identifications that do not have ms1_notes
+            starting with 'remove' (case insensitive)
+            There is an additional side effect that all mz_tolerances in the returned atlas
+            get their value from self.atlas.compound_identifications[0].mz_references[0].mz_tolerance
+        """
+        logger.debug("Filtering atlas to exclude ms1_notes=='remove'.")
+        name = f"{self.atlas.name}_kept" if name is None else name
+        self.filter_compounds(remove_idxs=self.compound_indices_marked_remove(), name=name)
+
+    def filter_compounds_by_signal(self, num_points, peak_height, name=None):
+        """
+        inputs:
+            num_points: number of points in EIC that must be exceeded in one or more samples
+                        in order for the compound to remain in the atlas
+            peak_height: max intensity in the EIC that must be exceeded in one or more samples
+                         in order for the compound to remain in the atlas
+            name: the name for the new atlas, defaults to current name + '_strong'
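+
+        Illustrative example, using the thresholds set in the reference Targeted notebook:
+            metatlas_dataset.filter_compounds_by_signal(num_points=5, peak_height=4e5)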
+        """
+        logger.debug("Filtering atlas on num_points=%d, peak_height=%d.", num_points, peak_height)
+        name = f"{self.atlas.name}_strong" if name is None else name
+        keep_idxs = dp.strong_signal_compound_idxs(self, num_points, peak_height)
+        self.filter_compounds(keep_idxs=keep_idxs, name=name)
+
+    def store_atlas(self, name=None, even_if_exists=False):
+        """
+        inputs:
+            name: name to save to database, defaults to self.atlas.name if None
+            even_if_exists: if True, will save the atlas even if the atlas name already is in the database
+                            with your username
+        side effects:
+            Saves the atlas to the database.
+            Raises ValueError if even_if_exists==False and name is already in the database with your username
+        """
+        name = self.atlas.name if name is None else name
+        username = getpass.getuser()
+        if not even_if_exists and len(metob.retrieve("atlases", name=name, username=username)) > 0:
+            raise ValueError(f"An atlas with name {name} and owned by {username} already exists.")
+        metob.store(self.atlas)
+
+    def export_atlas_to_csv(self, filename=None):
+        """
+        save atlas, including ms1_notes, ms2_notes, identification_notes, rt_min, rt_max to filename
+        if filename is not provided, then the export is saved to the working directory with filename
+        atlas.name + '.csv'
+        """
+        filename = f"{self.atlas.name}.csv" if filename is None else filename
+        dp.export_atlas_to_spreadsheet(self, filename)
+
+    def __getitem__(self, idx):
+        """get sample at idx"""
+        return self.data[idx]
+
+    def __setitem__(self, idx, value):
+        """assign value for sample at idx"""
+        self._data[idx] = value
+
+    def _set_and_invalidate_properties(self, attribute_name, new_value, property_names):
+        """
+        inputs:
+            attribute_name: name of the class attribute being modified
+            new_value: value to assign to attribute
+            property_names: list of names of the class properties that are dependent on the attribute's value
+        side effects:
+            Invalidates each listed dependent property if new_value differs from the attribute's
+            current value, then sets the attribute to new_value.
+        """
+        for prop in property_names:
+            valid_attr_name = f"_{prop}_valid"
+            setattr(
+                self,
+                valid_attr_name,
+                getattr(self, valid_attr_name) and new_value == getattr(self, attribute_name),
+            )
+        setattr(self, f"_{attribute_name}", new_value)
+
+    @property
+    def data(self):
+        """data getter, update ._data if necessary"""
+        if not self._data_valid:
+            self._build()
+            self._data_valid = True
+        return self._data
+
+    @property
+    def atlas_df(self):
+        """atlas_df getter, update ._atlas_df if necessary"""
+        if not self._atlas_df_valid:
+            start_time = datetime.datetime.now()
+            self._atlas_df = ma_data.make_atlas_df(self.atlas)
+            self._atlas_df_valid = True
+            logger.info(
+                "Generated atlas_df with %d rows in %s.",
+                len(self.atlas_df),
+                _duration_since(start_time),
+            )
+        return self._atlas_df
+
+    @property
+    def atlas(self):
+        """atlas getter"""
+        return self._atlas
+
+    @atlas.setter
+    def atlas(self, atlas):
+        """atlas setter, invalidates atlas_df and data"""
+        self._set_and_invalidate_properties("atlas", atlas, ["atlas_df", "data"])
+
+    @property
+    def groups(self):
+        """groups getter"""
+        return self._groups
+
+    @groups.setter
+    def groups(self, groups):
+        """groups setter, invalidates data"""
+        self._set_and_invalidate_properties("groups", groups, ["data"])
+
+    @property
+    def polarity(self):
+        """polarity getter, assumes all polarities within the dataset are the same"""
+        return self.data[0][0]["identification"].mz_references[0].detected_polarity
+
+    @property
+    def extra_time(self):
+        """extra_time getter"""
+        return self._extra_time
+
+    @extra_time.setter
+    def extra_time(self, extra_time):
+        """extra_time setter, invalidates data and hits"""
+        self._set_and_invalidate_properties("extra_time", extra_time, ["data", "hits"])
+
+    @property
+    def extra_mz(self):
+        """extra_mz getter"""
+        return self._extra_mz
+
+    @extra_mz.setter
+    def extra_mz(self, extra_mz):
+        """extra_mz setter, invalidates data and hits"""
+        self._set_and_invalidate_properties("extra_mz", extra_mz, ["data", "hits"])
+
+    @property
+    def keep_nonmatches(self):
+        """keep_nonmatches getter"""
+        return self._keep_nonmatches
+
+    @keep_nonmatches.setter
+    def keep_nonmatches(self, keep_nonmatches):
+        """keep_nonmatches setter, invalidates hits"""
+        self._set_and_invalidate_properties("keep_nonmatches", keep_nonmatches, ["hits"])
+
+    @property
+    def frag_mz_tolerance(self):
+        """frag_mz_tolerance getter"""
+        return self._frag_mz_tolerance
+
+    @frag_mz_tolerance.setter
+    def frag_mz_tolerance(self, frag_mz_tolerance):
+        """frag_mz_tolerance setter, invalidates hits"""
+        self._set_and_invalidate_properties("frag_mz_tolerance", frag_mz_tolerance, ["hits"])
+
+    @property
+    def msms_refs_loc(self):
+        """msms_refs_loc getter"""
+        return self._msms_refs_loc
+
+    @msms_refs_loc.setter
+    def msms_refs_loc(self, msms_refs_loc):
+        """msms_refs_loc setter, invalidates hits"""
+        self._set_and_invalidate_properties("msms_refs_loc", msms_refs_loc, ["hits"])
+
+    @property
+    def hits(self):
+        """get msms hits DataFrame"""
+        if not self._hits_valid:
+            logger.info(
+                "Generating hits with extra_time=%.3f, frag_mz_tolerance=%.4f, msms_refs_loc=%s.",
+                self.extra_time,
+                self.frag_mz_tolerance,
+                self.msms_refs_loc,
+            )
+            start_time = datetime.datetime.now()
+            self._hits = dp.get_msms_hits(
+                self.data,
+                extra_time=self.extra_time > 0,
+                keep_nonmatches=self.keep_nonmatches,
+                frag_mz_tolerance=self.frag_mz_tolerance,
+                ref_loc=self.msms_refs_loc,
+            )
+            logger.info("Generated %d hits in %s.", len(self._hits), _duration_since(start_time))
+            self._hits_valid = True
+        return self._hits
+
+    def __len__(self):
+        """len is from data"""
+        return len(self.data)
+
+    def set_data(self, ids, value):
+        """update a value within self._data"""
+        self._atlas_df_valid = False
+        self._data_valid = False
+        _set_nested(self._data, ids, value)
+
+    @property
+    def rts(self):
+        """
+        Allow RtReference objects to be accessed.
+        Because this returns a list, the return value is mutable and modifying it will modify
+        data internal to this class, but atlas_df and data would get out of sync with atlas.
+        So don't modify the values returned by this property!
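+
+        Illustrative example (read-only access):
+            metatlas_dataset.rts[0].rt_peak  # rt_peak of the first compound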
+        """
+        return [cid.rt_references[0] for cid in self.atlas.compound_identifications]
+
+    def set_rt(self, compound_idx, which, time):
+        """
+        inputs:
+            compound_idx: index of the compound to update
+            which: 'rt_min', 'rt_max', or 'rt_peak'
+            time: a floating point value for the number of minutes
+        updates the RT value in 3 places so that no datastructures need to be invalidated
+        """
+        assert which in ["rt_min", "rt_peak", "rt_max"]
+        atlas_rt_ref = self.atlas.compound_identifications[compound_idx].rt_references[0]
+        setattr(atlas_rt_ref, which, time)
+        data_rt_ref = self._data[0][compound_idx]["identification"].rt_references[0]
+        setattr(data_rt_ref, which, time)
+        self._atlas_df.loc[compound_idx, which] = time
+
+    def set_note(self, compound_idx, which, value):
+        """
+        inputs:
+            compound_idx: index of the compound to update
+            which: 'ms1_notes', 'ms2_notes' or 'identification_notes'
+            value: a string with the note content
+        updates the note value in 3 places so that no datastructures need to be invalidated
+        """
+        assert which in ["ms1_notes", "ms2_notes", "identification_notes"]
+        atlas_cid = self.atlas.compound_identifications[compound_idx]
+        setattr(atlas_cid, which, value)
+        data_cid = self._data[0][compound_idx]["identification"]
+        setattr(data_cid, which, value)
+        self._atlas_df.loc[compound_idx, which] = value
+
+    def compound_indices_marked_remove(self):
+        """
+        outputs:
+            list of compound_idx of the compound identifications with ms1_notes to remove
+        """
+        ids = ["identification", "ms1_notes"]
+        return [i for i, j in enumerate(self.data[0]) if _is_remove(ma_data.extract(j, ids))]
+
+
+class MetatlasSample:
+    """
+    Object oriented interface to second level of metatlas_dataset. Each instance is one sample (LCMS run).
+    """
+
+    def __init__(self, data):
+        self._data = data
+
+    def __getitem__(self, idx):
+        """get compound at idx"""
+        return self._data[idx]
+
+    def __setitem__(self, idx, value):
+        """assign value for compound at idx"""
+        self._data[idx] = value
+
+    def __len__(self):
+        """len is from data"""
+        return len(self._data)
+
+
+def _duration_since(start):
+    """
+    inputs:
+        start: a datetime object of when the duration started
+    returns:
+        string with humanized duration of start to now
+    """
+    return humanize.precisedelta(datetime.datetime.now() - start)
+
+
+def _is_remove(obj):
+    """is obj a string that starts with 'remove' (case insensitive)?"""
+    return isinstance(obj, str) and obj.lower().startswith("remove")
+
+
+def _set_nested(data, ids, value):
+    """
+    inputs:
+        data: hierarchical data structure consisting of lists, dicts, and objects with attributes.
+        ids: a list of indices, key names, and attribute names
+        value: object
+    output:
+        modifies data in place so that the value is stored at the location indicated by the ids list
+
+    Strings in ids are first tried as attribute names; if the object has no such attribute, then
+    they are used as key names. To designate that a member of ids should always be used as an
+    attribute and not a key name, make it a tuple with the attribute name string as the first
+    member, such as: ('attribute_name',). If you want to make it more explicit to the reader,
+    you can add a second member to the tuple, which will not be used, such as
+    ('attribute_name', 'as attribute').
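+
+    Illustrative example (hypothetical nested structure):
+        _set_nested(data, [0, 'identification', ('name',)], 'foo')
+        # equivalent to: data[0]['identification'].name = 'foo'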
+    """
+    if len(ids) == 0:
+        raise ValueError("ids cannot be empty")
+    if len(ids) == 1:
+        if isinstance(ids[0], tuple):
+            setattr(data, ids[0][0], value)
+        elif isinstance(ids[0], str) and hasattr(data, ids[0]):
+            setattr(data, ids[0], value)
+        else:
+            data[ids[0]] = value  # works for list or dict
+    else:
+        if isinstance(ids[0], tuple):
+            _set_nested(getattr(data, ids[0][0]), ids[1:], value)
+        elif isinstance(ids[0], str) and hasattr(data, ids[0]):
+            _set_nested(getattr(data, ids[0]), ids[1:], value)
+        else:
+            _set_nested(data[ids[0]], ids[1:], value)
diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py
index db7a73d2..116de876 100644
--- a/metatlas/datastructures/object_helpers.py
+++ b/metatlas/datastructures/object_helpers.py
@@ -1,5 +1,7 @@
 from __future__ import absolute_import
 from __future__ import print_function
+
+import logging
 import sys
 import os
 import getpass
@@ -26,9 +28,11 @@
                         CFloat, CBool)
 
+logger = logging.getLogger(__name__)
+
 # Whether we are running from NERSC
 ON_NERSC = 'METATLAS_LOCAL' not in os.environ
-print(('NERSC=',ON_NERSC))
+logger.info('NERSC=%s', ON_NERSC)
 
 # Observable List from
 # http://stackoverflow.com/a/13259435
diff --git a/metatlas/plots/chromplotplus.py b/metatlas/plots/chromplotplus.py
index b0043da7..77c64431 100644
--- a/metatlas/plots/chromplotplus.py
+++ b/metatlas/plots/chromplotplus.py
@@ -53,9 +53,9 @@ def __init__(self, data, shortname,
         self.compound_eics = [CompoundFileEIC(compound_file_data,
                                               self.rt_bounds,
                                               self.rt_min,
-                                              self.rt_max,
+                                              self.rt_max,
                                               shortname)
                               for compound_file_data in data]
-
+
         self.compound_eics = sorted(self.compound_eics,
                                     key = lambda c: (c.group_name, c.file_name))
 
@@ -142,7 +142,7 @@ def __make_figure(self):
         #EICs
         eic_lines = [np.matmul(tso_transform[i], e)[0:2].T
                      for i,e in enumerate(self.__make_eics())]
-        self.ax.add_collection(mc.LineCollection(eic_lines, 2.0/(num_cols*num_rows)**.5))
+        self.ax.add_collection(mc.LineCollection(eic_lines, linewidths=2.0/(num_cols*num_rows)**.5))
 
         #RT bounds
         rt_bound_lines = np.matmul(tso_transform,
diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py
index b4756b5d..355b26e1 100644
--- a/metatlas/plots/dill2plots.py
+++ b/metatlas/plots/dill2plots.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import
 from __future__ import print_function
+import logging
 import sys
 import os
 import os.path
@@ -54,6 +55,8 @@
 from functools import reduce
 from io import StringIO
 
+logger = logging.getLogger(__name__)
+
 ADDUCT_INFO = {'[2M+H]': {'charge': '1',
              'color': '#fabebe',
              'common': True,
@@ -254,6 +257,7 @@ def __init__(self,
            Flag for removal: 'x'
            Toggle highlighting of overlapping RT ranges for similar compounds: 's'
         """
+        logger.debug("Initializing new instance of %s.", self.__class__.__name__)
         self.data = data
         self.msms_hits = msms_hits.sort_values('score', ascending=False)
         self.color_me = color_me if color_me != '' else [['black', '']]
@@ -271,19 +275,13 @@ def __init__(self,
         self.msms_flags = msms_flags
         self.adjustable_rt_peak = adjustable_rt_peak
 
-        self.compounds = self.retrieve_compounds()
-        self.rts = self.retrieve_rts()
         self.file_names = ma_data.get_file_names(self.data)
 
         self.configure_flags()
         self.filter_runs(include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups)
 
-        self.atlas = metob.retrieve('Atlas', unique_id=self.data[0][0]['atlas_unique_id'],
-
username='*')[-1] - self.similar_rects = [] - print(("loaded file for username = ", self.atlas.username)) # only the atlas owner can change RT limits or flags - self.enable_edit = getpass.getuser() == self.atlas.username + self.enable_edit = getpass.getuser() == self.data.atlas.username self.hit_ctr = 0 self.msms_zoom_factor = 1 # native matplotlib key bindings that we want to override @@ -301,14 +299,17 @@ def __init__(self, self.set_plot_data() def set_plot_data(self): + logger.debug('Starting replot') self.similar_compounds = self.get_similar_compounds() self.eic_plot() self.filter_hits() self.msms_plot() self.flag_radio_buttons() plt.show() + logger.debug('Finished replot') def eic_plot(self): + logger.debug('Starting eic_plot') self.ax.set_title('') self.ax.set_xlabel('Retention Time') # set y-scale and bounds if provided @@ -325,13 +326,15 @@ def eic_plot(self): self.set_lin_log, active_idx=idx) self.rt_bounds() self.highlight_similar_compounds() + logger.debug('Finished eic_plot') def flag_radio_buttons(self): - my_id = self.compounds[self.data[0][self.compound_idx]['identification'].unique_id] + my_id = self.data[0][self.compound_idx]['identification'] if my_id.ms1_notes in self.peak_flags: peak_flag_index = self.peak_flags.index(my_id.ms1_notes) else: peak_flag_index = 0 + logger.debug('Setting peak flag radio button with index %d', peak_flag_index) self.peak_flag_radio = self.create_radio_buttons(self.peak_flag_ax, self.peak_flags, self.set_peak_flag, active_idx=peak_flag_index) @@ -340,6 +343,7 @@ def flag_radio_buttons(self): msms_flag_index = self.msms_flags.index(my_id.ms2_notes) else: msms_flag_index = 0 + logger.debug('Setting msms flag radio button with index %d', msms_flag_index) self.msms_flag_radio = self.create_radio_buttons(self.msms_flag_ax, self.msms_flags, self.set_msms_flag, active_idx=msms_flag_index) @@ -356,7 +360,7 @@ def y_max_slider(self): def rt_bounds(self): # put vlines on plot before creating sliders, as adding the vlines may increase plot # width, as the vline could occur outside of the data points - rt = self.rts[self.compound_idx] + rt = self.data.rts[self.compound_idx] self.min_line = self.ax.axvline(rt.rt_min, color=self.min_max_color, linewidth=4.0) self.max_line = self.ax.axvline(rt.rt_max, color=self.min_max_color, linewidth=4.0) self.peak_line = self.ax.axvline(rt.rt_peak, color=self.peak_color, linewidth=4.0) @@ -417,7 +421,7 @@ def display_eic_data(self): else: zorder = 1 color = 'black' - self.ax.plot(x, y, 'k-', zorder=zorder, linewidth=2, alpha=self.alpha, + self.ax.plot(x, y, '-', zorder=zorder, linewidth=2, alpha=self.alpha, picker=True, pickradius=5, color=color, label=label) def filter_runs(self, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups): @@ -470,13 +474,14 @@ def filter_hits(self): hits_mz_tolerance = ident.mz_references[-1].mz_tolerance*1e-6 mz_theoretical = ident.mz_references[0].mz my_scan_rt = self.msms_hits.index.get_level_values('msms_scan') - self.hits = self.msms_hits[(my_scan_rt >= float(self.rts[self.compound_idx].rt_min)) & - (my_scan_rt <= float(self.rts[self.compound_idx].rt_max)) & + self.hits = self.msms_hits[(my_scan_rt >= float(self.data.rts[self.compound_idx].rt_min)) & + (my_scan_rt <= float(self.data.rts[self.compound_idx].rt_max)) & (self.msms_hits['inchi_key'] == inchi_key) & within_tolerance(self.msms_hits['measured_precursor_mz'], mz_theoretical, hits_mz_tolerance)] def msms_plot(self, font_scale=10.0): + logger.debug('Starting msms_plot') compound = None hit_file_name = None if not 
self.hits.empty:
@@ -495,6 +500,7 @@ def msms_plot(self, font_scale=10.0):
                                               hit_ref, self.msms_zoom_factor)
             min_x = self.ax2.get_xlim()[0]  # fails if original location is not within plot
             self.mz_annot = self.ax2.annotate('', xy=(min_x, 0), visible=False)
+        logger.debug('Finished msms_plot')
 
     def layout_figure(self):
         self.gui_scale_factor = self.height/3.25 if self.height < 3.25 else 1
@@ -573,25 +579,29 @@ def create_radio_buttons(self, axes, labels, on_click_handler, active_idx=0):
         return buttons
 
     def set_lin_log(self, label):
+        logger.debug('Y-scale of EIC plot set to %s scale.', label)
         self.ax.set_yscale(label)
         if label == 'linear':
            self.ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
         self.fig.canvas.draw_idle()
 
+    def set_flag(self, name, value):
+        logger.debug('Setting flag "%s" to "%s".', name, value)
+        self.data.set_note(self.compound_idx, name, value)
+        metob.store(self.data[0][self.compound_idx]['identification'])
+
     def set_peak_flag(self, label):
-        my_id = self.compounds[self.data[0][self.compound_idx]['identification'].unique_id]
-        my_id.ms1_notes = label
-        metob.store(my_id)
+        self.set_flag('ms1_notes', label)
 
     def set_msms_flag(self, label):
-        my_id = self.compounds[self.data[0][self.compound_idx]['identification'].unique_id]
-        my_id.ms2_notes = label
-        metob.store(my_id)
+        self.set_flag('ms2_notes', label)
 
     def on_pick(self, event):
         thisline = event.artist
         thisline.set_color('cyan')
-        self.ax.set_title(thisline.get_label(), fontsize=7)
+        label = thisline.get_label()
+        self.ax.set_title(label, fontsize=7)
+        logger.debug("Sample %s selected on EIC plot via mouse click event.", label)
 
     def on_motion(self, event):
         if event.inaxes == self.ax2:  # in msms mirror plot
@@ -624,38 +634,50 @@ def press(self, event):
         if event.key in ['right', 'l']:
             if self.compound_idx + 1 < len(self.data[0]):
                 self.compound_idx += 1
+                logger.debug("Increasing compound_idx to %d (inchi_key:%s adduct:%s).",
+                             self.compound_idx,
+                             self.data[0][self.compound_idx]['identification'].compound[0].inchi_key,
+                             self.data[0][self.compound_idx]['identification'].mz_references[0].adduct
+                             )
                 self.hit_ctr = 0
                 self.update_plots()
         elif event.key in ['left', 'h']:
             if self.compound_idx > 0:
                 self.compound_idx -= 1
+                logger.debug("Decreasing compound_idx to %d (inchi_key:%s adduct:%s).",
+                             self.compound_idx,
+                             self.data[0][self.compound_idx]['identification'].compound[0].inchi_key,
+                             self.data[0][self.compound_idx]['identification'].mz_references[0].adduct
+                             )
                 self.hit_ctr = 0
                 self.update_plots()
         elif event.key in ['up', 'k']:
             if self.hit_ctr > 0:
                 self.hit_ctr -= 1
+                logger.debug("Decreasing hit_ctr to %d.", self.hit_ctr)
                 self.update_plots()
         elif event.key in ['down', 'j']:
             if self.hit_ctr < len(self.hits) - 1:
                 self.hit_ctr += 1
+                logger.debug("Increasing hit_ctr to %d.", self.hit_ctr)
                 self.update_plots()
         elif event.key == 'x':
             if not self.enable_edit:
                 self.warn_if_not_atlas_owner()
                 return
+            logger.debug("Removing compound %d via 'x' key event.", self.compound_idx)
             self.peak_flag_radio.set_active(1)
-            # This is really hacky, but using set_peak_flag function above didn't work.
-                my_id = self.compounds[self.data[0][self.compound_idx]['identification'].unique_id]
-                my_id.ms1_notes = 'remove'
-                metob.store(my_id)
         elif event.key == 'z':
             self.msms_zoom_factor = 1 if self.msms_zoom_factor == 25 else self.msms_zoom_factor * 5
+            logger.debug("Setting msms zoom factor to %d.", self.msms_zoom_factor)
             self.msms_plot()
         elif event.key == 's':
             if self.similar_rects:
+                logger.debug("Removing highlight of similar compounds on EIC plot.")
                 self.unhighlight_similar_compounds()
             else:
                 self.similar_compounds = self.get_similar_compounds()
+                logger.debug("Enabling highlight of similar compounds on EIC plot.")
                 self.highlight_similar_compounds()
 
     def update_y_scale(self, val):
@@ -668,10 +690,11 @@ def update_y_scale(self, val):
 
     def warn_if_not_atlas_owner(self):
         user = getpass.getuser()
-        if user != self.atlas.username:
+        if user != self.data.atlas.username:
             text = ("YOU ARE %s. YOU ARE NOT THE ATLAS OWNER."
                     "YOU ARE NOT ALLOWED TO EDIT VALUES WITH THE RT CORRECTOR.")
             self.ax.set_title(text % user)
+            logger.warning(text, user)
 
     def update_rt(self, which, val):
         """
@@ -679,13 +702,13 @@ def update_rt(self, which, val):
             which: 'rt_min', 'rt_max', or 'rt_peak'
             val: new RT value
         """
+        logger.debug("Updating %s to %0.4f", which, val)
         slider = {'rt_min': self.rt_min_slider, 'rt_peak': self.rt_peak_slider,
                   'rt_max': self.rt_max_slider}
         line = {'rt_min': self.min_line, 'rt_peak': self.peak_line,
                 'rt_max': self.max_line}
-        setattr(self.data[0][self.compound_idx]['identification'].rt_references[-1], which, val)
-        setattr(self.rts[self.compound_idx], which, val)
+        self.data.set_rt(self.compound_idx, which, val)
         slider[which].valinit = val
-        metob.store(self.rts[self.compound_idx])
+        metob.store(self.data.rts[self.compound_idx])
         line[which].set_xdata((val, val))
         if which != 'rt_peak':
             self.msms_zoom_factor = 1
@@ -709,12 +732,6 @@ def retrieve_compounds(self):
         compounds_list = metob.retrieve('CompoundIdentification', unique_id=uids, username='*')
         return {c.unique_id: c for c in compounds_list}
 
-    def retrieve_rts(self):
-        uids = {i: x['identification'].rt_references[-1].unique_id for i, x in enumerate(self.data[0])}
-        rt_list = metob.retrieve('RTReference', unique_id=list(uids.values()), username='*')
-        rt_dict = {rt.unique_id: rt for rt in rt_list}
-        return {i: rt_dict[uids[i]] for i in uids.keys()}
-
     def get_similar_compounds(self, use_labels=True):
         """
        inputs:
@@ -743,8 +760,9 @@ def get_similar_compounds(self, use_labels=True):
             if (mz_ref-0.005 <= cid_mz_ref <= mz_ref+0.005) or (mass-0.005 <= cid_mass <= mass+0.005):
                 out.append({'index': compound_iter_idx,
                             'label': cpd_iter_id.name if use_labels else cpd_iter_id.compound[0].name,
-                            'rt': self.rts[compound_iter_idx],
-                            'overlaps': rt_range_overlaps(self.rts[self.compound_idx], self.rts[compound_iter_idx])})
+                            'rt': self.data.rts[compound_iter_idx],
+                            'overlaps': rt_range_overlaps(self.data.rts[self.compound_idx],
+                                                          self.data.rts[compound_iter_idx])})
         return out
 
 class adjust_mz_for_selected_compound(object):
@@ -810,7 +828,7 @@ def __init__(self,
         # warn the user if they do not own the atlas; and can not edit its values
         self.enable_edit = True
         self.atlas = metob.retrieve('Atlas',unique_id = self.data[0][0]['atlas_unique_id'],username='*')[-1]
-        print(("loaded file for username = ", self.atlas.username))
+        logger.info("loaded file for username = %s", self.atlas.username)
         if getpass.getuser() != self.atlas.username:
             self.ax.set_title("YOUR ARE %s YOU ARE NOT ALLOWED TO EDIT VALUES THE RT CORRECTOR. USERNAMES ARE NOT THE SAME"%getpass.getuser())
             self.enable_edit = False
@@ -2255,7 +2273,7 @@ def get_msms_hits(metatlas_dataset, use_labels=False, extra_time=False, keep_non
 
             msms_hits.append(scan_df)
 
-    sys.stdout.write('\n'+'Done!!!')
+    sys.stdout.write('\n'+'Done!!!\n')
     if len(msms_hits)>0:
         hits = pd.concat(msms_hits)
         return hits
@@ -2796,21 +2814,50 @@ def filter_by_remove(atlas_df, data):
     return(atlas_df.iloc[keep_idxs].copy(), atlas_df.iloc[rm_idxs].copy())
 
 
-def filter_atlas(atlas_df = '', input_dataset = [], num_data_points_passing = 5, peak_height_passing = 1e6):
-    metatlas_dataset = input_dataset
-    num_data_points_passing = np.array([[(metatlas_dataset[i][j]['data']['eic'] is not None) and
-                                         (metatlas_dataset[i][j]['data']['eic']['intensity'] is not None) and
-                                         (len(metatlas_dataset[i][j]['data']['eic']['intensity']) > num_data_points_passing)
-                                         for i in range(len(metatlas_dataset))]
-                                        for j in range(len(metatlas_dataset[0]))]).any(axis=1)
+def get_intensity(compound):
+    """
+    inputs:
+        compound: a per-compound dict from one sample of a metatlas_dataset
+    returns a list of intensity values or an empty list if no EIC intensity values exist
+    """
+    return ma_data.extract(compound, ['data', 'eic', 'intensity'], [])
+
+
+def filter_atlas(atlas_df, data, num_data_points_passing=5, peak_height_passing=1e6):
+    """
+    inputs:
+        atlas_df: pandas DataFrame containing an atlas
+        data: metatlas_dataset
+        num_data_points_passing: number of points in EIC that must be exceeded in one or more samples
+                                 in order for the compound to remain in the atlas
+        peak_height_passing: max intensity in EIC that must be exceeded in one or more samples
+                             in order for the compound to remain in the atlas
+    returns a pandas DataFrame containing the updated atlas
+    """
+    keep_idxs = strong_signal_compound_idxs(data, num_data_points_passing, peak_height_passing)
+    return atlas_df.iloc[keep_idxs].reset_index(drop=True)
+
+
+def strong_signal_compound_idxs(data, num_points_passing, peak_height_passing):
+    """
+    inputs:
+        data: metatlas_dataset
+        num_points_passing: number of points in EIC that must be exceeded in one or more samples
+                            in order for the compound to be considered a strong signal
+        peak_height_passing: max intensity in EIC that must be exceeded in one or more samples
+                             in order for the compound to be considered a strong signal
+    returns list of compound indices that are above both thresholds
+    """
+    num_passing = np.array([
+        [len(get_intensity(compound)) > num_points_passing for compound in sample]
+        for sample in data]
+        ).any(axis=0)
+    peak_passing = np.array([
+        [np.array(get_intensity(compound)+[0]).max() > peak_height_passing for compound in sample]
+        for sample in data]
+        ).any(axis=0)
+    return np.flatnonzero(num_passing & peak_passing).tolist()
 
-    peak_height_passing = np.array([[(metatlas_dataset[i][j]['data']['eic'] is not None) and
-                                     (metatlas_dataset[i][j]['data']['eic']['intensity'] is not None) and
-                                     (np.array(metatlas_dataset[i][j]['data']['eic']['intensity']+[0]).max()>peak_height_passing)
-                                     for i in range(len(metatlas_dataset))]
-                                    for j in range(len(metatlas_dataset[0]))]).any(axis=1)
-    compound_passing = num_data_points_passing & peak_height_passing
-    return atlas_df[compound_passing].reset_index(drop=True)
 
 def filter_metatlas_objects_to_most_recent(object_list,field):
     #from datetime import datetime, date
@@ -2916,28 +2963,37 @@ def make_groups_from_fileinfo_sheet(filename,filetype='tab',store=False):
         metob.store(myGroup)
     return return_groups
 
-def check_compound_names(df):
-    # compounds that have the wrong compound name will be listed
-    # Keep running this until no more compounds are listed
+
+def check_compound_names(atlas_df):
+    """
+    inputs:
+        atlas_df: pandas dataframe representation of an atlas
+    throws ValueError if some compounds are not found in the database
+    """
     bad_names = []
-    for i,row in df.iterrows():
-        #if type(df.name[x]) != float or type(df.label[x]) != float:
-        #if type(df.name[x]) != float:
-        if (not pd.isnull(row.inchi_key)) and (len(row.inchi_key)>0):# or type(df.inchi_key[x]) != float:
-            if not metob.retrieve('Compounds',inchi_key=row.inchi_key, username = '*'):
-                print((row.inchi_key, "compound is not in database. Exiting Without Completing Task!"))
+    for _, row in atlas_df.iterrows():
+        if pd.notna(row.inchi_key):
+            if not metob.retrieve('Compounds', inchi_key=row.inchi_key, username='*'):
                 bad_names.append(row.inchi_key)
-    return bad_names
+    if bad_names:
+        raise ValueError(f"Compounds not found in database: {', '.join(bad_names)}.")
 
-def check_file_names(df,field):
+
+def check_filenames(atlas_df, field):
+    """
+    inputs:
+        atlas_df: pandas dataframe representation of an atlas
+        field: column name in atlas_df to test for valid lcmsruns
+    throws ValueError if values in atlas_df[field] are not in database as lcmsruns
+    """
     bad_files = []
-    for i,row in df.iterrows():
-        if row[field] != '':
-            if not metob.retrieve('Lcmsruns',name = '%%%s%%'%row[field],username = '*'):
-                print((row[field], "file is not in the database. Exiting Without Completing Task!"))
+    for _, row in atlas_df.iterrows():
+        if field in row and pd.notna(row[field]):
+            name = row[field].replace('.mzML', '')
+            if not metob.retrieve('Lcmsruns', name=f"%{name}%", username='*'):
                 bad_files.append(row[field])
-    return bad_files
+    if bad_files:
+        raise ValueError(f"LCMS runs not found in database: {', '.join(bad_files)}.")
 
 
 # def get_formatted_atlas_from_google_sheet(polarity='POS',
@@ -2981,181 +3037,226 @@ def check_file_names(df,field):
 
 
-def make_atlas_from_spreadsheet(filename='valid atlas file.csv',
-                                atlas_name='20161007_MP3umZHILIC_BPB_NEG_ExampleAtlasName',
-                                filetype=('excel','csv','tab','dataframe'),
-                                sheetname='only for excel type input',
-                                polarity = ('positive','negative'),
-                                store=False,
-                                mz_tolerance=None):
-    '''
-    specify polarity as 'positive' or 'negative'
+def _clean_dataframe(dataframe, required_columns=None, lower_case_col_names=True):
+    """
+    inputs:
+        dataframe: pandas dataframe
+        required_columns: list of column names that must have non-NA values
+        lower_case_col_names: should column names be modified to lower case
+    Modifies dataframe in place. The following rows are removed:
+        fully empty rows (all fields have NA values)
+        rows containing one or more NA values in the required_columns
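+
+    Illustrative example (mirrors the call in make_atlas_from_spreadsheet()):
+        _clean_dataframe(atlas_df, required_columns=['inchi_key', 'label'])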
+    """
+    dataframe.dropna(how="all", inplace=True)
+    if required_columns is not None and len(required_columns) > 0:
+        dataframe.dropna(how="any", subset=required_columns, inplace=True)
+    if lower_case_col_names:
+        dataframe.columns = [x.lower() for x in dataframe.columns]
 
-    '''
-    if isinstance(filename,pd.DataFrame):
-        df = filename
+
+def _add_columns(dataframe, column_names, default_values=None):
+    """
+    inputs:
+        dataframe: pandas dataframe
+        column_names: list of column names to add to dataframe if they do not already exist
+        default_values: a single default value for all columns or a list of default values
+                        the same length as column_names
+    Modifies the dataframe in place
+    """
+    assert isinstance(column_names, list)
+    num_col = len(column_names)
+    if isinstance(default_values, str):
+        default_values = [default_values]
+    num_default = 1 if default_values is None else len(default_values)
+    assert num_default in [1, num_col]
+    default_values = [default_values]*num_col if num_default == 1 else default_values
+    for name, default in zip(column_names, default_values):
+        if name not in dataframe.columns:
+            dataframe[name] = default
+
+
+def _get_dataframe(filename_or_df=None, filetype=None, sheetname=None):
+    """
+    inputs:
+        filename_or_df: a filename of an excel, tsv or csv file, or a pandas DataFrame
+        filetype: a string in dataframe, excel, tab, csv
+        sheetname: name of a sheet in an excel file, or get first sheet if None
+    returns a pandas DataFrame
+    """
+    assert filetype in ['dataframe', 'excel', 'tab', 'csv']
+    if filetype == 'dataframe':
+        return filename_or_df.copy()
+    if filetype == 'excel':
+        return pd.read_excel(filename_or_df, sheet_name=0 if sheetname is None else sheetname)
+    return pd.read_csv(filename_or_df, sep='\t' if filetype == 'tab' else ',')
+
+
+def get_compound_identification(row, polarity, mz_tolerance):
+    """Create a CompoundIdentification object from a row of an atlas DataFrame, or return
+    None if the row's inchi_key is not in the database."""
+    my_id = metob.CompoundIdentification()
+    # all copies of the molecule are returned; [-1] is the most recent one
+    compound_list = metob.retrieve('Compounds', inchi_key=row.inchi_key, username='*')
+    if not compound_list:
+        return None
+    my_id.compound = compound_list[-1:]
+    my_id.name = row.label if isinstance(row.label, str) else 'no label'
+    _copy_attributes(row, my_id, ['do_normalization', 'internal_standard_id', 'internal_standard_to_use',
+                                  'identification_notes', 'ms1_notes', 'ms2_notes'])
+    my_id.mz_references = get_mz_references(row, polarity, mz_tolerance)
+    my_id.rt_references = get_rt_references(row)
+    my_id.frag_references = get_frag_references(row, my_id.name, polarity,
+                                                my_id.mz_references[0], my_id.rt_references[0])
+    my_id.intensity_references = []
+    return my_id
+
+
+def get_mz_references(row, polarity, mz_tolerance=None):
+    """Create a list containing one MzReference object built from a row of an atlas DataFrame."""
+    assert polarity in ['positive', 'negative']
+    mzr = metob.MzReference()
+    mzr.mz = row.mz
+    # TODO: calculate the mz from theoretical adduct and modification if provided.
+    #       my_id.mz_references[0].mz = compound.mono_isotopic_molecular_weight + 1.007276
+    if mz_tolerance is not None:
+        mzr.mz_tolerance = mz_tolerance
     else:
-        if ( filetype=='excel' ) and sheetname:
-            df = pd.read_excel(filename,sheetname=sheetname)
-        elif ( filetype=='excel' ):
-            df = pd.read_excel(filename)
-        elif filetype == 'tab':
-            df = pd.read_csv(filename,sep='\t')
-        else:
-            df = pd.read_csv(filename,sep=',')
-        df.dropna(how="all", inplace=True)
-        df.columns = [x.lower() for x in df.columns]
+        try:
+            mzr.mz_tolerance = row.mz_tolerance
+        except AttributeError:
+            mzr.mz_tolerance = row.mz_threshold
+    mzr.mz_tolerance_units = 'ppm'
+    mzr.detected_polarity = polarity
+    # if 'file_mz' in atlas_df.keys():
+    #    f = metob.retrieve('Lcmsruns',name = '%%%s%%'%atlas_df.file_mz[x],username = '*')[0]
+    #    mzRef.lcms_run = f
+    if pd.notna(row.adduct):
+        mzr.adduct = row.adduct
+    return [mzr]
+
+
+def get_rt_references(row):
+    """Create a list containing one RtReference object built from a row of an atlas DataFrame."""
+    rtr = metob.RtReference()
+    rtr.rt_units = 'min'
+    _copy_attributes(row, rtr, ['rt_min', 'rt_max', 'rt_peak'], error_on_missing=True)
+    # if 'file_rt' in atlas_df.keys():
+    #    f = metob.retrieve('Lcmsruns',name = '%%%s%%'%atlas_df.file_rt[x],username = '*')[0]
+    #    rtr.lcms_run = f
+    return [rtr]
+
+
+def get_frag_references(row, name, polarity, mz_ref, rt_ref):
+    """
+    inputs:
+        row: atlas_df row for the compound identification of interest
+        name: compound name
+        polarity: positive or negative
+        mz_ref: MzReference object
+        rt_ref: RtReference object
+    returns an array of FragmentationReferences or empty array if no msms data is found
+    """
+    assert polarity in ['positive', 'negative']
+    try:
+        run_name = row.file_msms.replace('.mzML', '')
+        run = metob.retrieve('Lcmsruns', name=f"%{run_name}%", username='*')[0]
+    except (AttributeError, IndexError):
+        return []
+    data = ma_data.get_data_for_a_compound(mz_ref, rt_ref, ['msms'], run.hdf5_file, extra_time=0.3)
+    if not isinstance(data['msms']['data'], np.ndarray):
+        return []
+    frag_ref = metob.FragmentationReference()
+    frag_ref.lcms_run = run
+    frag_ref.polarity = polarity
+    frag_ref.precursor_mz = row.mz
+    precursor_intensity = data['msms']['data']['precursor_intensity']
+    idx_max = np.argwhere(precursor_intensity == np.max(precursor_intensity)).flatten()
+    mz_list = data['msms']['data']['mz'][idx_max]
+    intensity_list = data['msms']['data']['i'][idx_max]
+    frag_ref.mz_intensities = get_spectrum(mz_list, intensity_list)
+    logger.info('Found reference msms spectrum for %s in file %s.', name, row.file_msms)
+    return [frag_ref]
+
+
+def get_spectrum(mz_list, intensity_list):
+    """
+    inputs:
+        mz_list: list of mz values
+        intensity_list: list of intensity values
+    returns a list of MzIntensityPairs
+    """
+    assert len(mz_list) == len(intensity_list)
+    spectrum = []
+    for msms_mz, intensity in zip(mz_list, intensity_list):
+        spectrum.append(metob.MzIntensityPair())
+        spectrum[-1].mz = msms_mz
+        spectrum[-1].intensity = intensity
+    return spectrum
 
-    if 'inchi_key' not in df.columns:
-        df['inchi_key'] = ""
-    if 'adduct' not in df.columns:
-        df['adduct'] = ""
 
-    bad_names = check_compound_names(df)
-    if bad_names:
-        return bad_names
-    #Make sure all the files specified for references are actually there
-    #if 'file_rt' in df.keys():
-        #strip '.mzmL' from cells
-        #df.file_rt = df.file_rt.str.replace('\..+', '')
-        #bad_files = check_file_names(df,'file_rt')
-        #if bad_files:
-        #    return bad_files
-    #if 'file_mz' in df.keys():
-    #    #strip '.mzmL' from cells
-    #    df.file_mz = df.file_mz.str.replace('\..+', '')
-    #    bad_files = check_file_names(df,'file_mz')
-    #    if bad_files:
-    #        return bad_files
-    if 'file_msms' in list(df.keys()):
-        #strip '.mzmL' from cells
-        df.file_msms = df.file_msms.str.replace('\..+', '')
-        bad_files = check_file_names(df,'file_msms')
-        if bad_files:
-            return bad_files
-
-
-
-    all_identifications = []
-
-#     for i,row in df.iterrows():
-    for i,row in df.iterrows():
-        if type(row.inchi_key) != float or type(row.label) != float: #this logic is to skip empty rows
-
-            myID = metob.CompoundIdentification()
-
-            if (not pd.isnull(row.inchi_key)) and (len(row.inchi_key)>0): # this logic is where an identified metabolite has been specified
-                c = metob.retrieve('Compounds',inchi_key=row.inchi_key,username = '*') #currently, all copies of the molecule are returned. The 0 is the most recent one.
-                if c:
-                    c = c[-1]
-                else:
-                    c = 'use_label'
-            if type(row.label) != float:
-                compound_label = row.label #if no name, then use label as descriptor
-            else:
-                compound_label = 'no label'
+def get_atlas(name, atlas_df, polarity, mz_tolerance):
+    """
+    inputs:
+        name: string with name of atlas
+        atlas_df: pandas DataFrame with atlas definition
+        polarity: positive or negative
+        mz_tolerance: float to set for all mz_tolerance values
+    returns an Atlas object
 
-            if c:
-                if c != 'use_label':
-                    myID.compound = [c]
-            myID.name = compound_label
+    atlas_df should not contain empty strings, use np.NaN instead
+    """
+    atlas = metob.Atlas()
+    atlas.name = name
+    atlas.compound_identifications = []
+    for _, row in atlas_df.iterrows():
+        my_id = get_compound_identification(row, polarity, mz_tolerance)
+        if my_id is None:
+            logger.warning(('get_atlas() dropping compound %s '
+                            '(inchi_key %s) because it is not in the database.'), row.label, row.inchi_key)
+        else:
+            atlas.compound_identifications.append(my_id)
+    return atlas
 
-            try:
-                myID.do_normalization = row.do_normalization
-                myID.internal_standard_id = row.internal_standard_id
-                myID.internal_standard_to_use = row.internal_standard_to_use
-            except:
-                # no internal standard information was provided
-                pass
 
-            try:
-                myID.identification_notes = row.identification_notes
-            except:
-                # no identification_notes were provided
-                pass
+def make_atlas_from_spreadsheet(filename, atlas_name, filetype, sheetname=None,
+                                polarity=None, store=False, mz_tolerance=None):
+    '''
+    specify polarity as 'positive' or 'negative'
+    '''
+    logger.debug('Generating atlas named %s from %s source.', atlas_name, filetype)
+    atlas_df = _get_dataframe(filename, filetype, sheetname)
+    _clean_dataframe(atlas_df, required_columns=['inchi_key', 'label'])
+    _add_columns(atlas_df, column_names=['adduct'], default_values=[np.NaN])
+    check_compound_names(atlas_df)
+    check_filenames(atlas_df, 'file_msms')
+    atlas = get_atlas(atlas_name, atlas_df, polarity, mz_tolerance)
+    if store:
+        logger.debug('Saving atlas named %s to DB.', atlas_name)
+        metob.store(atlas)
+    return atlas
 
-            try:
-                myID.ms1_notes = row.ms1_notes
-            except:
-                # no ms1_notes were provided
-                pass
+
+def _copy_attributes(source, dest, attribute_list, default_list=None, error_on_missing=False):
+    """
+    inputs:
+        source: object to copy attributes from
+        dest: object to copy attributes to
+        attribute_list: list of strings containing attribute names
+        default_list: list of default values corresponding to same positions in attribute_list
+        error_on_missing: if True and default_list is None, raise AttributeError when source
+                          does not have one of the attributes
+    Modifies dest in place to have all attributes from attribute_list with values coming from
+    source or default_list. If source does not contain an attribute and default_list is None,
+    then the attribute is not added to dest (unless error_on_missing is True).
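+
+    Illustrative example (mirrors the call in get_rt_references()):
+        _copy_attributes(row, rtr, ['rt_min', 'rt_max', 'rt_peak'], error_on_missing=True)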
+    """
+    if default_list is None:
+        for attribute in attribute_list:
+            if error_on_missing:
+                setattr(dest, attribute, getattr(source, attribute))
+            else:
                 try:
-                myID.ms2_notes = row.ms2_notes
-            except:
-                # no ms2_notes were provided
+                    setattr(dest, attribute, getattr(source, attribute))
+                except AttributeError:
                     pass
+    else:
+        for attribute, default in zip(attribute_list, default_list):
+            setattr(dest, attribute, getattr(source, attribute, default))
 
-            mzRef = metob.MzReference()
-            # take the mz value from the spreadsheet
-            mzRef.mz = row.mz
-            #TODO: calculate the mz from theoretical adduct and modification if provided.
-            # mzRef.mz = c.MonoIso topic_molecular_weight + 1.007276
-            if mz_tolerance:
-                mzRef.mz_tolerance = mz_tolerance
-            else:
-                try:
-                    mzRef.mz_tolerance = row.mz_tolerance
-                except:
-                    if 'mz_threshold' in df.columns:
-                        mzRef.mz_tolerance = row.mz_threshold
-                    else:
-                        sys.exit("mz_tolerance or mz_threshold not provided. Can't make atlas.")
-
-            mzRef.mz_tolerance_units = 'ppm'
-            mzRef.detected_polarity = polarity
-            #if 'file_mz' in df.keys():
-            #    f = metob.retrieve('Lcmsruns',name = '%%%s%%'%df.file_mz[x],username = '*')[0]
-            #    mzRef.lcms_run = f
-            if 'adduct' in row:
-                if ~pd.isnull(row.adduct):
-                    mzRef.adduct = row.adduct
-
-            myID.mz_references = [mzRef]
-
-            rtRef = metob.RtReference()
-            rtRef.rt_units = 'min'
-            rtRef.rt_min = row.rt_min
-            rtRef.rt_max = row.rt_max
-            rtRef.rt_peak = row.rt_peak
-            #if 'file_rt' in df.keys():
-            #    f = metob.retrieve('Lcmsruns',name = '%%%s%%'%df.file_rt[x],username = '*')[0]
-            #    rtRef.lcms_run = f
-            myID.rt_references = [rtRef]
-
-            if ('file_msms' in list(df.keys())) and (c != 'use_label'):
-                if (type(row.file_msms) != float) and (row.file_msms != ''):
-                    frag_ref = metob.FragmentationReference()
-                    f = metob.retrieve('Lcmsruns',name = '%%%s%%'%row.file_msms,username = '*')[0]
-                    frag_ref.lcms_run = f
-                    frag_ref.polarity = polarity
-                    frag_ref.precursor_mz = row.mz
-
-                    data = ma_data.get_data_for_a_compound(mzRef, rtRef, [ 'msms' ],f.hdf5_file,0.3)
-                    if isinstance(data['msms']['data'], np.ndarray):
-                        precursor_intensity = data['msms']['data']['precursor_intensity']
-                        idx_max = np.argwhere(precursor_intensity == np.max(precursor_intensity)).flatten()
-                        mz = data['msms']['data']['mz'][idx_max]
-                        intensity = data['msms']['data']['i'][idx_max]
-                        spectrum = []
-                        for i in range(len(mz)):
-                            mzp = metob.MzIntensityPair()
-                            mzp.mz = mz[i]
-                            mzp.intensity = intensity[i]
-                            spectrum.append(mzp)
-                        frag_ref.mz_intensities = spectrum
-                        myID.frag_references = [frag_ref]
-                        print('')
-                        print(('found reference msms spectrum for ',myID.compound[0].name, 'in file',row.file_msms))
-
-            all_identifications.append(myID)
-
-    myAtlas = metob.Atlas()
-    myAtlas.name = atlas_name
-    myAtlas.compound_identifications = all_identifications
-    if store:
-        metob.store(myAtlas)
-    return myAtlas
 
 def filter_empty_metatlas_objects(object_list,field):
     filtered_list = []
diff --git a/metatlas/tools/logging.py b/metatlas/tools/logging.py
new file mode 100644
index 00000000..11cc098b
--- /dev/null
+++ b/metatlas/tools/logging.py
@@ -0,0 +1,102 @@
+"""
+Logging configuration for Metatlas
+
+based on https://gist.github.com/joshbode/58fac7ababc700f51e2a9ecdebe563ad
+
+Usage:
+import logging
+from metatlas.tools.logging import activate_logging
+
+logger = logging.getLogger('metatlas.jupyter')
+activate_logging()
+"""
+
+import getpass
+import os
+import sys
+import logging
+from typing import Optional, Dict
+
+from colorama import Fore, Back, Style
+
+levels = {
+    "DEBUG": logging.DEBUG,
+    "INFO": logging.INFO,
+    "WARNING": logging.WARNING,
+    "ERROR": logging.ERROR,
+    "CRITICAL": logging.CRITICAL,
+}
+
+
+class ColoredFormatter(logging.Formatter):
+    """Colored log formatter."""
+
+    def __init__(self, *args, colors: Optional[Dict[str, str]] = None, **kwargs) -> None:
+        """Initialize the formatter with specified format strings."""
+
+        super().__init__(*args, **kwargs)
+
+        self.colors = colors if colors else {}
+
+    def format(self, record) -> str:
+        """Format the specified record as text."""
+
+        record.color = self.colors.get(record.levelname, "")
+        record.reset = Style.RESET_ALL
+
+        return super().format(record)
+
+
+def activate_logging(console_level="INFO", console_format=None, file_level="DEBUG", filename=None):
+    """
+    inputs:
+        console_level: string with desired logging level for messages on stdout (notebook)
+        console_format: '{'-style format string for console messages, or None for the default
+        file_level: string with desired logging level for messages to the log file
+        filename: file to send logs to
+    returns logger
+
+    Call this function to activate logging to console and file.
+    Valid logging levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
+    """
+    if console_format is None:
+        console_format = "{asctime} {color}{levelname:8}{reset} {message}"
+    console_formatter = ColoredFormatter(
+        console_format,
+        style="{",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        colors={
+            "DEBUG": Fore.CYAN,
+            "INFO": Fore.GREEN,
+            "WARNING": Fore.YELLOW,
+            "ERROR": Fore.RED,
+            "CRITICAL": Fore.WHITE + Back.RED,
+        },
+    )
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(console_formatter)
+    console_handler.setLevel(levels[console_level])
+
+    if filename is None:
+        if "METATLAS_LOCAL" in os.environ:
+            filename = "metatlas.log"
+        else:
+            filename = f"/global/cfs/projectdirs/m2650/jupyter_logs/{getpass.getuser()}.log"
+    file_formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(name)s;%(message)s")
+    file_handler = logging.FileHandler(filename)
+    file_handler.setFormatter(file_formatter)
+    file_handler.setLevel(levels[file_level])
+
+    # stop jupyter from making its own root-level logger
+    # note that jupyter delays creating the root-level logger until a log message is generated
+    jupyter_logger = logging.getLogger()
+    jupyter_logger.handlers[:] = []
+    jupyter_logger.addHandler(logging.NullHandler())
+
+    logger = logging.getLogger("metatlas")
+    logger.handlers[:] = []
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+    logger.setLevel(
+        levels[file_level] if levels[file_level] < levels[console_level] else levels[console_level]
+    )
+    return logger
diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb
index 69a98334..bacf1264 100644
--- a/notebooks/reference/Targeted.ipynb
+++ b/notebooks/reference/Targeted.ipynb
@@ -31,7 +31,7 @@
 "\n",
 "# experiment ID that must match the parent folder containing the LCMS output files\n",
 "# An example experiment ID is '20201116_JGI-AK_LH_506489_SoilWarm_final_QE-HF_HILICZ_USHXG01530'\n",
-"experiment = 'Replace me'\n",
+"experiment = 'REPLACE ME'\n",
 "\n",
 "# Exclude files with names containing any of the substrings in this list. 
Eg., ['peas', 'beans']\n", "exclude_files = []\n", @@ -40,6 +40,10 @@ "# 'POS' or 'NEG' will be auto-appended later, so you shouldn't use them here.\n", "exclude_groups = ['QC','InjBl']\n", "\n", + "# thresholds for filtering out compounds with weak MS1 signals\n", + "num_points_passing = 5\n", + "peak_height_passing = 4e5\n", + "\n", "# include MSMS fragment ions in the output documents?\n", "export_msms_fragment_ions = False\n", "\n", @@ -58,8 +62,6 @@ " ('green','TxCtrl'),\n", " ('blue','InjBl')]\n", "\n", - "\n", - "\n", "# The rest of this block contains project independent parameters\n", "\n", "# Full path to the directory where you have cloned the metatlas git repo.\n", @@ -78,7 +80,11 @@ "\n", "# maximum number of CPUs to use\n", "# when running on jupyter.nersc.gov, you are not allowed to set this above 4\n", - "max_cpus = 4" + "max_cpus = 4\n", + "\n", + "# Threshold for how much status information metatlas functions print in the notebook\n", + "# levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'\n", + "log_level = 'INFO'" ] }, { @@ -95,9 +101,9 @@ "outputs": [], "source": [ "%matplotlib notebook\n", - "%env HDF5_USE_FILE_LOCKING=FALSE\n", "\n", "import sys, os\n", + "os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'\n", "\n", "sys.path.insert(0, metatlas_repo_path)\n", "try:\n", @@ -116,15 +122,19 @@ "from metatlas.plots import dill2plots as dp\n", "from metatlas.io import metatlas_get_data_helper_fun as ma_data\n", "from metatlas.datastructures import metatlas_objects as metob\n", - "from pathlib import Path\n", - "from IPython.core.display import display, HTML\n", + "from metatlas.datastructures import metatlas_dataset as mads\n", + "from metatlas.tools.logging import activate_logging\n", + "\n", "import getpass\n", - "import time\n", - "import pickle\n", - "import multiprocessing as mp\n", - "import pandas as pd\n", + "import logging\n", "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "import time\n", + "from functools import partial\n", "from importlib import reload\n", + "from pathlib import Path\n", + "from IPython.core.display import display, HTML\n", "\n", "if polarity not in ['positive', 'negative']:\n", " raise ValueError('Parameter polarity is not one of \"positive\" or \"negative\".')\n", @@ -141,30 +151,34 @@ " raise ValueError('Parameter experiment has not been set.')\n", "if len(experiment.split('_')) != 9:\n", " raise ValueError('Parameter experiment does contain 9 fields when split on \"_\".')\n", - " \n", + "\n", + "activate_logging(console_level=log_level)\n", + "logger = logging.getLogger('metatlas.jupyter')\n", + "\n", "username = getpass.getuser()\n", "analysis_id = f\"{username}{analysis_number}\"\n", "output_dir = os.path.join(project_directory, experiment, analysis_id, output_type)\n", "short_experiment_analysis_id = experiment.split('_')[0]+'_'+experiment.split('_')[3]+'_'+analysis_id\n", "\n", - "if not os.path.exists(project_directory):\n", - " os.makedirs(project_directory)\n", - "if not os.path.exists(output_dir):\n", - " os.makedirs(output_dir)\n", + "os.makedirs(project_directory, exist_ok=True)\n", + "os.makedirs(output_dir, exist_ok=True)\n", "\n", "# set notebook to have minimal side margins\n", "display(HTML(\"\"))\n", "\n", "pd.set_option('display.max_rows', 5000)\n", "pd.set_option('display.max_columns', 500)\n", - "pd.set_option('display.max_colwidth', 100)" + "pd.set_option('display.max_colwidth', 100)\n", + "\n", + "logger.info(\"experiment=%s, analysis_id=%s, short_experiment_analysis_id=%s\", experiment, 
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "# 2. Create Groups (named variables that hold your replicates of each sample)\n",
+ "# LCMS filenaming convention\n",
 "\n",
 "### You must assign your raw files into experimental groups for analysis. These are used for downstream statistics and for selection of specific groups for filtering to subsets of files for analysis (Ex. just pos or just neg).\n",
 "\n",
@@ -176,13 +190,7 @@
 "***\n",
 "The common header consists of the fields 0-10: DATE_NORTHENLABINITIALS_COLLABINITIALS_PROJ_EXP_SAMPSET_SYSTEM_COLUMN-method_SERIAL_POL_ACQ \n",
 "\n",
- "The sample group name is commonly field # 12 (between underscore 11 and 12) -0 indexed-\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
+ "The sample group name is commonly field # 12 (between underscore 11 and 12) -0 indexed-\n",
 "# Find your files\n",
 "1. On the first line of the block below, set the 'experiment' and 'name' variables to find your files. These fields require wildcards for partial string searches\n",
 "2. 'Experiment' is the folder name within global/project/projectdirs/metatlas/raw_data, which will be emailed to you when the files are uploaded to NERSC. You can also look in the raw_data directory for the NERSC user who uploaded your files; your experiment folder should be in there.\n",
@@ -197,7 +205,7 @@
 "source": [
 "files = dp.get_metatlas_files(experiment = experiment,name = '%',most_recent = True)\n",
 "df = metob.to_dataframe(files)\n",
- "print(f\"Number of LCMS output files matching '{experiment}' is: {len(files)}.\")\n",
+ "logger.info(\"Number of LCMS output files matching '%s' is: %d.\", experiment, len(files))\n",
 "df.head()"
 ]
 },
@@ -205,13 +213,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "# OPTION A: Automated Group Maker"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
+ "# Make Groups\n",
 "This will attempt to create groups in an automated fashion (rather than filling out a spreadsheet with a list of files and group names). If your files are all in one folder at nersc, you can use this option. If not, use option B below.\n",
 "\n",
 "A long group name consisting of the common header + either controlled vocab value or field #12 along with a short group name (just controlled vocab or field #12) will be stored in a local variable.
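As a concrete illustration of the splitting the group maker relies on (a sketch only, using a run name from this repo's system test; the real code below also handles the controlled vocabulary):

# illustration of the underscore-delimited convention described above
run_name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5"
fields = run_name.split("_")
common_header = "_".join(fields[:11])        # fields 0-10 form the common header
sample_group = fields[12]                    # field 12, 0-indexed -> 'Cone-S1'
short_group = f"{fields[9]}_{sample_group}"  # polarity + group -> 'POS_Cone-S1'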
The short group names can be used on plots.\n", @@ -271,7 +273,6 @@ "outputs": [], "source": [ "#STEP 2: create the groups variable, if the above looks OK\n", - "\n", "groups = []\n", "for group_key,group_values in groups_dict.items():\n", " g = metob.Group(name=group_key,items=group_values['items'],short_name=group_values['short_name'])\n", @@ -291,95 +292,6 @@ "metob.store(groups)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## STEP 3 Option B-I: OPTIONAL: Export groups to csv file for editing (filename, short_name, group, description)\n", - "#dp.make_prefilled_fileinfo_sheet(groups,os.path.join(output_dir,'prefilled_fileinfo.tab'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## STEP 3 Option B-II: Import groups from csv file after editing the prefilled_fileinfo.tab\n", - "#groups = dp.make_groups_from_fileinfo_sheet(os.path.join(output_dir,'prefilled_fileinfo.tab'), filetype='tab', store=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# OPTION B: Register LCMS Runs into categorical groups from a file.\n", - "\n", - "Typically, you will make one fileinfo sheet with all of your files (pos and neg) for this experiment. At a minimum, group names MUST contain the first 11 underscore delimited fields (DATE_NORTHENLABINITIALS_COLLABINITIALS_PROJ_EXP_SAMPSET_SYSTEM_COLUMN-method_SERIAL_POL_ACQ) and the 'SAMPLEGROUP' field.\n", - "\n", - "Files can be from multiple folders at nersc.\n", - "\n", - "1. STEP 1: select files\n", - " 1. Edit the experiment and name fields to find the files you want.\n", - "2. STEP 2: create and save a .tab file to your project directory.\n", - " 1. After running the block, find the .tab file in your project directory.\n", - " 2. Open in excel or other spreadsheet editor.\n", - " 3. Fill out the group names as per above in the editor.\n", - " 4. Save the file as filled_fileinfo.txt\n", - "3. STEP 3: Create groups from spreadsheet\n", - " 1. Transfer the .txt. file back to your project directory \n", - " 2. Run the make groups block using store=False\n", - "4. STEP 4: CHECK groups\n", - " 1. Run the next block 'metob.to_dataframe(g) and check that the information looks correct\n", - "5. If it is correct, rerun the STEP 2 make groups block, using store=True. 
If not, fix your file in excel and redo Steps 2&3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# #STEP 1: Select files\n", - "# files = dp.get_metatlas_files(experiment =experiment,name = '%',most_recent = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# #STEP 2: Save spreadsheet file\n", - "# dp.make_empty_fileinfo_sheet('%s%s' % (output_dir,'empty_fileinfo.tab'),files)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# #STEP 3: create groups from file\n", - "# g = dp.make_groups_from_fileinfo_sheet('%s%s' % (output_dir,'filled_fileinfo.txt'),\n", - "# filetype='tab',\n", - "# store=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# # STEP 4: check groups\n", - "# metob.to_dataframe(g)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -424,21 +336,11 @@ "short_names_df.to_csv(os.path.join(output_dir, 'short_names.csv'), sep=',', index=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # Optional import edited short_names.csv \n", - "# short_names_df = pd.read_csv(os.path.join(output_dir, 'short_names.csv'), sep=',', index_col='full_filename')" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# 3. Select groups of files to operate on\n", + "# Select groups of files to operate on\n", "\n", "Here, you will assign your database groups to a local variable which will be used downstream in the notebook for analyzing your data with an atlas.\n", "\n", @@ -463,16 +365,7 @@ "print(\"sorted groups\")\n", "groups = sorted(groups, key=lambda x: x.name)\n", "for i,a in enumerate(groups):\n", - " print(i, a.name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# to view metadata about your groups, run the block below\n", + " print(i, a.name)\n", "metob.to_dataframe(groups)" ] }, @@ -480,61 +373,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 4. Create new Atlas entries in the Metatlas database from a csv file\n", - "\n", - "## QC, IS, and EMA template atlases are available on the google drive.\n", - "\n", - "1. Create your atlas as a csv file, check that it looks correct (has all the correct headers and no blank values in rows; all columns are the correct data type\n", - "2. Save it with the type of atlas (EMA, QC or IS), your initials, the experiment name, the polarity, and the version or timestamp\n", - "3. Upload it to your nersc project directory (the one you named above). (If it doesn't work, double check your file permissions are set to at least rw-rw----).\n", - "4. Run blocks below to create the DB entries for negative and positive mode atlases\n", - "5. WARNING: Don't run this block over and over again - it will create multiple new DB entries with the same atlas name\n", - "\n", - "Required Atlas file headers:\n", - "\n", - "inchi_key,label,rt_min,rt_max,rt_peak,mz,mz_tolerance,adduct,polarity,identification_notes\n", - "\n", - "values in rows must be completed for all fields except inchi_key (leaving this blank will not allow you to perform MSMS matching below), and identification notes\n", - "\n", - "INFO: store=True will register your atlas in the database. 
If you are not sure if your atlas structure is correct, set store=False for the first time your run the block to check if you get an error. If there is no error, then rerun it with store=True." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ATLAS UPLOAD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "atlas_file_name = '' # <- enter the exact name of your csv file including the full path and .csv file extension\n", - "\n", - "if atlas_file_name != '':\n", - " names = dp.make_atlas_from_spreadsheet(atlas_file_name,\n", - " Path(atlas_file_name).stem,\n", - " filetype='csv',\n", - " sheetname='',\n", - " polarity = polarity,\n", - " store=True,\n", - " mz_tolerance = 12\n", - " ) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 5. Select Atlas to use\n", + "# Select Atlas to use\n", "\n", "1. The first block will retrieve a list of atlases matching the 'name' string that you enter. Also, you must enter your username.\n", - "2. The next block will select one from the list, using the index number. Make sure to enter the index number for the atlas you want to use for your analysis by setting in this line: my_atlas = atlases[0]" + "2. The next block will select one from the list, using the index number. Make sure to enter the index number for the atlas you want to use for your analysis by setting in this line: atlas_idx = 0" ] }, { @@ -555,182 +397,16 @@ "metadata": {}, "outputs": [], "source": [ - "my_atlas = atlases[-1]\n", - "atlas_df = ma_data.make_atlas_df(my_atlas)\n", - "atlas_df['label'] = [cid.name for cid in my_atlas.compound_identifications]\n", - "print(my_atlas.name)\n", - "metob.to_dataframe([my_atlas])\n", - "# the first line of the output will show the dimensions of the atlas dataframe" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 6. Get EICs and MSMS for all files in your groups, using all compounds in your atlas." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### This block builds the metatlas_dataset variable. This holds your EIC data (mz, rt, intensity values within your mz and rt ranges).\n", - "\n", - "The EIC data contains mz, intensity and RT values across your RT range. There are two parameters that you will need to edit: extra_time and extra_mz. Extra time will collect mz, intensity and RT values from outside of your atlas defined min and max rt values. For example if your rt_min = 1.0, and rt_max = 2.0 and you set extra_time to 0.3, then your new rt range will be 0.7 to 2.3. This is helpful for checking if you have nearby peaks at the same m/z. Extra_mz should only be used for troubleshooting. You should keep this at 0 unless you believe you have poor mass accuracy during your run. Other ways to address this issue is by changing the mz_tolerance values in your atlas. Before changing this value, you should check in with a metatlas experienced lab member to discuss when/how to use this value.\n", - "\n", - "1. Change the value in \"extra_time = 0.0\" to something like 0.5 to 1.0 for the first EMA runthrough on your files. This will take longer but collect msms outside your retention windows which allows you to check the msms of nearby peaks before adjusting your rt bounds around the correct peak.\n", - "2. 
extra_mz should almost always be set to 0.0 If you need to troubleshoot a low mz compound you could potentially use this value to run it back through with a larger mz error window than what was specified in your atlas (ppm tolerance).\n", - "\n", - ">On Your final runthrough, set extra_time to 0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_files = []\n", - "for my_group in groups:\n", - " for my_file in my_group.items:\n", - " extra_time = 0.75 # NOTE: 0.75 for the first run, 0.5 for final \n", - " extra_mz = 0.00\n", - " all_files.append((my_file,my_group,atlas_df,my_atlas,extra_time,extra_mz))\n", - "pool = mp.Pool(processes=min(max_cpus, len(all_files)))\n", - "t0 = time.time()\n", - "metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)\n", - "pool.close()\n", - "pool.terminate()\n", - "print(time.time() - t0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Make data sources tables (atlas_metadata.tab, groups_metadata.tab, groups.tab and [atlasname]_originalatlas.tab within data_sources subfolder)\n", - "ma_data.make_data_sources_tables(groups, my_atlas, output_dir) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 6b Optional: Filter atlas for compounds with no or low signals\n", - "\n", - "Uncomment the below 3 blocks to filter the atlas.\n", - "Please ensure that correct polarity is used for the atlases." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# dp = reload(dp)\n", - "# num_data_points_passing = 5\n", - "# peak_height_passing = 4e5\n", - "# atlas_df_passing = dp.filter_atlas(atlas_df=atlas_df, input_dataset=metatlas_dataset, num_data_points_passing = num_data_points_passing, peak_height_passing = peak_height_passing)\n", - "# print(\"# Compounds in Atlas: \"+str(len(atlas_df)))\n", - "# print(\"# Compounds passing filter: \"+str(len(atlas_df_passing)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create new atlas and store in database\n", - "This block creates a filtered atlas with a new name !!\n", - "Automatically selects this atlas for processing. \n", - "Make sure to use this atlas for downstream analyses. 
(NOTE: If you restart kernel or come back to the analysis, you need to reselect this newly created filtered atlas for processing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# atlas_passing = my_atlas.name+'_filteredby-datapnts'+str(num_data_points_passing)+'-pkht'+str(peak_height_passing)\n", - "# myAtlas_passing = dp.make_atlas_from_spreadsheet(atlas_df_passing,\n", - "# atlas_passing,\n", - "# filetype='dataframe',\n", - "# sheetname='',\n", - "# polarity = polarity,\n", - "# store=True,\n", - "# mz_tolerance = 12)\n", - "\n", - "# atlases = dp.get_metatlas_atlas(name=atlas_passing,do_print = True, most_recent=True)\n", - "\n", - "# myAtlas = atlases[-1]\n", - "# atlas_df = ma_data.make_atlas_df(myAtlas)\n", - "# atlas_df['label'] = [cid.name for cid in myAtlas.compound_identifications]\n", - "# print(myAtlas.name)\n", - "# print(myAtlas.username)\n", - "# metob.to_dataframe([myAtlas])# " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# all_files = []\n", - "# for my_group in groups:\n", - "# for my_file in my_group.items:\n", - "# all_files.append((my_file,my_group,atlas_df,myAtlas))\n", - " \n", - "# pool = mp.Pool(processes=min(max_cpus, len(all_files)))\n", - "# t0 = time.time()\n", - "# metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)\n", - "# pool.close()\n", - "# pool.terminate()\n", - "# #If you're code crashes here, make sure to terminate any processes left open.\n", - "#(print time.time() - t0)" + "atlas_idx = 0\n", + "metatlas_dataset = mads.MetatlasDataset(atlases[atlas_idx], groups, max_cpus=max_cpus)\n", + "ma_data.make_data_sources_tables(groups, metatlas_dataset.atlas, output_dir) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### One of the two blocks below builds the hits variable. This holds your MSMS spectra (from within your mz, and rt ranges, and within the extra time indicated above).\n", - "\n", - "There are two options for generating the hits variable:\n", - "1. block A: use when your files have msms. It create the hits variable and also saves a binary (pickled) serialized hits file to the output directory.\n", - "2. block B: only run if your files were collected in MS1 mode\n", - "3. If you have already run block A and then the kernel dies, you can skip block A and directly unplickle the binary hits file from the output directory. Skip block A, uncomment the Optional block and run it. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##BLOCK A\n", - "import warnings; warnings.simplefilter('ignore')\n", - "t0 = time.time()\n", - "\n", - "hits=dp.get_msms_hits(metatlas_dataset,extra_time=True,keep_nonmatches=True, frag_mz_tolerance=0.01, ref_loc='/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab')\n", - "pickle.dump(hits, open(os.path.join(output_dir,polarity[:3].upper()+'_hits.pkl'), \"wb\"))\n", - "\n", - "print(time.time() - t0)\n", - "print('%s%s' % (len(hits),' <- total number of MSMS spectra found in your files'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## BLOCK B (uncomment lines below to run this. 
Only use when all data files are MS1)\n", - "#hits=pd.DataFrame([], columns=['database','id','file_name','msms_scan', u'score', u'num_matches', u'msv_query_aligned', u'msv_ref_aligned', u'name', u'adduct', u'inchi_key', u'precursor_mz', u'measured_precursor_mz'])\n", - "#hits.set_index(['database','id','file_name','msms_scan'], inplace=True)" + "# Optional: Filter atlas for compounds with no or low signals" ] }, { @@ -739,54 +415,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Optional: If you already have a pickled hits file and do not need to run get_msms_hits again, uncomment this block\n", - "# hits = pickle.load(open(os.path.join(output_dir,polarity[:3].upper()+'_hits.pkl'), \"rb\")) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 7. Adjust Retention Times. \n", - "\n", - "This block creates an interactive plot. The top panel displays MSMS from within the two green RT bounds selected below (rt min and max, initially set in atlas). When the database holds reference spectra, mirror plots are generated with the reference spectra inverted below the sample spectra. The lower panel displays the EICs overlayed for all of the files in your selected groups. You can highlight your groups different colors. It is recommended that you do this, at least, for your extraction blank (or if not available, use a solvent injection blank). This plot also displays radio buttons that can be interactively selected; the values will be exported in your final identifications table and in your atlas export. Use these to mark peak/MSMS quality.\n", - "\n", - "How to use:\n", - "1. STEP 1: Set peak flag radio buttons\n", - " 1. OPTION A (custom flags): fill out the peak flags list (list of strings) \n", - " peak_flag_list = ('A','B') some recommendations are below. \n", - " 2. OPTION B (default flags): comment out the custom peak_flag_list line. Uncomment the default peak_flags = \"\". \n", - " Flags default to: keep, remove, unresolvable isomers, check.\n", - "2. STEP 2: Set EIC colors\n", - " 1. Option A (custom EIC colors): fill out the colorlist in the format of below\n", - " > ***\n", - " > colorlist = [['color1nameorhexadec','partialgroupstring1'],\n", - " > ['color2nameorhexadec','partialgroupstring2']]\n", - " > ***\n", - " \n", - " B. Option B (default EIC colors): comment out the custom colorlist lines and uncomment the default colorlist = \"\". \n", - " Colors all default to black.\n", - "3. User the right/left buttons on your keyboard to cycle through compounds in your atlas.\n", - "4. Use the up/down buttons on your keyboard to cycle through MSMS spectra within the RT bounds of the lower plot.\n", - "5. Use the horizontal rt min and rt max bars below the plots to adjust the rt bounds around your peak. If there are multiple peaks, select one at a time and then click up/down to update the msms available in that new RT range. If necessary evaluate your data in an external program such as mzmine to make sure you are selecting the correct peak.\n", - "\n", - "TIPS: use compound_idx = 0 in step 3 to change to a different compound in your atlas using the index number. If your plot does not fit in your browser window, adjust height and width values. Use alpha to change the transparency of the lines this is a value 0 (transparent) to 1 (opaque).\n", - "\n", - "DO NOT change your RT theoretical peak (the purple line). It is locked from editing (unless you change a hidden parameter) and only to be changed in special cases. 
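The same adjustments can also be applied programmatically through the object returned by dp.adjust_rt_for_selected_compound in the cell below; the calls here are copied from this repo's system test, so the flag strings and RT values are test data rather than recommendations:

# 'a' is the adjuster object returned by dp.adjust_rt_for_selected_compound
a.compound_idx = 1
a.set_peak_flag("remove")  # compound will be dropped by the 'remove' filter later on
a.compound_idx = 2
a.set_msms_flag("1, perfect match to internal reference library")
a.data.set_rt(2, "rt_min", 2.4361)  # same effect as dragging the rt min bar
a.data.set_rt(2, "rt_max", 2.8608)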
The measured retention times of your peaks will be calculated and exported in your output files. These will be compared with the RT theoreticals and used in your evidence of identification table." + "metatlas_dataset.filter_compounds_by_signal(num_points=num_points_passing, peak_height=peak_height_passing)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "import warnings; warnings.simplefilter('ignore')\n", - "a = dp.adjust_rt_for_selected_compound(metatlas_dataset, msms_hits=hits,\n", + "a = dp.adjust_rt_for_selected_compound(metatlas_dataset, msms_hits=metatlas_dataset.hits,\n", " color_me=rt_adjuster_color_list,\n", " compound_idx=0, alpha=0.5, width=18, height=3)" ] @@ -795,25 +435,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. Create filtered atlas excluding compounds marked removed\n", - "\n", - "Re-run the following before filtering atlas\n", - "1. Get Groups (include InjBl)\n", - "2. Get Atlas\n", - "3. Get Data\n", - "4. Get MSMS Hits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(atlas_kept, atlas_removed) = dp.filter_by_remove(atlas_df, metatlas_dataset)\n", - "print(\"# Compounds Total: \"+str(len(atlas_df)))\n", - "print(\"# Compounds Kept: \"+str(len(atlas_kept)))\n", - "print(\"# Compounds Removed: \"+str(len(atlas_removed)))" + "# Export results files\n", + "### Filter out compounds with ms1_notes of 'remove'" ] }, { @@ -822,35 +445,7 @@ "metadata": {}, "outputs": [], "source": [ - "atlasfilename=my_atlas.name+'_kept' # <- enter the name of the atlas to be stored\n", - "\n", - "names = dp.make_atlas_from_spreadsheet(atlas_kept, \n", - " atlasfilename, # <- DO NOT EDIT THIS LINE\n", - " filetype='dataframe',\n", - " sheetname='',\n", - " polarity = polarity,\n", - " store=True,\n", - " mz_tolerance = 12\n", - " ) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Re-run the following before filtering atlas\n", - "1. Restart kernel\n", - "2. Get Groups\n", - "3. Get Atlas (look for the *_kept atlas)\n", - "4. Get Data\n", - "5. Get MSMS Hits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 9. 
Export results files" + "metatlas_dataset.filter_compounds_ms1_notes_remove()" ] }, { @@ -868,9 +463,9 @@ "metadata": {}, "outputs": [], "source": [ - "export_atlas_filename = os.path.join(output_dir, f\"{polarity[:3].upper()}_{my_atlas.name}_export\")\n", - "atlas_identifications = dp.export_atlas_to_spreadsheet(my_atlas, export_atlas_filename)\n", - "export_atlas_filename" + "export_atlas_filename = os.path.join(output_dir, f\"{polarity[:3].upper()}_{metatlas_dataset.atlas.name}_export\")\n", + "atlas_identifications = dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas, export_atlas_filename)\n", + "logger.info(\"Exported atlas to file: %s.\", export_atlas_filename)" ] }, { @@ -900,13 +495,13 @@ " 'mz_tolerance': 20, # strict = 5, loose = 25; >= ppm of median mz across all files for given compound relative to reference\n", " 'min_msms_score': .6, 'allow_no_msms': True, # strict = 0.6, loose = 0.3 <= highest compound dot-product score across all files for given compound relative to reference\n", " 'min_num_frag_matches': 1, 'min_relative_frag_intensity': .001} # strict = 3 and 0.1, loose = 1, 0.01 number of matching mzs when calculating max_msms_score and ratio of second highest to first highest intensity of matching sample mzs\n", - "scores_df = fa.make_scores_df(metatlas_dataset,hits)\n", + "scores_df = fa.make_scores_df(metatlas_dataset, metatlas_dataset.hits)\n", "scores_df['passing'] = fa.test_scores_df(scores_df, **kwargs)\n", "\n", - "pass_atlas_df, fail_atlas_df, pass_dataset, fail_dataset = fa.filter_atlas_and_dataset(scores_df, atlas_df, metatlas_dataset, column='passing')\n", + "pass_atlas_df, fail_atlas_df, pass_dataset, fail_dataset = fa.filter_atlas_and_dataset(scores_df, metatlas_dataset.atlas_df, metatlas_dataset, column='passing')\n", "\n", - "fa.make_stats_table(input_dataset = metatlas_dataset, msms_hits = hits, output_loc = output_dir,min_peak_height=1e5,use_labels=True,min_msms_score=0.01,min_num_frag_matches=1,include_lcmsruns = [],exclude_lcmsruns = ['QC'], polarity=polarity[:3].upper())\n", - "scores_df.to_csv(os.path.join(output_dir,'stats_tables',polarity[:3].upper()+'_compound_scores.csv'))" + "fa.make_stats_table(input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, output_loc=output_dir, min_peak_height=1e5, use_labels=True, min_msms_score=0.01, min_num_frag_matches=1, include_lcmsruns=[], exclude_lcmsruns=['QC'], polarity=polarity[:3].upper())\n", + "scores_df.to_csv(os.path.join(output_dir,'stats_tables', polarity[:3].upper()+'_compound_scores.csv'))" ] }, { @@ -935,7 +530,7 @@ "save = True\n", "share_y = True\n", "\n", - "dp.make_chromatograms(input_dataset=metatlas_dataset, include_lcmsruns = [],exclude_lcmsruns = ['InjBl','QC','Blank','blank'], group=group, share_y=share_y, save=save, output_loc=output_dir, short_names_df=short_names_df, short_names_header='short_samplename', polarity=polarity[:3].upper())" + "dp.make_chromatograms(input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=['InjBl','QC','Blank','blank'], group=group, share_y=share_y, save=save, output_loc=output_dir, short_names_df=short_names_df, short_names_header='short_samplename', polarity=polarity[:3].upper())" ] }, { @@ -957,7 +552,7 @@ }, "outputs": [], "source": [ - "dp.make_identification_figure_v2(input_dataset = metatlas_dataset, msms_hits=hits, use_labels=True, include_lcmsruns = [],exclude_lcmsruns = ['InjBl','QC','Blank','blank'], output_loc=output_dir, short_names_df=short_names_df, polarity=polarity[:3].upper())" + 
"dp.make_identification_figure_v2(input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, use_labels=True, include_lcmsruns=[], exclude_lcmsruns=['InjBl', 'QC', 'Blank', 'blank'], output_loc=output_dir, short_names_df=short_names_df, polarity=polarity[:3].upper())" ] }, { @@ -975,12 +570,13 @@ "metadata": {}, "outputs": [], "source": [ - "peak_height = dp.make_output_dataframe(input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_height', output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)\n", - "peak_area = dp.make_output_dataframe(input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_area', output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)\n", - "mz_peak = dp.make_output_dataframe(input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_peak', output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)\n", - "rt_peak = dp.make_output_dataframe(input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [],fieldname='rt_peak', output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)\n", - "mz_centroid = dp.make_output_dataframe(input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_centroid', output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)\n", - "rt_centroid = dp.make_output_dataframe(input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='rt_centroid', output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)" + "output_dataframe = partial(dp.make_output_dataframe, input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)\n", + "peak_height = output_dataframe(fieldname='peak_height')\n", + "peak_area = output_dataframe(fieldname='peak_area')\n", + "mz_peak = output_dataframe(fieldname='mz_peak')\n", + "rt_peak = output_dataframe(fieldname='rt_peak')\n", + "mz_centroid = output_dataframe(fieldname='mz_centroid')\n", + "rt_centroid = output_dataframe(fieldname='rt_centroid')" ] }, { @@ -1086,7 +682,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.4" } }, "nbformat": 4, diff --git a/noxfile.py b/noxfile.py index 2dde9d06..19d55e18 100644 --- a/noxfile.py +++ b/noxfile.py @@ -19,6 +19,7 @@ # has not yet been updated to pass all checks. 
more_checks = [ 'metatlas/datastructures/metatlas_dataset.py', + 'metatlas/tools/logging.py', 'tests' ] diff --git a/tests/fixtures/metatlas_dataset_fixtures.py b/tests/fixtures/metatlas_dataset_fixtures.py index a990a756..8d228141 100644 --- a/tests/fixtures/metatlas_dataset_fixtures.py +++ b/tests/fixtures/metatlas_dataset_fixtures.py @@ -1,10 +1,20 @@ import pytest import numpy as np +from metatlas.datastructures import metatlas_dataset as mads + @pytest.fixture def ms1_summary(): - return "{'num_ms1_datapoints': 85.0, 'mz_peak': 252.1092987060547, 'rt_peak': 2.2775044441223145, 'mz_centroid': 252.10915042669814, 'rt_centroid': 2.218492414487913, 'peak_height': 304761.90625, 'peak_area': 7696977.46875}" # noqa: E501 + return { + "num_ms1_datapoints": 85.0, + "mz_peak": 252.1092987060547, + "rt_peak": 2.2775044441223145, + "mz_centroid": 252.10915042669814, + "rt_centroid": 2.218492414487913, + "peak_height": 304761.90625, + "peak_area": 7696977.46875, + } @pytest.fixture @@ -23,7 +33,7 @@ def msms(): @pytest.fixture -def metatlas_dataset(lcmsrun, group, compound_identification): +def metatlas_dataset(lcmsrun, group, compound_identification, eic, ms1_summary, msms): return [ [ { @@ -305,4 +315,9 @@ def eic(): 21777.99609375, 59454.40234375, ], - } # noqa: E501 + } + + +@pytest.fixture +def atlas_df(atlas, group): + return mads.MetatlasDataset(atlas, [group]).atlas_df diff --git a/tests/fixtures/metatlas_object_fixtures.py b/tests/fixtures/metatlas_object_fixtures.py index 95769005..86128b0a 100644 --- a/tests/fixtures/metatlas_object_fixtures.py +++ b/tests/fixtures/metatlas_object_fixtures.py @@ -25,9 +25,7 @@ def compound(): compound.iupac_name = "" compound.username = "wjholtz" compound.pubchem_compound_id = "13730" - compound.description = ( - "A purine 2'-deoxyribonucleoside having adenine as the nucleobase." - ) + compound.description = "A purine 2'-deoxyribonucleoside having adenine as the nucleobase." 
compound.metacyc_id = "DEOXYADENOSINE" compound.kegg_id = "C00559" compound.hmdb_id = "HMDB00101" @@ -148,9 +146,7 @@ def lcmsrun(): run.method = None run.name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML" # noqa: E501 run.head_id = "7ce51039cfca4426b4e51999ac45d018" - run.experiment = ( - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" - ) + run.experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" run.injection_volume = 0.0 run.injection_volume_units = "uL" run.acquisition_time = 1604770080 diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index 8d4e7755..edd3a452 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -1,74 +1,11 @@ import os -import pytest import subprocess -def test_targeted_by_line01(tmp_path): - image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.1.0" +def test_targeted_by_line01_with_remove(tmp_path): + image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.2.0" experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" - out_file = ( - tmp_path - / experiment - / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" - ) - expected = [ - f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 - f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", # noqa: E501 - "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", - "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", - "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t304761.90625\t416788.03125\t837662.0625\t2359861.25", - "0001_adenine_positive_M+H136p0618_2p52\t1645281.875\t12096485.0\t51774956.0\t91955488.0", - "0002_adenine_positive_M+H136p0618_2p52\t1880780.125\t12096485.0\t51774956.0\t91955488.0", - "0003_xanthine_positive_M+H153p0407_2p70\t72926.875\t60128.625\t231272.640625\t317968.03125", - "0004_4-pyridoxic_acid_positive_M+H184p0605_2p84\t44113.671875\t66073.203125\t94702.390625\t214180.296875", # noqa: E501 - "0005_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", - "", - ] - subprocess.run( - [ - "docker", - "run", - "--rm", - "-v", - f"{os.getcwd()}:/src", - "-v", - f"{tmp_path}:/out", - image, - "papermill", - "-p", - "experiment", - experiment, - "-p", - "metatlas_repo_path", - "/src", - "-p", - "project_directory", - "/out", - "-p", - "max_cpus", - "2", - "/src/notebooks/reference/Targeted.ipynb", - "/out/Targeted.ipynb", - ], - 
check=True, - ) - with open(out_file, "r") as handle: - for num, line in enumerate(handle.readlines()): - clean_line = line.rstrip("\n") - assert expected[num] == clean_line - - -@pytest.mark.xfail -def test_targeted_by_line02_with_remove(tmp_path): - image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.1.0" - experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" - out_file = ( - tmp_path - / experiment - / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" - ) + out_file = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" expected = [ f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", # noqa: E501 @@ -77,8 +14,8 @@ def test_targeted_by_line02_with_remove(tmp_path): "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t304761.90625\t416788.03125\t837662.0625\t2359861.25", - "0002_adenine_positive_M+H136p0618_2p52\t1880780.125\t12096485.0\t51774956.0\t91955488.0", - "0005_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", + "0001_adenine_positive_M+H136p0618_2p52\t1880780.125\t12096485.0\t51774956.0\t91955488.0", + "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", "", ] subprocess.run( @@ -96,10 +33,20 @@ def test_targeted_by_line02_with_remove(tmp_path): """\ jq -M '(.cells[] | select(.source[] | contains("compound_idx=0")).source) \ += ["\\n", \ - "remove_idxs = [1, 3, 4]\\n", \ - "for idx in remove_idxs:\\n", \ - " a.compound_idx = idx\\n", \ - " a.set_peak_flag(\\"remove\\")" \ + "a.compound_idx = 0\\n", \ + "a.set_msms_flag(\\"1, co-isolated precursor but all reference ions are in sample spectrum\\")\\n", \ + "a.data.set_rt(0, \\"rt_min\\", 2.1245)\\n", \ + "a.data.set_rt(0, \\"rt_max\\", 2.4439)\\n", \ + "a.compound_idx = 1\\n", \ + "a.set_peak_flag(\\"remove\\")\\n", \ + "a.compound_idx = 2\\n", \ + "a.set_msms_flag(\\"1, perfect match to internal reference library\\")\\n", \ + "a.data.set_rt(2, \\"rt_min\\", 2.4361)\\n", \ + "a.data.set_rt(2, \\"rt_max\\", 2.8608)\\n", \ + "a.compound_idx = 3\\n", \ + "a.set_msms_flag(\\"1, perfect match to internal reference library\\")\\n", \ + "a.data.set_rt(3, \\"rt_min\\", 2.8428)\\n", \ + "a.data.set_rt(3, \\"rt_max\\", 3.3081)\\n" \ ]' /src/notebooks/reference/Targeted.ipynb > /out/Remove.ipynb && \ papermill \ -p experiment 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 \ diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 
59d695d2..5c4757c3 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -10,6 +10,4 @@ def refactor(string: str) -> str: return string.replace("/", ".").replace("\\", ".").replace(".py", "") -pytest_plugins = [ - refactor(fixture) for fixture in glob("tests/fixtures/*.py") if "__" not in fixture -] +pytest_plugins = [refactor(fixture) for fixture in glob("tests/fixtures/*.py") if "__" not in fixture] diff --git a/tests/unit/test_dill2plot.py b/tests/unit/test_dill2plot.py index bcf32320..7ad020ff 100644 --- a/tests/unit/test_dill2plot.py +++ b/tests/unit/test_dill2plot.py @@ -65,9 +65,7 @@ def test_remove_metatlas_objects_by_list_remove_none(): i.myattr = [1, 2] j = type("", (), {})() j.myattr = [2, 3] - assert [i, j] == dill2plots.remove_metatlas_objects_by_list( - [i, j], "myattr", [4, 5] - ) + assert [i, j] == dill2plots.remove_metatlas_objects_by_list([i, j], "myattr", [4, 5]) def test_remove_metatlas_objects_by_list_remove_all(): @@ -81,6 +79,18 @@ def test_remove_metatlas_objects_by_list_remove_all(): def test_export_atlas_to_spreadsheet(atlas): # pylint: disable=line-too-long expected = """{"chebi_id":{"0":"CHEBI:17256"},"chebi_url":{"0":"http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:17256"},"creation_time":{"0":1466212395.0},"description":{"0":"A purine 2'-deoxyribonucleoside having adenine as the nucleobase."},"formula":{"0":"C10H13N5O3"},"head_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"hmdb_id":{"0":"HMDB00101"},"hmdb_url":{"0":"http://www.hmdb.ca/metabolites/HMDB00101"},"img_abc_id":{"0":""},"inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"iupac_name":{"0":""},"kegg_id":{"0":"C00559"},"kegg_url":{"0":"http://www.genome.jp/dbget-bin/www_bget?C00559"},"last_modified":{"0":1612996604.0},"lipidmaps_id":{"0":""},"lipidmaps_url":{"0":""},"metacyc_id":{"0":"DEOXYADENOSINE"},"mono_isotopic_molecular_weight":{"0":251.101839276},"name":{"0":"2'-deoxyadenosine"},"neutralized_2d_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)"},"neutralized_2d_inchi_key":{"0":"OLXZPDWKRNYJJZ-UHFFFAOYSA-N"},"neutralized_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"neutralized_inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"num_free_radicals":{"0":0.0},"number_components":{"0":1.0},"permanent_charge":{"0":0.0},"prev_uid":{"0":"origin"},"pubchem_compound_id":{"0":"13730"},"pubchem_url":{"0":"http://pubchem.ncbi.nlm.nih.gov/compound/13730"},"source":{"0":"gnps///chebi///metacyc///hmdb"},"synonyms":{"0":"2'-deoxyadenosine"},"unique_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"username":{"0":"wjholtz"},"wikipedia_url":{"0":""},"label":{"0":"2'-deoxyadenosine"},"id_notes":{"0":"No description"},"ms1_notes":{"0":"keep"},"ms2_notes":{"0":"bad match to ref"},"identification_notes":{"0":"my id note"},"rt_min":{"0":1.6964640054},"rt_max":{"0":2.6964640054},"rt_peak":{"0":2.1964640054},"mz":{"0":252.1091393},"mz_tolerance":{"0":20.0},"adduct":{"0":"[M+H]+"},"polarity":{"0":"positive"}}""" # noqa: E501 - assert expected == dill2plots.export_atlas_to_spreadsheet(atlas).to_json().replace( - r"\/", "/" - ) + assert expected == dill2plots.export_atlas_to_spreadsheet(atlas).to_json().replace(r"\/", "/") + + +def test_filter_atlas01(atlas_df, metatlas_dataset): + assert len(dill2plots.filter_atlas(atlas_df, 
metatlas_dataset, 1, 3e5)) == 1 + assert len(dill2plots.filter_atlas(atlas_df, metatlas_dataset, 1, 4e5)) == 0 + assert len(dill2plots.filter_atlas(atlas_df, metatlas_dataset, 80, 1e4)) == 1 + assert len(dill2plots.filter_atlas(atlas_df, metatlas_dataset, 90, 1e4)) == 0 + + +def test_strong_signal_compound_idxs(metatlas_dataset): + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 1, 3e5) == [0] + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 1, 4e5) == [] + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 80, 1e4) == [0] + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 90, 1e4) == [] diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index 817bee24..f34db9be 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -15,17 +15,12 @@ def test_transfer_identification_data_to_atlas(metatlas_dataset, atlas): mod_atlas = atlas.clone(recursive=True) mod_atlas.compound_identifications[0].ms1_notes = "ms1_note to overwrite" mod_atlas.compound_identifications[0].ms2_notes = "ms2_note to overwrite" - mod_atlas.compound_identifications[ - 0 - ].identification_notes = "identification_note to overwrite" + mod_atlas.compound_identifications[0].identification_notes = "identification_note to overwrite" out = gdhf.transfer_identification_data_to_atlas(metatlas_dataset, atlas) updated = atlas.compound_identifications[0] assert updated.ms1_notes == out.compound_identifications[0].ms1_notes assert updated.ms2_notes == out.compound_identifications[0].ms2_notes - assert ( - updated.identification_notes - == out.compound_identifications[0].identification_notes - ) + assert updated.identification_notes == out.compound_identifications[0].identification_notes def test_set_nested_term_attr(metatlas_dataset): @@ -34,9 +29,7 @@ def test_set_nested_term_attr(metatlas_dataset): [0, 0, "identification", "mz_references", 0, "adduct"], "[M+NH4]+", ) - assert ( - metatlas_dataset[0][0]["identification"].mz_references[0].adduct == "[M+NH4]+" - ) + assert metatlas_dataset[0][0]["identification"].mz_references[0].adduct == "[M+NH4]+" def test_set_nested_term_attr_tuple(metatlas_dataset): @@ -45,9 +38,7 @@ def test_set_nested_term_attr_tuple(metatlas_dataset): [0, 0, "identification", "mz_references", 0, ("adduct",)], "[M+NH4]+", ) - assert ( - metatlas_dataset[0][0]["identification"].mz_references[0].adduct == "[M+NH4]+" - ) + assert metatlas_dataset[0][0]["identification"].mz_references[0].adduct == "[M+NH4]+" def test_set_nested_term_list(metatlas_dataset): From 1e3e51fc738fc6487f485bf62a42e266a4ac14a4 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 18 May 2021 21:40:45 -0700 Subject: [PATCH 002/177] update pre-commit-hooks repo --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc1c52b1..1728c754 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.0.0 hooks: - id: check-added-large-files - id: check-ast From 17f6ac57fec046fc63e7bcb6938a0118390f1646 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 18 May 2021 21:41:15 -0700 Subject: [PATCH 003/177] Remove unused code --- metatlas/plots/dill2plots.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 
355b26e1..e6223207 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -727,11 +727,6 @@ def update_rt_max(self, val): def update_rt_peak(self, val): self.update_rt('rt_peak', val) - def retrieve_compounds(self): - uids = [x['identification'].unique_id for x in self.data[0]] - compounds_list = metob.retrieve('CompoundIdentification', unique_id=uids, username='*') - return {c.unique_id: c for c in compounds_list} - def get_similar_compounds(self, use_labels=True): """ inputs: @@ -3386,8 +3381,6 @@ def get_msms_plot_headers(data, hits, hit_ctr, compound_idx, compound, similar_c tuple of strings (mz_header, rt_header, cpd_header) """ - avg_mz_measured = [] - avg_rt_measured = [] if not hits.empty: rt_ms2 = hits.index.get_level_values('msms_scan')[hit_ctr] mz_precursor = hits['measured_precursor_mz'].iloc[hit_ctr] From c2bc3b1657dbfafbbad2fbdd1406f6875ba27d8b Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 18 May 2021 21:43:23 -0700 Subject: [PATCH 004/177] Add logging and fix warnings --- metatlas/io/metatlas_get_data_helper_fun.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index 8a76d26d..47bf8a87 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -1,5 +1,6 @@ from __future__ import absolute_import from __future__ import print_function +import logging import numpy as np import os.path import sys @@ -14,6 +15,8 @@ from six.moves import range from six.moves import zip +logger = logging.getLogger(__name__) + def create_msms_dataframe(df): """ @@ -172,11 +175,15 @@ def remove_ms1_data_not_in_atlas(atlas_df, data): has_current_polarity = atlas_df.detected_polarity == polarity if any(has_current_polarity): atlas_mz = atlas_df[has_current_polarity].mz.copy().sort_values().values + logger.debug("atlas_mz=%s", atlas_mz) max_mz_tolerance = atlas_df[has_current_polarity].mz_tolerance.max() + logger.debug("atlas_mz=%s, max_mz_tolerance=%.6f", atlas_mz, max_mz_tolerance) if data[name].shape[0] > 1: original_mz = data[name].mz.values nearest_mz = fast_nearest_interp(original_mz, atlas_mz, atlas_mz) + logger.debug("nearest_mz=%s", nearest_mz) data[name]['ppm_difference'] = abs(original_mz - nearest_mz) / original_mz * 1e6 + logger.debug("ppm_difference=%s", data[name]['ppm_difference']) query_str = 'ppm_difference < %f' % max_mz_tolerance data[name] = data[name].query(query_str) return data @@ -305,7 +312,7 @@ def get_data_for_mzrt(row,data_df_pos,data_df_neg,extra_time = 0.5,use_mz = 'mz' if len(data_df_pos)>0: all_df = data_df_pos.query(ms1_query_str) else: - return pd.Series() + return pd.Series(dtype=np.float64) else: if len(data_df_neg)>0: all_df = data_df_neg.query(ms1_query_str) @@ -841,9 +848,9 @@ def get_compound_names(data,use_labels=False): newstr = '%s_%s_%s_%s_%.4f_%.2f'%(str(i).zfill(4),_str,d['identification'].mz_references[0].detected_polarity, d['identification'].mz_references[0].adduct,d['identification'].mz_references[0].mz, d['identification'].rt_references[0].rt_peak) - newstr = re.sub('\.', 'p', newstr) #2 or more in regexp + newstr = re.sub(r'\.', 'p', newstr) # 2 or more in regexp - newstr = re.sub('[\[\]]','',newstr) + newstr = re.sub(r'[\[\]]', '', newstr) newstr = re.sub('[^A-Za-z0-9+-]+', '_', newstr) newstr = re.sub('i_[A-Za-z]+_i_', '', newstr) if newstr[0] == '_': From df44ffa9aee2bfca13ec190ed13b06abea4d72e6 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: 
Tue, 18 May 2021 22:25:53 -0700 Subject: [PATCH 005/177] WIP - add tests and fixes to pass tests --- .gitignore | 1 + metatlas/datastructures/metatlas_dataset.py | 39 +- noxfile.py | 63 ++- pyproject.toml | 3 + tests/{unit => }/conftest.py | 0 tests/fixtures/__init__.py | 0 tests/fixtures/database.py | 20 + tests/fixtures/hdf5_fixtures.py | 410 ++++++++++++++ tests/fixtures/metatlas_dataset_fixtures.py | 46 +- tests/fixtures/metatlas_object_fixtures.py | 514 +++++++++++++++++- tests/system/__init__.py | 0 tests/system/test_targeted.py | 2 + tests/unit/test_dill2plot.py | 22 +- tests/unit/test_metatlas_dataset.py | 331 ++++++++++- .../unit/test_metatlas_get_data_helper_fun.py | 7 +- 15 files changed, 1377 insertions(+), 81 deletions(-) rename tests/{unit => }/conftest.py (100%) create mode 100644 tests/fixtures/__init__.py create mode 100644 tests/fixtures/database.py create mode 100644 tests/fixtures/hdf5_fixtures.py create mode 100644 tests/system/__init__.py diff --git a/.gitignore b/.gitignore index 97ac1326..cf68e21b 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ htmlcov/ .nox/ .tox/ .coverage +.coverage.* .cache nosetests.xml coverage.xml diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 68465684..d8b57592 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -75,7 +75,7 @@ def _build(self): self.extra_mz, ) ) - if self.max_cpus > 1: + if self.max_cpus > 1 and len(files) > 1: with multiprocessing.Pool(processes=min(self.max_cpus, len(files))) as pool: samples = pool.map(ma_data.get_data_for_atlas_df_and_file, files) else: # skip multiprocessing as this makes for easier debugging @@ -107,7 +107,9 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None, name=None): if (keep_idxs is None) == (remove_idxs is None): raise ValueError("Exactly one of keep_idxs and remove_idxs should be None") start_len = len(self.atlas_df) - keep_idxs = keep_idxs if remove_idxs is None else self.atlas_df.index.difference(remove_idxs) + if remove_idxs is not None: + _error_if_bad_idxs(self.atlas_df, remove_idxs) + keep_idxs = self.atlas_df.index.difference(remove_idxs) self._atlas_df = self.atlas_df.iloc[keep_idxs].copy().reset_index(drop=True) self._atlas_df_valid = True name = f"{self.atlas.name}_compound_filtered" if name is None else name @@ -210,7 +212,7 @@ def __getitem__(self, idx): def __setitem__(self, idx, value): """assign value for sample at idx""" - self._data[idx] = value + self.data[idx] = value def _set_and_invalidate_properties(self, attribute_name, new_value, property_names): """ @@ -261,6 +263,8 @@ def atlas(self): @atlas.setter def atlas(self, atlas): """atlas setter, invalidate atlas_df and data""" + if not isinstance(atlas, metob.Atlas): + raise TypeError("Cannot set atlas to container a non-Atlas object") self._set_and_invalidate_properties("atlas", atlas, ["atlas_df", "data"]) @property @@ -275,8 +279,15 @@ def groups(self, groups): @property def polarity(self): - """polarity getter assumes all polarities within class are the same""" - return self.data[0][0]["identification"].mz_references[0].detected_polarity + """ + polarity getter assumes all polarities within class are the same + returns 'positive' if there are no samples or no compound identifications + """ + try: + cid = self.data[0][0]["identification"] + except IndexError: + return "positive" + return cid.mz_references[0].detected_polarity @property def extra_time(self): @@ -356,8 +367,10 @@ def 
__len__(self): def set_data(self, ids, value): """update a value within self._data""" + if not self._data_valid: + self._build() + self._data_valid = True self._atlas_df_valid = False - self._data_valid = False _set_nested(self._data, ids, value) @property @@ -381,9 +394,9 @@ def set_rt(self, compound_idx, which, time): assert which in ["rt_min", "rt_peak", "rt_max"] atlas_rt_ref = self.atlas.compound_identifications[compound_idx].rt_references[0] setattr(atlas_rt_ref, which, time) - data_rt_ref = self._data[0][compound_idx]["identification"].rt_references[0] + data_rt_ref = self.data[0][compound_idx]["identification"].rt_references[0] setattr(data_rt_ref, which, time) - self._atlas_df.loc[compound_idx, which] = time + self.atlas_df.loc[compound_idx, which] = time def set_note(self, compound_idx, which, value): """ @@ -396,9 +409,9 @@ def set_note(self, compound_idx, which, value): assert which in ["ms1_notes", "ms2_notes", "identification_notes"] atlas_cid = self.atlas.compound_identifications[compound_idx] setattr(atlas_cid, which, value) - data_cid = self._data[0][compound_idx]["identification"] + data_cid = self.data[0][compound_idx]["identification"] setattr(data_cid, which, value) - self._atlas_df.loc[compound_idx, which] = value + self.atlas_df.loc[compound_idx, which] = value def compound_indices_marked_remove(self): """ @@ -476,3 +489,9 @@ def _set_nested(data, ids, value): _set_nested(getattr(data, ids[0]), ids[1:], value) else: _set_nested(data[ids[0]], ids[1:], value) + + +def _error_if_bad_idxs(dataframe, test_idx_list): + bad = set(test_idx_list) - set(dataframe.index) + if len(bad) > 0: + raise IndexError(f"Invalid index values: {bad}.") diff --git a/noxfile.py b/noxfile.py index 19d55e18..f4a5c3fe 100644 --- a/noxfile.py +++ b/noxfile.py @@ -14,7 +14,6 @@ "update_git_hooks", ] - # files we can run all the checks on, as they don't contain legacy code that # has not yet been updated to pass all checks. 
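# The nbqa-based sessions added below (flake8_nb, pylint_nb, black_nb) run the same
# linters inside the notebooks listed in 'notebooks'; blacken_nb rewrites them in place.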
more_checks = [ @@ -23,6 +22,9 @@ 'tests' ] +notebooks = [ + 'notebooks/reference/Targeted.ipynb', + ] pytest_deps = [ 'attrs==21.2.0', @@ -34,37 +36,45 @@ 'pyparsing==2.4.7', 'pytest==6.2.4', 'pytest-cov==2.11.1', + 'pytest-mock==3.6.1', 'toml==0.10.2', ] +nbqa_deps = [ + 'nbqa==0.8.1', + 'tokenize-rt==4.1.0', + 'importlib-metadata==4.0.1', + 'astroid==2.5.6', + 'wrapt==1.12.1', + 'lazy_object_proxy==1.6.0', + 'isort==5.8.0', + ] + +flake8_deps = [ + 'flake8', + 'flake8-bugbear', + 'flake8-builtins', + 'flake8-comprehensions', + ] nox.options.error_on_external_run = True @nox.session(python=py_versions[0]) def flake8_diff(session): - session.install('flake8', - 'flake8-bugbear', - 'flake8-builtins', - 'flake8-comprehensions') + session.install(*flake8_deps) session.run('sh', '-c', 'git diff -U0 -w --staged HEAD | flake8 --diff', external=True) @nox.session(python=py_versions[0]) def flake8_all(session): - session.install('flake8', - 'flake8-bugbear', - 'flake8-builtins', - 'flake8-comprehensions') + session.install(*flake8_deps) session.run('flake8', 'metatlas', 'tests') @nox.session(python=py_versions[0]) def flake8(session): - session.install('flake8', - 'flake8-bugbear', - 'flake8-builtins', - 'flake8-comprehensions') + session.install(*flake8_deps) session.run('flake8', *more_checks) @@ -95,6 +105,33 @@ def pylint(session): session.run('pylint', *more_checks) +@nox.session(venv_backend='conda', python=py_versions, reuse_venv=True) +def pylint_nb(session): + session.run('conda', 'env', 'update', '--prefix', session.virtualenv.location, + '--file', 'docker/metatlas_env.yaml', silent=True) + session.install('--no-deps', *nbqa_deps, 'pylint') + session.run('nbqa', 'pylint', *notebooks) + + +@nox.session(python=py_versions[0]) +def flake8_nb(session): + session.install(*nbqa_deps, *flake8_deps) + session.run('nbqa', 'flake8', *notebooks) + + +@nox.session(python=py_versions[0]) +def black_nb(session): + session.install('black', *nbqa_deps) + session.run('nbqa', 'black', '--check', *notebooks) + + +@nox.session(python=py_versions[0]) +def blacken_nb(session): + """this modifies notebook files to meet black's requirements""" + session.install('black', *nbqa_deps) + session.run('nbqa', 'black', '--nbqa-mutate', *notebooks) + + @nox.session(venv_backend='conda', python=py_versions, reuse_venv=True) def unit_tests(session): session.run('conda', 'env', 'update', '--prefix', session.virtualenv.location, diff --git a/pyproject.toml b/pyproject.toml index 2bfe6b51..ab5095d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,3 +6,6 @@ disable = "C0330, C0326" [tool.pylint.format] max-line-length = "110" + +[tool.pytest.ini_options] +filterwarnings = "ignore::DeprecationWarning:dataset.*:" diff --git a/tests/unit/conftest.py b/tests/conftest.py similarity index 100% rename from tests/unit/conftest.py rename to tests/conftest.py diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/database.py b/tests/fixtures/database.py new file mode 100644 index 00000000..d4c26d7e --- /dev/null +++ b/tests/fixtures/database.py @@ -0,0 +1,20 @@ +# /pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring + +import getpass +import os +import pytest +import sqlite3 +from metatlas.datastructures import metatlas_objects as metob + + +@pytest.fixture(name="sqlite") +def fixture_sqlite(tmp_path): + # make sure we don't accidently pollute the production MySQL DB + assert os.environ.get("METATLAS_LOCAL") 
== "TRUE" + os.chdir(tmp_path) # don't reuse the sqlite DB + username = getpass.getuser() + sqlite3.connect(f"{username}_workspace.db").close() + dummy = metob.Atlas() + dummy.name = "this is a dummy atlas to initialize sqlite db" + metob.store(dummy) + # do I need to store each type of object? diff --git a/tests/fixtures/hdf5_fixtures.py b/tests/fixtures/hdf5_fixtures.py new file mode 100644 index 00000000..cfbb6d36 --- /dev/null +++ b/tests/fixtures/hdf5_fixtures.py @@ -0,0 +1,410 @@ +# pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring + +import pytest +import pandas as pd + + +@pytest.fixture(name="ms1_pos") +def fixture_ms1_pos(): + return pd.DataFrame( + data={ + "mz": { + "0": 252.1089324951, + "1": 252.1090087891, + "2": 252.1088104248, + "3": 252.1090087891, + "4": 252.10887146, + "5": 252.1089324951, + "6": 252.1089324951, + "7": 252.1088256836, + "8": 252.1088867188, + "9": 252.1090393066, + "10": 252.1089782715, + "11": 252.1089630127, + "12": 252.1089630127, + "13": 252.1089782715, + "14": 252.1090240479, + "15": 252.1089782715, + "16": 252.1090240479, + "17": 252.1089324951, + "18": 252.1090393066, + "19": 252.1088867188, + "20": 252.10887146, + "21": 252.1089324951, + "22": 252.1089630127, + "23": 252.1089935303, + "24": 252.1089172363, + "25": 252.1089477539, + "26": 252.1090545654, + "27": 252.1089630127, + "28": 252.1090240479, + "29": 252.1090087891, + "30": 252.1090393066, + "31": 252.1090240479, + "32": 252.1089935303, + "33": 252.1090240479, + "34": 252.1089630127, + "35": 252.1090087891, + "36": 252.1090240479, + "37": 252.1089172363, + "38": 252.1089019775, + "39": 252.1089477539, + "40": 252.1089324951, + "41": 252.1089477539, + "42": 252.1089477539, + "43": 252.1089477539, + "44": 252.1089782715, + "45": 252.1088867188, + "46": 252.1089172363, + "47": 252.1089324951, + "48": 252.1089782715, + "49": 252.1089477539, + "50": 252.1089172363, + "51": 252.1089324951, + "52": 252.1089630127, + "53": 252.1088867188, + "54": 252.1089630127, + "55": 252.1085205078, + "56": 252.1090545654, + "57": 252.1089935303, + "58": 252.1088104248, + "59": 252.1086578369, + "60": 252.1089935303, + "61": 252.1085510254, + "62": 252.1082763672, + "63": 252.1082458496, + "64": 252.1084136963, + "65": 252.1092224121, + "66": 252.1091766357, + "67": 252.1092834473, + "68": 252.1087493896, + "69": 252.1112518311, + "70": 252.1088409424, + "71": 252.1086425781, + "72": 252.1091766357, + "73": 252.1094055176, + }, + "i": { + "0": 312203.5, + "1": 387914.59375, + "2": 308308.5, + "3": 334653.59375, + "4": 339521.625, + "5": 345527.21875, + "6": 292437.34375, + "7": 413614.53125, + "8": 300285.28125, + "9": 383848.71875, + "10": 404313.21875, + "11": 377231.34375, + "12": 453965.5625, + "13": 431327.0, + "14": 523180.0625, + "15": 510239.8125, + "16": 631459.1875, + "17": 807419.5, + "18": 842647.5625, + "19": 1053031.625, + "20": 1082361.625, + "21": 1198966.625, + "22": 1109162.375, + "23": 1126347.125, + "24": 1373071.5, + "25": 1589018.375, + "26": 1281309.875, + "27": 1660166.75, + "28": 1492912.25, + "29": 2029801.5, + "30": 2029874.125, + "31": 2035966.625, + "32": 2010867.875, + "33": 2036981.375, + "34": 2148879.25, + "35": 2359861.25, + "36": 2054066.125, + "37": 1691976.0, + "38": 1778159.125, + "39": 1776166.125, + "40": 1752154.125, + "41": 1575676.875, + "42": 1199910.625, + "43": 1259708.25, + "44": 1087384.375, + "45": 826077.125, + "46": 802296.875, + "47": 547785.125, + "48": 545340.0625, + "49": 584624.4375, + "50": 468524.8125, + "51": 
305931.1875, + "52": 330310.34375, + "53": 309740.625, + "54": 289212.71875, + "55": 230440.9375, + "56": 210549.390625, + "57": 169972.390625, + "58": 140521.234375, + "59": 116637.953125, + "60": 117197.625, + "61": 84652.1171875, + "62": 117615.578125, + "63": 103500.921875, + "64": 89320.9453125, + "65": 76313.9296875, + "66": 55575.00390625, + "67": 76784.6796875, + "68": 28829.162109375, + "69": 26051.6171875, + "70": 42957.18359375, + "71": 50342.6953125, + "72": 37611.33984375, + "73": 38202.83203125, + }, + "rt": { + "0": 2.1030805111, + "1": 2.1084616184, + "2": 2.1139531136, + "3": 2.1193552017, + "4": 2.1248509884, + "5": 2.1302509308, + "6": 2.135682106, + "7": 2.1411821842, + "8": 2.1459801197, + "9": 2.1513926983, + "10": 2.1568279266, + "11": 2.1622362137, + "12": 2.1676549911, + "13": 2.1730883121, + "14": 2.179015398, + "15": 2.1845297813, + "16": 2.1900422573, + "17": 2.1949694157, + "18": 2.20002985, + "19": 2.2055358887, + "20": 2.2110378742, + "21": 2.2165191174, + "22": 2.2219588757, + "23": 2.2273921967, + "24": 2.2328462601, + "25": 2.2382712364, + "26": 2.2437169552, + "27": 2.2492566109, + "28": 2.2547125816, + "29": 2.2601687908, + "30": 2.2656960487, + "31": 2.2704958916, + "32": 2.2758042812, + "33": 2.2813498974, + "34": 2.2868082523, + "35": 2.2922415733, + "36": 2.2976748943, + "37": 2.3031060696, + "38": 2.308131218, + "39": 2.313628912, + "40": 2.3185498714, + "41": 2.3239560127, + "42": 2.3293914795, + "43": 2.3349123001, + "44": 2.3403663635, + "45": 2.346799612, + "46": 2.3522267342, + "47": 2.3576600552, + "48": 2.3631224632, + "49": 2.3685662746, + "50": 2.3740911484, + "51": 2.3794057369, + "52": 2.3848536015, + "53": 2.3903660774, + "54": 2.3953785896, + "55": 2.4006638527, + "56": 2.4062638283, + "57": 2.411709547, + "58": 2.4171659946, + "59": 2.4226117134, + "60": 2.4302260876, + "61": 2.4357616901, + "62": 2.4407405853, + "63": 2.4461927414, + "64": 2.451615572, + "65": 2.4571509361, + "66": 2.4627010822, + "67": 2.4681572914, + "68": 2.4735822678, + "69": 2.4735822678, + "70": 2.4787945747, + "71": 2.4842174053, + "72": 2.4896612167, + "73": 2.495146513, + }, + "polarity": { + "0": 1, + "1": 1, + "2": 1, + "3": 1, + "4": 1, + "5": 1, + "6": 1, + "7": 1, + "8": 1, + "9": 1, + "10": 1, + "11": 1, + "12": 1, + "13": 1, + "14": 1, + "15": 1, + "16": 1, + "17": 1, + "18": 1, + "19": 1, + "20": 1, + "21": 1, + "22": 1, + "23": 1, + "24": 1, + "25": 1, + "26": 1, + "27": 1, + "28": 1, + "29": 1, + "30": 1, + "31": 1, + "32": 1, + "33": 1, + "34": 1, + "35": 1, + "36": 1, + "37": 1, + "38": 1, + "39": 1, + "40": 1, + "41": 1, + "42": 1, + "43": 1, + "44": 1, + "45": 1, + "46": 1, + "47": 1, + "48": 1, + "49": 1, + "50": 1, + "51": 1, + "52": 1, + "53": 1, + "54": 1, + "55": 1, + "56": 1, + "57": 1, + "58": 1, + "59": 1, + "60": 1, + "61": 1, + "62": 1, + "63": 1, + "64": 1, + "65": 1, + "66": 1, + "67": 1, + "68": 1, + "69": 1, + "70": 1, + "71": 1, + "72": 1, + "73": 1, + }, + } + ) + + +@pytest.fixture(name="ms2_pos") +def fixture_ms2_pos(): + return pd.DataFrame( + data={ + "mz": { + "0": 252.1081695557, + "1": 252.1564941406, + "2": 252.1087036133, + "3": 252.1572875977, + "4": 252.1089019775, + "5": 252.1550292969, + "6": 252.1090698242, + "7": 252.1557617188, + }, + "i": { + "0": 32103.3515625, + "1": 6470.0009765625, + "2": 93112.0859375, + "3": 7624.11328125, + "4": 131062.0, + "5": 6535.4560546875, + "6": 76976.7265625, + "7": 6090.6440429688, + }, + "rt": { + "0": 2.0097544193, + "1": 2.0097544193, + "2": 2.2203779221, + "3": 2.2203779221, + 
"4": 2.327804327, + "5": 2.327804327, + "6": 2.3452186584, + "7": 2.3452186584, + }, + "polarity": {"0": 1, "1": 1, "2": 1, "3": 1, "4": 1, "5": 1, "6": 1, "7": 1}, + "precursor_MZ": { + "0": 252.0195159912, + "1": 252.0195159912, + "2": 252.10887146, + "3": 252.10887146, + "4": 252.0194854736, + "5": 252.0194854736, + "6": 252.1089477539, + "7": 252.1089477539, + }, + "precursor_intensity": { + "0": 2748235.5, + "1": 2748235.5, + "2": 2872807.5, + "3": 2872807.5, + "4": 3536752.25, + "5": 3536752.25, + "6": 3046732.75, + "7": 3046732.75, + }, + "collision_energy": { + "0": 23.3333339691, + "1": 23.3333339691, + "2": 23.3333339691, + "3": 23.3333339691, + "4": 23.3333339691, + "5": 23.3333339691, + "6": 23.3333339691, + "7": 23.3333339691, + }, + } + ) + + +@pytest.fixture(name="ms1_neg_empty") +def fixture_ms1_neg_empty(): + return pd.DataFrame(data={"mz": {}, "i": {}, "rt": {}, "polarity": {}}) + + +@pytest.fixture(name="ms2_neg_empty") +def fixture_ms2_neg_empty(): + return pd.DataFrame( + data={ + "mz": {}, + "i": {}, + "rt": {}, + "polarity": {}, + "precursor_MZ": {}, + "precursor_intensity": {}, + "collision_energy": {}, + } + ) + + +@pytest.fixture +def df_container(ms1_pos, ms2_pos, ms1_neg_empty, ms2_neg_empty): + return {"ms1_neg": ms1_neg_empty, "ms1_pos": ms1_pos, "ms2_neg": ms2_neg_empty, "ms2_pos": ms2_pos} diff --git a/tests/fixtures/metatlas_dataset_fixtures.py b/tests/fixtures/metatlas_dataset_fixtures.py index 8d228141..b8e5c5b6 100644 --- a/tests/fixtures/metatlas_dataset_fixtures.py +++ b/tests/fixtures/metatlas_dataset_fixtures.py @@ -1,11 +1,13 @@ +# pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring, too-many-arguments + import pytest import numpy as np from metatlas.datastructures import metatlas_dataset as mads -@pytest.fixture -def ms1_summary(): +@pytest.fixture(name="ms1_summary") +def fixture_ms1_summary(): return { "num_ms1_datapoints": 85.0, "mz_peak": 252.1092987060547, @@ -17,8 +19,8 @@ def ms1_summary(): } -@pytest.fixture -def msms(): +@pytest.fixture(name="msms") +def fixture_msms(): return { "data": { "mz": np.array([], dtype=np.float64), @@ -32,24 +34,24 @@ def msms(): } -@pytest.fixture -def metatlas_dataset(lcmsrun, group, compound_identification, eic, ms1_summary, msms): - return [ - [ - { - "atlas_name": "my_atlas", - "atlas_unique_id": "c3de0a6f-a49d-47ea-9c39-edd29c1cf9bb", - "lcmsrun": lcmsrun, - "group": group, - "identification": compound_identification, - "data": {"eic": eic, "ms1_summary": ms1_summary, "msms": msms}, - } - ] - ] +@pytest.fixture(name="metatlas_dataset") +def fixture_metatlas_dataset(mocker, df_container, atlas, group): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + return mads.MetatlasDataset(atlas, [group]) + + +@pytest.fixture(name="metatlas_dataset_with_2_cids") +def fixture_metatlas_dataset_with_2_cids(mocker, df_container, atlas_with_2_cids, group): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + return mads.MetatlasDataset(atlas_with_2_cids, [group]) -@pytest.fixture -def eic(): +@pytest.fixture(name="eic") +def fixture_eic(): return { "mz": [ 252.1089324951172, @@ -318,6 +320,6 @@ def eic(): } -@pytest.fixture -def atlas_df(atlas, group): +@pytest.fixture(name="atlas_df") +def fixture_atlas_df(atlas, group): return mads.MetatlasDataset(atlas, [group]).atlas_df diff --git a/tests/fixtures/metatlas_object_fixtures.py 
b/tests/fixtures/metatlas_object_fixtures.py index 86128b0a..81945355 100644 --- a/tests/fixtures/metatlas_object_fixtures.py +++ b/tests/fixtures/metatlas_object_fixtures.py @@ -1,9 +1,12 @@ -from metatlas.datastructures import metatlas_objects as metob +# pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring + +import pandas as pd import pytest +from metatlas.datastructures import metatlas_objects as metob -@pytest.fixture -def compound(): +@pytest.fixture(name="compound") +def fixture_compound(): compound = metob.Compound() compound.unique_id = "60cd6743e56545c6a6cb066ec3553450" compound.mono_isotopic_molecular_weight = 251.101839276 @@ -43,8 +46,8 @@ def compound(): return compound -@pytest.fixture -def rt_reference(): +@pytest.fixture(name="rt_reference") +def fixture_rt_reference(): rt_ref = metob.RtReference() rt_ref.unique_id = "a845ddfdf8ef4713bcef3bdb84999030" rt_ref.username = "wjholtz" @@ -64,8 +67,8 @@ def rt_reference(): return rt_ref -@pytest.fixture -def mz_reference(): +@pytest.fixture(name="mz_reference") +def fixture_mz_reference(): mz_ref = metob.MzReference() mz_ref.unique_id = "eb6d03c9ef574051b92dad7b2fc259a2" mz_ref.username = "wjholtz" @@ -88,8 +91,8 @@ def mz_reference(): return mz_ref -@pytest.fixture -def compound_identification(compound, rt_reference, mz_reference): +@pytest.fixture(name="compound_identification") +def fixture_compound_identification(compound, rt_reference, mz_reference): ident = metob.CompoundIdentification() ident.unique_id = "18737c7141cc4efaa4545bead13ac751" ident.username = "wjholtz" @@ -97,7 +100,6 @@ def compound_identification(compound, rt_reference, mz_reference): ident.creation_time = 1613002849 ident.last_modified = 1613002979 ident.identification_grade = None - ident.compound = [compound] ident.prev_uid = "origin" ident.name = "2'-deoxyadenosine" ident.head_id = "18737c7141cc4efaa4545bead13ac751" @@ -109,30 +111,145 @@ def compound_identification(compound, rt_reference, mz_reference): ident.ms1_notes = "keep" ident.frag_references = [] ident.intensity_references = [] + ident.compound = [compound] ident.mz_references = [mz_reference] ident.rt_references = [rt_reference] return ident -@pytest.fixture -def atlas(compound_identification): +@pytest.fixture(name="atlas") +def fixture_atlas(compound_identification): small_atlas = metob.Atlas() small_atlas.compound_identifications = [compound_identification] return small_atlas -@pytest.fixture -def atlas_two_compounds(compound_identification): +@pytest.fixture(name="compound_2") +def fixture_compound_2(): + compound = metob.Compound() + compound.chebi_id = "CHEBI:16335" + compound.chebi_url = "http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:16335" + compound.creation_time = 1466212384 + compound.description = "A ribonucleoside composed of a molecule of adenine attached to a ribofuranose moiety via a beta1N9-glycosidic bond."
+ compound.formula = "C10H13N5O4" + compound.head_id = "1ad02275f47b4033a451e99874f4764f" + compound.hmdb_id = "HMDB00050" + compound.hmdb_url = "http://www.hmdb.ca/metabolites/HMDB00050" + compound.img_abc_id = "" + compound.inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1" + compound.inchi_key = "OIRDTQYFTABQOQ-KQYNXXCUSA-N" + compound.iupac_name = "" + compound.kegg_id = "C00212" + compound.kegg_url = "http://www.genome.jp/dbget-bin/www_bget?C00212" + compound.last_modified = 1612996604 + compound.lipidmaps_id = "" + compound.lipidmaps_url = "" + compound.metacyc_id = "ADENOSINE" + compound.mono_isotopic_molecular_weight = 267.096753896 + compound.name = "adenosine" + compound.neutralized_2d_inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)" + compound.neutralized_2d_inchi_key = "OIRDTQYFTABQOQ-UHFFFAOYSA-N" + compound.neutralized_inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1" + compound.neutralized_inchi_key = "OIRDTQYFTABQOQ-KQYNXXCUSA-N" + compound.num_free_radicals = 0 + compound.number_components = 1 + compound.permanent_charge = 0 + compound.prev_uid = "origin" + compound.pubchem_compound_id = "60961" + compound.pubchem_url = "http://pubchem.ncbi.nlm.nih.gov/compound/60961" + compound.source = "chebi///wikidata///metacyc///gnps///hmdb" + compound.synonyms = "adenosine///58-61-7///Adenocard///Adenoscan" # this value was pruned down + compound.unique_id = "1ad02275f47b4033a451e99874f4764f" + compound.username = "wjholtz" + compound.wikipedia_url = "" + return compound + + +@pytest.fixture(name="rt_reference_2") +def fixture_rt_reference_2(): + rt_ref = metob.RtReference() + rt_ref.creation_time = 1613002857 + rt_ref.description = "No description" + rt_ref.enabled = True + rt_ref.head_id = "f74622bcef924f5390ba6e127633e731" + rt_ref.last_modified = 1613002980 + rt_ref.lcms_run = None + rt_ref.name = "Untitled" + rt_ref.prev_uid = "origin" + rt_ref.ref_type = "" + rt_ref.rt_max = 3.5233184079926665 + rt_ref.rt_min = 2.5233184079926665 + rt_ref.rt_peak = 3.0233184079926665 + rt_ref.rt_units = "min" + rt_ref.unique_id = "f74622bcef924f5390ba6e127633e731" + rt_ref.username = "wjholtz" + return rt_ref + + +@pytest.fixture(name="mz_reference_2") +def fixture_mz_reference_2(): + mz_ref = metob.MzReference() + mz_ref.adduct = "[M+H]+" + mz_ref.creation_time = 1613002857 + mz_ref.description = "No description" + mz_ref.detected_polarity = "positive" + mz_ref.enabled = True + mz_ref.head_id = "b0e3cf0df44a4079be7908c6b525d3ac" + mz_ref.last_modified = 1613002980 + mz_ref.lcms_run = None + mz_ref.modification = "" + mz_ref.mz = 268.1040539 + mz_ref.mz_tolerance = 20.0 + mz_ref.mz_tolerance_units = "ppm" + mz_ref.name = "Untitled" + mz_ref.observed_formula = "" + mz_ref.prev_uid = "origin" + mz_ref.ref_type = "" + mz_ref.unique_id = "b0e3cf0df44a4079be7908c6b525d3ac" + mz_ref.username = "wjholtz" + return mz_ref + + +@pytest.fixture(name="compound_identification_2") +def fixture_compound_identification_2(compound_2, rt_reference_2, mz_reference_2): + ident = metob.CompoundIdentification() + ident.creation_time = 1613002856 + ident.description = "No description" + ident.do_normalization = False + ident.frag_references = [] + ident.head_id = "6cca7aa44c0e4a109f695ba980d69472" + ident.identification_grade = None + ident.identification_notes = 
"" + ident.intensity_references = [] + ident.internal_standard_id = "" + ident.internal_standard_to_use = "" + ident.last_modified = 1613002980 + ident.ms1_notes = "" + ident.ms2_notes = "" + ident.name = "adenosine" + ident.prev_uid = "origin" + ident.unique_id = "6cca7aa44c0e4a109f695ba980d69472" + ident.username = "wjholtz" + ident.frag_references = [] + ident.intensity_references = [] + ident.compound = [compound_2] + ident.mz_references = [mz_reference_2] + ident.rt_references = [rt_reference_2] + return ident + + +@pytest.fixture(name="atlas_with_2_cids") +def fixture_atlas_with_2_cids(compound_identification, compound_identification_2): small_atlas = metob.Atlas() small_atlas.compound_identifications = [ compound_identification, - compound_identification, + compound_identification_2, ] return small_atlas -@pytest.fixture -def lcmsrun(): +@pytest.fixture(name="lcmsrun") +def fixture_lcmsrun(): run = metob.LcmsRun() run.unique_id = "7ce51039cfca4426b4e51999ac45d018" run.username = "root" @@ -154,8 +271,8 @@ def lcmsrun(): return run -@pytest.fixture -def group(lcmsrun): +@pytest.fixture(name="group") +def fixture_group(lcmsrun): grp = metob.Group() grp.items = [lcmsrun] grp.unique_id = "61041d07b5a24ca5b88efbda8f319654" @@ -168,3 +285,362 @@ def group(lcmsrun): grp.head_id = "61041d07b5a24ca5b88efbda8f319654" grp.short_name = "POS_Cone-S1" return grp + + +@pytest.fixture(name="group_with_2_lcmsruns") +def fixture_group_with_2_lcmsruns(lcmsrun): + grp = metob.Group() + grp.items = [lcmsrun, lcmsrun] + grp.unique_id = "61041d07b5a24ca5b88efbda8f319654" + grp.username = "root" + grp.description = "No description" + grp.creation_time = 1620146477 + grp.last_modified = 1620146477 + grp.prev_uid = "origin" + grp.name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S1" + grp.head_id = "61041d07b5a24ca5b88efbda8f319654" + grp.short_name = "POS_Cone-S1" + return grp + + +@pytest.fixture(name="hits") +def fixture_hits(): + hits_plus = pd.DataFrame( + data={ + "score": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 0.7253785748, + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 0.8688691781, + }, + "num_matches": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 6, + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 7, + }, + "msv_query_aligned": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ + [ + None, + None, + None, + None, + 56.7212257385, + 59.0436058044, + 71.0422821045, + 73.0214157104, + None, + 89.1910018921, + 99.0413742065, + 104.3592529297, + 104.3681869507, + 117.0548171997, + None, + 118.9432754517, + 136.0619506836, + None, + None, + None, + 145.9665527344, + 163.9772491455, + 169.9678497314, + 177.1133270264, + 187.9771575928, + 
205.9878387451, + 210.9933166504, + 229.0038452148, + 252.0215606689, + 252.1087036133, + 252.1572875977, + 252.2064666748, + ], + [ + None, + None, + None, + None, + 3361.7712402344, + 6589.943359375, + 6501.9853515625, + 4987.177734375, + None, + 3257.0708007812, + 13393.138671875, + 3280.0544433594, + 4276.0112304688, + 57809.1875, + None, + 4965.7436523438, + 648640.5625, + None, + None, + None, + 11511.76171875, + 10362.68359375, + 5714.70703125, + 9354.2353515625, + 73409.0078125, + 257685.234375, + 53554.28125, + 193491.515625, + 5038.1469726562, + 93112.0859375, + 7624.11328125, + 4599.4125976562, + ], + ], + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ + [ + None, + 50.2002449036, + 55.0126533508, + 57.0280647278, + None, + None, + None, + None, + 68.2973327637, + 69.0266494751, + 73.0213851929, + None, + 74.6972732544, + 80.862159729, + 82.4692306519, + 85.0231246948, + 87.0394363403, + 92.4544296265, + 92.4610061646, + 104.3785171509, + 115.0390701294, + 126.1923675537, + 133.0496368408, + 136.0618743896, + None, + None, + None, + None, + 144.5760345459, + 181.1904449463, + 230.6756896973, + 268.1039733887, + ], + [ + None, + 87283.4296875, + 105163.625, + 246350.078125, + None, + None, + None, + None, + 81607.3046875, + 107886.640625, + 150512.90625, + None, + 99324.7109375, + 80050.4375, + 108701.53125, + 278198.71875, + 95401.265625, + 92632.890625, + 111341.5625, + 119245.7734375, + 170358.671875, + 103961.4296875, + 226297.9375, + 48576460.0, + None, + None, + None, + None, + 98098.609375, + 100016.9296875, + 119618.1015625, + 16002674.0, + ], + ], + }, + "msv_ref_aligned": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ + [ + 57.0345, + 63.3177, + 63.3205, + 69.0344, + None, + None, + 71.0499, + 73.0292, + 84.9778, + None, + 99.0447, + None, + None, + 117.055, + 118.059, + None, + 136.062, + 137.066, + 236.709, + 253.112, + None, + None, + None, + None, + None, + None, + None, + None, + None, + 252.109, + None, + None, + ], + [ + 176328.0, + 328818.0, + 274432.0, + 197637.0, + None, + None, + 896360.0, + 1192020.0, + 378547.0, + None, + 3921880.0, + None, + None, + 15737700.0, + 266131.0, + None, + 144220000.0, + 3455270.0, + 185227.0, + 1284450.0, + None, + None, + None, + None, + None, + None, + None, + None, + None, + 20960800.0, + None, + None, + ], + ], + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ + [ + 56.7603, + None, + None, + 57.0346, + 61.0292, + 61.8182, + 64.9491, + 67.9255, + None, + None, + 73.0292, + 82.0663, + None, + None, + None, + 85.0293, + None, + None, + None, + None, + 115.04, + None, + 133.05, + 136.062, + 137.067, + 183.555, + 230.198, + 269.108, + None, + None, + None, + 268.105, + ], + [ + 35523.7, + None, + None, + 184839.0, + 43216.2, + 40066.3, + 40362.0, + 41550.6, + None, + None, + 93791.1, + 293258.0, + None, + None, + None, + 202756.0, + None, + None, + None, + None, + 184050.0, + None, + 364543.0, + 29646700.0, + 830130.0, + 51455.4, + 51206.7, + 970064.0, + None, + None, + None, + 12412800.0, + ], + ], + }, + "name": { + "('metatlas', 
'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "2'-deoxyadenosine", + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "adenosine", + }, + "adduct": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "[M+H]+", + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "[M+H]+", + }, + "inchi_key": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "OIRDTQYFTABQOQ-KQYNXXCUSA-N", + }, + "precursor_mz": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.1091393, + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.1040539, + }, + "measured_precursor_mz": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.10887146, + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.103729248, + }, + "measured_precursor_intensity": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 2872807.5, + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 75979424.0, + }, + "copy_index": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ + "metatlas", + "c7dddd297e104ca79caea72a90150532", + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", + 2.2203779221, + ], + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ + "metatlas", + 
"cf5e8df145f64bf0856fbf852d1bdb64", + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5", + 3.0264527798, + ], + }, + } + ) + hits_plus.index = pd.MultiIndex.from_tuples( + hits_plus["copy_index"], names=["database", "id", "file_name", "msms_scan"] + ) + hits_plus.drop(columns=["copy_index"], inplace=True) + return hits_plus diff --git a/tests/system/__init__.py b/tests/system/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index edd3a452..e8e84344 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -1,3 +1,5 @@ +# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long + import os import subprocess diff --git a/tests/unit/test_dill2plot.py b/tests/unit/test_dill2plot.py index 7ad020ff..e6e0f535 100644 --- a/tests/unit/test_dill2plot.py +++ b/tests/unit/test_dill2plot.py @@ -1,5 +1,5 @@ -""" dill2plot tests """ -# pylint: disable=missing-function-docstring +# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long + from metatlas.plots import dill2plots @@ -82,15 +82,15 @@ def test_export_atlas_to_spreadsheet(atlas): assert expected == dill2plots.export_atlas_to_spreadsheet(atlas).to_json().replace(r"\/", "/") -def test_filter_atlas01(atlas_df, metatlas_dataset): - assert len(dill2plots.filter_atlas(atlas_df, metatlas_dataset, 1, 3e5)) == 1 - assert len(dill2plots.filter_atlas(atlas_df, metatlas_dataset, 1, 4e5)) == 0 - assert len(dill2plots.filter_atlas(atlas_df, metatlas_dataset, 80, 1e4)) == 1 - assert len(dill2plots.filter_atlas(atlas_df, metatlas_dataset, 90, 1e4)) == 0 +def test_filter_atlas01(metatlas_dataset): + assert len(dill2plots.filter_atlas(metatlas_dataset.atlas_df, metatlas_dataset, 1, 2.30e6)) == 1 + assert len(dill2plots.filter_atlas(metatlas_dataset.atlas_df, metatlas_dataset, 1, 2.36e6)) == 0 + assert len(dill2plots.filter_atlas(metatlas_dataset.atlas_df, metatlas_dataset, 73, 1e4)) == 1 + assert len(dill2plots.filter_atlas(metatlas_dataset.atlas_df, metatlas_dataset, 74, 1e4)) == 0 def test_strong_signal_compound_idxs(metatlas_dataset): - assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 1, 3e5) == [0] - assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 1, 4e5) == [] - assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 80, 1e4) == [0] - assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 90, 1e4) == [] + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 1, 2.30e6) == [0] + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 1, 2.36e6) == [] + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 73, 1e4) == [0] + assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 74, 1e4) == [] diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 70628706..729d54f8 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -1,3 +1,330 @@ """ tests for MetatasDataset """ -# pylint: disable=missing-function-docstring -# from metatlas.datastructures import metatlas_dataset as mads +# pylint: disable=missing-function-docstring, protected-access + +import datetime +import pandas as pd +import pytest +from metatlas.datastructures import metatlas_dataset as mads +from metatlas.datastructures import metatlas_objects as metob + + +def 
test_metatlas_dataset_build01(mocker, metatlas_dataset): + assert len(metatlas_dataset) == 1 + assert len(metatlas_dataset[0]) == 1 + assert metatlas_dataset[0][0]["identification"].compound[0].inchi_key == "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" + assert metatlas_dataset[0][0]["data"]["ms1_summary"]["rt_peak"] == 2.2922415733 + assert ( + metatlas_dataset[0][0]["lcmsrun"].experiment + == "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" + ) + + +@pytest.mark.xfail +def test_metatlas_dataset_build02(mocker, atlas, group_with_2_lcmsruns, df_container): + # need to mock multiprocessing for this to work + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + metatlas_dataset = mads.MetatlasDataset(atlas, [group_with_2_lcmsruns], max_cpus=2) + assert len(metatlas_dataset) == 2 + assert len(metatlas_dataset[0]) == 1 + + +def test_filter_compounds_ms1_notes_remove01(mocker, metatlas_dataset_with_2_cids, compound): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + metatlas_dataset = metatlas_dataset_with_2_cids + metatlas_dataset.filter_compounds_ms1_notes_remove() + assert len(metatlas_dataset[0]) == 2 + metatlas_dataset.set_note(1, "ms1_notes", "Remove") + metatlas_dataset.filter_compounds_ms1_notes_remove() + assert len(metatlas_dataset[0]) == 1 + + +def test_filter_compounds01(mocker, metatlas_dataset_with_2_cids, compound): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + metatlas_dataset = metatlas_dataset_with_2_cids + metatlas_dataset.filter_compounds(remove_idxs=[]) + assert len(metatlas_dataset[0]) == 2 + metatlas_dataset.filter_compounds(keep_idxs=[0, 1]) + assert len(metatlas_dataset[0]) == 2 + metatlas_dataset.filter_compounds(keep_idxs=[]) + assert len(metatlas_dataset[0]) == 0 + with pytest.raises(ValueError): + metatlas_dataset.filter_compounds() + + +def test_filter_compounds02(mocker, metatlas_dataset_with_2_cids, compound): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + metatlas_dataset = metatlas_dataset_with_2_cids + with pytest.raises(ValueError): + metatlas_dataset.filter_compounds(keep_idxs=[0], remove_idxs=[1]) + + +def test_filter_compounds03(mocker, metatlas_dataset, compound): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + with pytest.raises(IndexError): + metatlas_dataset.filter_compounds(keep_idxs=[999]) + + +def test_filter_compounds04(mocker, metatlas_dataset, compound): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + with pytest.raises(IndexError): + metatlas_dataset.filter_compounds(remove_idxs=[999]) + + +def test_filter_hits_by_atlas01(mocker, metatlas_dataset_with_2_cids, hits, compound): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + hits = metatlas_dataset_with_2_cids.hits + start_num = len(hits) + metatlas_dataset_with_2_cids.filter_compounds(keep_idxs=[0]) + assert start_num > len(metatlas_dataset_with_2_cids.hits) + metatlas_dataset_with_2_cids.filter_compounds(remove_idxs=[0]) + assert len(metatlas_dataset_with_2_cids.hits) == 0 + + +def test_polarity(metatlas_dataset): + assert metatlas_dataset.polarity == "positive" + metatlas_dataset.filter_compounds(remove_idxs=[0]) + assert len(metatlas_dataset[0]) == 0 + assert 
metatlas_dataset.polarity == "positive" + + +def test_extra_time_setter(metatlas_dataset, hits, mocker): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) + metatlas_dataset.hits + assert metatlas_dataset._hits_valid + metatlas_dataset.extra_time = 0.3 + assert not metatlas_dataset._hits_valid + metatlas_dataset.hits + assert metatlas_dataset._hits_valid + + +def test_rts01(metatlas_dataset): + metatlas_dataset.set_rt(0, "rt_min", 9.99) + assert metatlas_dataset.rts[0].rt_min == 9.99 + assert len(metatlas_dataset.rts) == 1 + + +def test_rts02(metatlas_dataset): + metatlas_dataset._atlas_df_valid = False + metatlas_dataset.set_rt(0, "rt_max", 9.99) + assert metatlas_dataset.rts[0].rt_max == 9.99 + assert len(metatlas_dataset.rts) == 1 + + +def test_set_note01(metatlas_dataset): + metatlas_dataset.set_note(0, "ms2_notes", "Foobar") + assert metatlas_dataset[0][0]["identification"].ms2_notes == "Foobar" + + +def test_set_note02(metatlas_dataset): + metatlas_dataset._atlas_df_valid = False + metatlas_dataset.set_note(0, "ms1_notes", "keeper") + assert metatlas_dataset[0][0]["identification"].ms1_notes == "keeper" + + +def test_compound_indices_marked_remove01(metatlas_dataset): + assert len(metatlas_dataset.compound_indices_marked_remove()) == 0 + metatlas_dataset.set_note(0, "ms1_notes", "REMOVE") + assert len(metatlas_dataset.compound_indices_marked_remove()) == 1 + + +def test_set_nested01(): + with pytest.raises(ValueError): + mads._set_nested([], [], 0) + + +def test_set_nested02(atlas): + mads._set_nested(atlas, ["compound_identifications", 0, ("compound",), 0, ("inchi_key",)], "FOOBAR") + assert atlas.compound_identifications[0].compound[0].inchi_key == "FOOBAR" + + +def test_set_nested03(atlas): + mads._set_nested(atlas, ["name"], "My Atlas") + assert atlas.name == "My Atlas" + + +def test_set_nested04(atlas): + with pytest.raises(TypeError): + mads._set_nested(atlas, ["zoop"], None) + + +def test_set_nested05(): + my_dict = {} + mads._set_nested(my_dict, ["zoop"], None) + assert my_dict["zoop"] is None + + +def test_error_if_bad_idxs(): + data = pd.DataFrame(data={"a": [1, 2], "b": [3, 4]}) + mads._error_if_bad_idxs(data, [0]) + with pytest.raises(IndexError): + mads._error_if_bad_idxs(data, [2]) + + +def test_is_remove(): + assert not mads._is_remove([]) + assert not mads._is_remove("foobar") + assert mads._is_remove("Remove") + assert mads._is_remove("REMOVE AND MORE") + + +def test_duration_since(): + assert mads._duration_since(datetime.datetime.now()) == "0.00 seconds" + + +def test_filter_compounds_by_signal01(mocker, metatlas_dataset_with_2_cids, df_container, compound): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + assert len(metatlas_dataset_with_2_cids[0]) == 2 + metatlas_dataset_with_2_cids.filter_compounds_by_signal(73, 2.30e6) + assert len(metatlas_dataset_with_2_cids[0]) == 1 + metatlas_dataset_with_2_cids.filter_compounds_by_signal(73, 2.36e6) + assert len(metatlas_dataset_with_2_cids[0]) == 0 + + +def test_filter_compounds_by_signal02(mocker, metatlas_dataset_with_2_cids, df_container): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + assert len(metatlas_dataset_with_2_cids[0]) == 2 + metatlas_dataset_with_2_cids.filter_compounds_by_signal(74, 1e5) + assert len(metatlas_dataset_with_2_cids[0]) == 
0 + + +def test_export_atlas_to_csv01(metatlas_dataset, tmp_path): + out_file = tmp_path / "export.csv" + metatlas_dataset.export_atlas_to_csv(out_file) + in_df = pd.read_csv(out_file) + assert list(in_df.columns) == [ + "Unnamed: 0", + "chebi_id", + "chebi_url", + "creation_time", + "description", + "formula", + "head_id", + "hmdb_id", + "hmdb_url", + "img_abc_id", + "inchi", + "inchi_key", + "iupac_name", + "kegg_id", + "kegg_url", + "last_modified", + "lipidmaps_id", + "lipidmaps_url", + "metacyc_id", + "mono_isotopic_molecular_weight", + "name", + "neutralized_2d_inchi", + "neutralized_2d_inchi_key", + "neutralized_inchi", + "neutralized_inchi_key", + "num_free_radicals", + "number_components", + "permanent_charge", + "prev_uid", + "pubchem_compound_id", + "pubchem_url", + "source", + "synonyms", + "unique_id", + "username", + "wikipedia_url", + "label", + "id_notes", + "ms1_notes", + "ms2_notes", + "identification_notes", + "rt_min", + "rt_max", + "rt_peak", + "mz", + "mz_tolerance", + "adduct", + "polarity", + ] + assert len(in_df) == 1 + assert in_df.loc[0, "inchi_key"] == "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" + + +def test_setitem01(metatlas_dataset): + metatlas_dataset[0] = None + assert metatlas_dataset[0] is None + + +def test_atlas_setter01(metatlas_dataset, atlas_with_2_cids): + metatlas_dataset.data + metatlas_dataset.atlas = atlas_with_2_cids + assert not metatlas_dataset._data_valid + assert len(metatlas_dataset[0]) == 2 + + +def test_atlas_setter02(metatlas_dataset): + with pytest.raises(TypeError): + metatlas_dataset.atlas = [1, 2] + + +def test_groups01(metatlas_dataset): + assert metatlas_dataset.groups[0].short_name == "POS_Cone-S1" + + +def test_set_groups01(metatlas_dataset): + metatlas_dataset.data + metatlas_dataset.groups = None + assert not metatlas_dataset._data_valid + assert metatlas_dataset.groups is None + + +def test_set_extra_mz_setter(metatlas_dataset, mocker, hits): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) + metatlas_dataset.data + metatlas_dataset.hits + metatlas_dataset.extra_mz = 0.43 + assert not metatlas_dataset._data_valid + assert not metatlas_dataset._hits_valid + assert metatlas_dataset.extra_mz == 0.43 + + +def test_set_keep_nonmatches_setter(metatlas_dataset, mocker, hits): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) + metatlas_dataset.hits + metatlas_dataset.keep_nonmatches = False + assert not metatlas_dataset._hits_valid + assert not metatlas_dataset.keep_nonmatches + + +def test_set_frag_mz_tolerance_setter(metatlas_dataset, mocker, hits): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) + metatlas_dataset.hits + metatlas_dataset.frag_mz_tolerance = 1e-4 + assert not metatlas_dataset._hits_valid + assert metatlas_dataset.frag_mz_tolerance == 1e-4 + + +def test_set_msms_refs_loc_setter(metatlas_dataset, mocker, hits): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) + metatlas_dataset.hits + metatlas_dataset.msms_refs_loc = "/tmp/some_file.tab" + assert not metatlas_dataset._hits_valid + assert metatlas_dataset.msms_refs_loc == "/tmp/some_file.tab" + + +def test_set_data01(metatlas_dataset): + metatlas_dataset.set_data([0, 0, "identification", "ms2_notes"], "exact match") + assert metatlas_dataset[0][0]["identification"].ms2_notes == "exact match" + + +def test_store_atlas01(metatlas_dataset, sqlite): + metatlas_dataset.atlas.name = "test_store_atlas01" + metatlas_dataset.store_atlas() + atlas_list =
metob.retrieve("atlases", name=metatlas_dataset.atlas.name, username="*") + assert len(atlas_list) == 1 + metatlas_dataset.store_atlas(even_if_exists=True) + with pytest.raises(ValueError): + metatlas_dataset.store_atlas() diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index f34db9be..27a9c1f2 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -4,11 +4,10 @@ from metatlas.io import metatlas_get_data_helper_fun as gdhf -def test_make_atlas_df(atlas_two_compounds): +def test_make_atlas_df(atlas_with_2_cids): # pylint: disable=line-too-long - expected = """{"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"compound_name":{"0":"2\'-deoxyadenosine"},"rt_max":{"0":2.6964640054},"rt_min":{"0":1.6964640054},"rt_peak":{"0":2.1964640054},"rt_units":{"0":"min"},"detected_polarity":{"0":"positive"},"mz":{"0":252.1091393},"mz_tolerance":{"0":20.0},"mz_tolerance_units":{"0":"ppm"},"mono_isotopic_molecular_weight":{"0":251.101839276},"pubchem_compound_id":{"0":"13730"},"synonyms":{"0":"2\'-deoxyadenosine"},"inchi":{"0":"InChI=1S\\/C10H13N5O3\\/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7\\/h3-7,16-17H,1-2H2,(H2,11,12,13)\\/t5-,6+,7+\\/m0\\/s1"},"adduct":{"0":"[M+H]+"},"label":{"0":"2\'-deoxyadenosine"},"ms1_notes":{"0":"keep"},"ms2_notes":{"0":"bad match to ref"},"identification_notes":{"0":"my id note"}}""" # noqa: E501 - expected = """{"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","1":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"compound_name":{"0":"2\'-deoxyadenosine","1":"2\'-deoxyadenosine"},"rt_max":{"0":2.6964640054,"1":2.6964640054},"rt_min":{"0":1.6964640054,"1":1.6964640054},"rt_peak":{"0":2.1964640054,"1":2.1964640054},"rt_units":{"0":"min","1":"min"},"detected_polarity":{"0":"positive","1":"positive"},"mz":{"0":252.1091393,"1":252.1091393},"mz_tolerance":{"0":20.0,"1":20.0},"mz_tolerance_units":{"0":"ppm","1":"ppm"},"mono_isotopic_molecular_weight":{"0":251.101839276,"1":251.101839276},"pubchem_compound_id":{"0":"13730","1":"13730"},"synonyms":{"0":"2\'-deoxyadenosine","1":"2\'-deoxyadenosine"},"inchi":{"0":"InChI=1S\\/C10H13N5O3\\/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7\\/h3-7,16-17H,1-2H2,(H2,11,12,13)\\/t5-,6+,7+\\/m0\\/s1","1":"InChI=1S\\/C10H13N5O3\\/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7\\/h3-7,16-17H,1-2H2,(H2,11,12,13)\\/t5-,6+,7+\\/m0\\/s1"},"adduct":{"0":"[M+H]+","1":"[M+H]+"},"label":{"0":"2\'-deoxyadenosine","1":"2\'-deoxyadenosine"},"ms1_notes":{"0":"keep","1":"keep"},"ms2_notes":{"0":"bad match to ref","1":"bad match to ref"},"identification_notes":{"0":"my id note","1":"my id note"}}""" # noqa: E501 - assert expected == gdhf.make_atlas_df(atlas_two_compounds).to_json() + expected = 
"""{"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","1":"OIRDTQYFTABQOQ-KQYNXXCUSA-N"},"compound_name":{"0":"2\'-deoxyadenosine","1":"adenosine"},"rt_max":{"0":2.6964640054,"1":3.523318408},"rt_min":{"0":1.6964640054,"1":2.523318408},"rt_peak":{"0":2.1964640054,"1":3.023318408},"rt_units":{"0":"min","1":"min"},"detected_polarity":{"0":"positive","1":"positive"},"mz":{"0":252.1091393,"1":268.1040539},"mz_tolerance":{"0":20.0,"1":20.0},"mz_tolerance_units":{"0":"ppm","1":"ppm"},"mono_isotopic_molecular_weight":{"0":251.101839276,"1":267.096753896},"pubchem_compound_id":{"0":"13730","1":"60961"},"synonyms":{"0":"2\'-deoxyadenosine","1":"adenosine\\/\\/\\/58-61-7\\/\\/\\/Adenocard\\/\\/\\/Adenoscan"},"inchi":{"0":"InChI=1S\\/C10H13N5O3\\/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7\\/h3-7,16-17H,1-2H2,(H2,11,12,13)\\/t5-,6+,7+\\/m0\\/s1","1":"InChI=1S\\/C10H13N5O4\\/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10\\/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)\\/t4-,6-,7-,10-\\/m1\\/s1"},"adduct":{"0":"[M+H]+","1":"[M+H]+"},"label":{"0":"2\'-deoxyadenosine","1":"adenosine"},"ms1_notes":{"0":"keep","1":""},"ms2_notes":{"0":"bad match to ref","1":""},"identification_notes":{"0":"my id note","1":""}}""" # noqa: E501 + assert expected == gdhf.make_atlas_df(atlas_with_2_cids).to_json() def test_transfer_identification_data_to_atlas(metatlas_dataset, atlas): From 94a3cf362b3f335e310a1587fc122d62d5166fa3 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 19 May 2021 19:07:06 -0700 Subject: [PATCH 006/177] Remove conda from environment setup --- .github/workflows/ci.yaml | 4 +--- docker/Dockerfile | 15 ++++++++++----- docker/requirements.txt | 24 ++++++++++++++++++++++++ noxfile.py | 34 ++++++++++++++++------------------ 4 files changed, 51 insertions(+), 26 deletions(-) create mode 100644 docker/requirements.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index eb3bffe6..3282243c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,13 +10,11 @@ jobs: steps: - name: Checkout source uses: actions/checkout@v2 - - name: Cache conda environment for unit tests + - name: Cache nox environment for unit tests uses: actions/cache@v2 with: path: ~/work/metatlas/metatlas/.nox key: ${{ runner.os }}-nox - - name: Install miniconda - uses: conda-incubator/setup-miniconda@v2 - name: Setup nox uses: excitedleigh/setup-nox@v2.0.0 - name: Run unit tests diff --git a/docker/Dockerfile b/docker/Dockerfile index cd3ff381..6cd3e411 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM mambaorg/micromamba:0.13.0 +FROM python:3.8-slim-buster ARG BASE_DATA_URL=https://portal.nersc.gov/cfs/m2650/metatlas/test_data/ci01 ARG REFS_DIR=/global/project/projectdirs/metatlas/projects/spectral_libraries @@ -8,10 +8,12 @@ ENV METATLAS_LOCAL=True EXPOSE 8888 -COPY metatlas_env.yaml /metatlas_env.yaml +RUN apt-get update && apt-get install -y jq && \ + rm -rf /var/lib/apt/lists/* -RUN micromamba install -y -n base -f /metatlas_env.yaml && \ - micromamba clean --all --yes +COPY requirements.txt /requirements.txt + +RUN pip install --quiet -r requirements.txt RUN mkdir -p /io /src /work $REFS_DIR $H5_DIR @@ -26,4 +28,7 @@ ADD $BASE_DATA_URL/meta_atlas.sqlite3 /work/root_workspace.db WORKDIR /work -CMD ["/opt/conda/bin/jupyter", "nbclassic", "--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] +RUN apt-get update && apt-get install -y libxrender1 && \ + rm -rf /var/lib/apt/lists/* + +CMD ["/usr/local/bin/jupyter", "nbclassic", 
"--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] diff --git a/docker/requirements.txt b/docker/requirements.txt new file mode 100644 index 00000000..916725ca --- /dev/null +++ b/docker/requirements.txt @@ -0,0 +1,24 @@ +alembic==1.5.8 +banal==1.0.6 +colorama==0.4.4 +dill==0.3.3 +gspread==3.7.0 +h5py==3.2.1 +humanize==3.5.0 +ipympl==0.7.0 +ipywidgets==7.6.3 +jupyterlab==3.0.14 +matplotlib==3.4.1 +oauth2client==4.1.3 +pandas==1.2.4 +papermill==2.3.3 +pip==21.1.1 +pymysql==1.0.2 +pyyaml==5.4.1 +rdkit-pypi==2021.3.1.5 +scipy==1.6.3 +sqlalchemy==1.4.11 +tables==3.6.1 +tabulate==0.8.9 +xlsxwriter==1.4.0 +dataset==1.5.0 diff --git a/noxfile.py b/noxfile.py index f4a5c3fe..0e4d2415 100644 --- a/noxfile.py +++ b/noxfile.py @@ -40,6 +40,10 @@ 'toml==0.10.2', ] +pylint_deps = [ + 'pylint==2.8.2', + ] + nbqa_deps = [ 'nbqa==0.8.1', 'tokenize-rt==4.1.0', @@ -51,11 +55,11 @@ ] flake8_deps = [ - 'flake8', - 'flake8-bugbear', - 'flake8-builtins', - 'flake8-comprehensions', - ] + 'flake8==3.9.2', + 'flake8-bugbear==21.4.3', + 'flake8-builtins==1.5.3', + 'flake8-comprehensions==3.5.0', + ] nox.options.error_on_external_run = True @@ -97,19 +101,15 @@ def blacken(session): session.run('black', *more_checks) -@nox.session(venv_backend='conda', python=py_versions, reuse_venv=True) +@nox.session(python=py_versions, reuse_venv=True) def pylint(session): - session.run('conda', 'env', 'update', '--prefix', session.virtualenv.location, - '--file', 'docker/metatlas_env.yaml', silent=True) - session.install('--no-deps', *pytest_deps) + session.install('-r', 'docker/requirements.txt', *pylint_deps) session.run('pylint', *more_checks) -@nox.session(venv_backend='conda', python=py_versions, reuse_venv=True) +@nox.session(python=py_versions, reuse_venv=True) def pylint_nb(session): - session.run('conda', 'env', 'update', '--prefix', session.virtualenv.location, - '--file', 'docker/metatlas_env.yaml', silent=True) - session.install('--no-deps', *nbqa_deps, 'pylint') + session.install('-r', 'docker/requirements.txt', *nbqa_deps, *pylint_deps) session.run('nbqa', 'pylint', *notebooks) @@ -132,18 +132,16 @@ def blacken_nb(session): session.run('nbqa', 'black', '--nbqa-mutate', *notebooks) -@nox.session(venv_backend='conda', python=py_versions, reuse_venv=True) +@nox.session(python=py_versions, reuse_venv=True) def unit_tests(session): - session.run('conda', 'env', 'update', '--prefix', session.virtualenv.location, - '--file', 'docker/metatlas_env.yaml', silent=True) - session.install('--no-deps', *pytest_deps) + session.install('-r', 'docker/requirements.txt', *pytest_deps) session.run('pytest', '-vv', *session.posargs, '--cov', 'metatlas', 'tests/unit/', env={'METATLAS_LOCAL': 'TRUE'}) @nox.session(python=py_versions[0]) def system_tests(session): - session.install('--no-deps', *pytest_deps) + session.install(*pytest_deps) session.run('pytest', '-vv', *session.posargs, 'tests/system/') From 3a20b8d2b11ce273734bca2f784860dc831cb751 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 20 May 2021 16:28:05 -0700 Subject: [PATCH 007/177] WIP - Add Groups class Start moving metatlas_dataset to immutable tuples instead of lists. 
--- metatlas/datastructures/groups.py | 103 ++++++++++++++++++ metatlas/datastructures/metatlas_dataset.py | 10 +- metatlas/io/metatlas_get_data_helper_fun.py | 2 +- tests/fixtures/database.py | 2 +- tests/{ => unit}/conftest.py | 0 tests/unit/test_metatlas_dataset.py | 27 ++--- .../unit/test_metatlas_get_data_helper_fun.py | 4 +- 7 files changed, 119 insertions(+), 29 deletions(-) create mode 100644 metatlas/datastructures/groups.py rename tests/{ => unit}/conftest.py (100%) diff --git a/metatlas/datastructures/groups.py b/metatlas/datastructures/groups.py new file mode 100644 index 00000000..50da6a1e --- /dev/null +++ b/metatlas/datastructures/groups.py @@ -0,0 +1,103 @@ +""" Object Oriented Groups""" +import logging +import pandas as pd + +from metatlas.datastructures import metatlas_objects as metob +from metatlas.plots import dill2plots as dp + +logger = logging.getLogger(__name__) + + +class Groups: + """Groups of LCMS runs to define related experimental samples""" + def __init__(self, experiment, analysis_id, controlled_vocab, exclude_files): + self.experiment = experiment + self.analysis_id = analysis_id + self.controlled_vocab = controlled_vocab + self.exclude_files = exclude_files + + def query_lcmsruns(self, most_recent=True): + """Get LCMS runs from DB matching experiment""" + return dp.get_metatlas_files(experiment=self.experiment, name='%', most_recent=most_recent) + + def get_lcmsruns_df(self, most_recent=True): + """Returns a pandas DataFrame with lcmsruns matching self.experiment""" + files = dp.get_metatlas_files(experiment=self.experiment, name='%', most_recent=most_recent) + logger.info("Number of LCMS output files matching '%s' is: %d.", self.experiment, len(files)) + return metob.to_dataframe(files) + + lcmsruns_df = property(get_lcmsruns_df) + + def get_lcmsruns_short_names(self, fields=None): + """ + Queries DB for lcms filenames from self.experiment and returns + a pandas DataFrame containing identifiers for each file + inputs: + fields: optional dict with column names as key + and list of lcms filename metadata field positions as value + """ + if fields is None: + fields = {'full_filename': range(16), + 'sample_treatment': [12], + 'short_filename': [0, 2, 4, 5, 7, 9, 14], + 'short_samplename': [0, 2, 4, 5, 7, 9, 14], + } + out = pd.DataFrame(columns=fields.keys()) + for i, lcms_file in enumerate(self.query_lcmsruns()): + tokens = lcms_file.name.split('.')[0].split('_') + for name, idxs in fields.items(): + out.loc[i, name] = "_".join([tokens[n] for n in idxs]) + out.loc[i, 'last_modified'] = pd.to_datetime(lcms_file.last_modified, unit='s') + out.sort_values(by='last_modified', inplace=True) + out.drop(columns=['last_modified'], inplace=True) + out.drop_duplicates(subset=['full_filename'], keep='last', inplace=True) + out.set_index('full_filename', inplace=True) + return out + + lcmsruns_short_names = property(get_lcmsruns_short_names) + + def get_group_name(self, base_filename): + """Returns dict with keys group and short_name corresponding to base_filename""" + indices = [i for i, s in enumerate(self.controlled_vocab) if s.lower() in base_filename.lower()] + tokens = base_filename.split('_') + prefix = '_'.join(tokens[:11]) + suffix = self.controlled_vocab[indices[0]].lstrip('_') if indices else tokens[12] + group_name = f"{prefix}_{self.analysis_id}_{suffix}" + short_name = f"{tokens[9]}_{suffix}" # Prepending POL to short_name + return {'group': group_name, 'short_name': short_name} + + @property + def _get_files_dict(self): + """ + Queries DB for all lcmsruns
matching the class properties. + Returns a dict of dicts where keys are filenames minus extensions and values are + dicts with keys: name (filename with extension), group, and short_name + """ + file_dict = {} + for lcms_file in self.query_lcmsruns(): + if not any(map(lcms_file.name.__contains__, self.exclude_files)): + base_name = lcms_file.name.split('.')[0] + file_dict[base_name] = {'name': lcms_file.name, **self.get_group_name(base_name)} + return file_dict + + @property + def df(self): # pylint: disable=invalid-name + """Returns pandas DataFrame with one row per lcmsrun""" + out = pd.DataFrame(self._get_files_dict).T + out.index.name = 'filename' + out.reset_index(inplace=True) + out.drop(columns=['name'], inplace=True) + return out + + @property + def group_objects(self): + """Returns a list of Group objects""" + file_dict = self._get_files_dict + out = [] + for group_name, values in self.df.groupby('group').first().iterrows(): # one row per group + out.append(metob.Group(name=group_name, + short_name=values['short_name'], + items=[file_value['name'] + for file_value in file_dict.values() + if file_value['group'] == group_name])) + return out diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index d8b57592..9e963979 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -80,7 +80,7 @@ def _build(self): samples = pool.map(ma_data.get_data_for_atlas_df_and_file, files) else: # skip multiprocessing as this makes for easier debugging samples = [ma_data.get_data_for_atlas_df_and_file(i) for i in files] - self._data = [MetatlasSample(x) for x in samples] + self._data = tuple(MetatlasSample(x) for x in samples) logger.info( "MetatlasDataset with %d files built in %s.", len(files), @@ -210,10 +210,6 @@ def __getitem__(self, idx): """get sample at idx""" return self.data[idx] - def __setitem__(self, idx, value): - """assign value for sample at idx""" - self.data[idx] = value - def _set_and_invalidate_properties(self, attribute_name, new_value, property_names): """ inputs: @@ -434,10 +430,6 @@ def __getitem__(self, idx): """get sample at idx""" return self._data[idx] - def __setitem__(self, idx, value): - """assign value for sample at idx""" - self._data[idx] = value - def __len__(self): """len is from data""" return len(self._data) diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index 47bf8a87..a2e9a8dc 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -108,7 +108,7 @@ def get_data_for_atlas_df_and_file(input_tuple): else: result['data']['msms']['data'] = [] row.append(result) - return row + return tuple(row) def get_bpc(filename,dataset='ms1_pos',integration='bpc'): """ diff --git a/tests/fixtures/database.py b/tests/fixtures/database.py index d4c26d7e..0d7f1d20 100644 --- a/tests/fixtures/database.py +++ b/tests/fixtures/database.py @@ -2,8 +2,8 @@ import getpass import os -import pytest import sqlite3 +import pytest from metatlas.datastructures import metatlas_objects as metob diff --git a/tests/conftest.py b/tests/unit/conftest.py similarity index 100% rename from tests/conftest.py rename to tests/unit/conftest.py diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 729d54f8..3b7b286e 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -8,7 +8,7 @@ from metatlas.datastructures import metatlas_objects as metob -def
test_metatlas_dataset_build01(mocker, metatlas_dataset): +def test_metatlas_dataset_build01(metatlas_dataset): assert len(metatlas_dataset) == 1 assert len(metatlas_dataset[0]) == 1 assert metatlas_dataset[0][0]["identification"].compound[0].inchi_key == "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" @@ -92,11 +92,11 @@ def test_polarity(metatlas_dataset): def test_extra_time_setter(metatlas_dataset, hits, mocker): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) - metatlas_dataset.hits + metatlas_dataset.hits # pylint: disable=pointless-statement assert metatlas_dataset._hits_valid metatlas_dataset.extra_time = 0.3 assert not metatlas_dataset._hits_valid - metatlas_dataset.hits + metatlas_dataset.hits # pylint: disable=pointless-statement assert metatlas_dataset._hits_valid @@ -253,13 +253,8 @@ def test_export_atlas_to_csv01(metatlas_dataset, tmp_path): assert in_df.loc[0, "inchi_key"] == "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" -def test_setitem01(metatlas_dataset): - metatlas_dataset[0] = None - assert metatlas_dataset[0] is None - - def test_atlas_setter01(metatlas_dataset, atlas_with_2_cids): - metatlas_dataset.data + metatlas_dataset.data # pylint: disable=pointless-statement metatlas_dataset.atlas = atlas_with_2_cids assert not metatlas_dataset._data_valid assert len(metatlas_dataset[0]) == 2 @@ -275,7 +270,7 @@ def test_groups01(metatlas_dataset): def test_set_groups01(metatlas_dataset): - metatlas_dataset.data + metatlas_dataset.data # pylint: disable=pointless-statement metatlas_dataset.groups = None assert not metatlas_dataset._data_valid assert metatlas_dataset.groups is None @@ -283,8 +278,8 @@ def test_set_groups01(metatlas_dataset): def test_set_extra_mz_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) - metatlas_dataset.data - metatlas_dataset.hits + metatlas_dataset.data # pylint: disable=pointless-statement + metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.extra_mz = 0.43 assert not metatlas_dataset._data_valid assert not metatlas_dataset._hits_valid @@ -293,7 +288,7 @@ def test_set_extra_mz_setter(metatlas_dataset, mocker, hits): def test_set_keep_nonmatches_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) - metatlas_dataset.hits + metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.keep_nonmatches = False assert not metatlas_dataset._hits_valid assert not metatlas_dataset.keep_nonmatches @@ -301,7 +296,7 @@ def test_set_keep_nonmatches_setter(metatlas_dataset, mocker, hits): def test_set_frag_mz_tolerance_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) - metatlas_dataset.hits + metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.frag_mz_tolerance = 1e-4 assert not metatlas_dataset._hits_valid assert metatlas_dataset.frag_mz_tolerance == 1e-4 @@ -309,7 +304,7 @@ def test_set_frag_mz_tolerance_setter(metatlas_dataset, mocker, hits): def test_set_msms_refs_loc_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) - metatlas_dataset.hits + metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.msms_refs_loc = "/tmp/some_file.tab" assert not metatlas_dataset._hits_valid assert metatlas_dataset.msms_refs_loc == "/tmp/some_file.tab" @@ -320,7 +315,7 @@ def test_set_data01(metatlas_dataset): assert 
metatlas_dataset[0][0]["identification"].ms2_notes == "extact match" -def test_store_atlas01(metatlas_dataset, sqlite): +def test_store_atlas01(metatlas_dataset, sqlite): # pylint: disable=unused-argument metatlas_dataset.atlas.name = "test_store_atlas01" metatlas_dataset.store_atlas() atlas_list = metob.retrieve("atlases", name=metatlas_dataset.atlas.name, username="*") diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index 27a9c1f2..22178377 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -41,8 +41,8 @@ def test_set_nested_term_attr_tuple(metatlas_dataset): def test_set_nested_term_list(metatlas_dataset): - gdhf.set_nested(metatlas_dataset, [0, 0], "foobar") - assert metatlas_dataset[0][0] == "foobar" + gdhf.set_nested(metatlas_dataset, [0, 0, "identification", "mz_references"], None) + assert metatlas_dataset[0][0]["identification"].mz_references is None def test_set_nested_term_dict(metatlas_dataset): From 205adb0ca3a7253be96398a4009ae7c5e0e7ae79 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 2 Jun 2021 17:28:01 -0700 Subject: [PATCH 008/177] WIP - group moved to MetatlasDataset move output generation into package add notebook tests to nox check for existing files before writing --- .pre-commit-config.yaml | 2 +- docker/metatlas_env.yaml | 15 +- docker/requirements.txt | 6 +- metatlas/datastructures/groups.py | 103 ---- metatlas/datastructures/metatlas_dataset.py | 351 ++++++++++- metatlas/io/metatlas_get_data_helper_fun.py | 66 ++- metatlas/io/targeted_output.py | 275 +++++++++ metatlas/io/write_utils.py | 80 +++ metatlas/plots/chromplotplus.py | 15 +- metatlas/plots/dill2plots.py | 549 +++++++----------- metatlas/tools/fastanalysis.py | 100 ++-- metatlas/tools/notebook.py | 81 +++ .../kernels/metatlas-targeted.kernel.json | 11 + notebooks/reference/Targeted.ipynb | 529 ++--------------- ...book_Metatlas_Stable_v0.1.0_20210303.ipynb | 10 +- noxfile.py | 170 +++--- .../fixtures/analysis_identifiers_fixtures.py | 18 + tests/fixtures/metatlas_dataset_fixtures.py | 21 +- tests/system/test_targeted.py | 32 +- tests/unit/test_metatlas_dataset.py | 202 ++++++- .../unit/test_metatlas_get_data_helper_fun.py | 58 ++ tests/unit/test_targeted_output.py | 9 + tests/unit/test_write_utils.py | 86 +++ 23 files changed, 1633 insertions(+), 1156 deletions(-) delete mode 100644 metatlas/datastructures/groups.py create mode 100644 metatlas/io/targeted_output.py create mode 100644 metatlas/io/write_utils.py create mode 100644 metatlas/tools/notebook.py create mode 100644 notebooks/kernels/metatlas-targeted.kernel.json create mode 100644 tests/fixtures/analysis_identifiers_fixtures.py create mode 100644 tests/unit/test_targeted_output.py create mode 100644 tests/unit/test_write_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1728c754..200de1cf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.0 + rev: v4.0.1 hooks: - id: check-added-large-files - id: check-ast diff --git a/docker/metatlas_env.yaml b/docker/metatlas_env.yaml index a934e16b..ab42e967 100644 --- a/docker/metatlas_env.yaml +++ b/docker/metatlas_env.yaml @@ -3,9 +3,10 @@ name: metatlas channels: - conda-forge dependencies: - - alembic=1.5.8 + - alembic=1.6.2 - banal=1.0.6 - colorama=0.4.4 + - dataset=1.5.0 - dill=0.3.3 - gspread=3.7.0 - hdf5=1.10.6 @@ -13,8 +14,8 @@ 
dependencies: - ipympl=0.7.0 - ipywidgets=7.6.3 - jq=1.6 - - jupyterlab=3.0.14 - - matplotlib=3.4.1 + - jupyterlab=3.0.16 + - matplotlib=3.4.2 - oauth2client=4.1.3 - pandas=1.2.4 - papermill=2.3.3 @@ -22,10 +23,8 @@ dependencies: - pymysql=1.0.2 - pytables=3.6.1 - pyyaml=5.4.1 - - rdkit=2021.03.1 + - rdkit=2021.03.2 - scipy=1.6.3 - - sqlalchemy=1.4.11 + - sqlalchemy=1.4.15 - tabulate=0.8.9 - - xlsxwriter=1.4.0 - - pip: - - dataset==1.5.0 + - xlsxwriter=1.4.3 diff --git a/docker/requirements.txt b/docker/requirements.txt index 916725ca..0922070a 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,4 +1,4 @@ -alembic==1.5.8 +alembic==1.6.2 banal==1.0.6 colorama==0.4.4 dill==0.3.3 @@ -7,8 +7,8 @@ h5py==3.2.1 humanize==3.5.0 ipympl==0.7.0 ipywidgets==7.6.3 -jupyterlab==3.0.14 -matplotlib==3.4.1 +jupyterlab==3.0.16 +matplotlib==3.4.2 oauth2client==4.1.3 pandas==1.2.4 papermill==2.3.3 diff --git a/metatlas/datastructures/groups.py b/metatlas/datastructures/groups.py deleted file mode 100644 index 50da6a1e..00000000 --- a/metatlas/datastructures/groups.py +++ /dev/null @@ -1,103 +0,0 @@ -""" Object Oriented Groups""" -import logging -import pandas as pd - -from metatlas.datastructures import metatlas_objects as metob -from metatlas.plots import dill2plots as dp - -logger = logging.getLogger(__name__) - - -class Groups: - """Groups of LCMS run to define related experimental samples""" - def __init__(self, experiment, analysis_id, controlled_vocab, exclude_files): - self.experiment = experiment - self.analysis_id = analysis_id - self.controlled_vocab = controlled_vocab - self.exclude_files = exclude_files - - def query_lcmsruns(self, most_recent=True): - """Get LCMS runs from DB matching experiment""" - return dp.get_metatlas_files(experiment=self.experiment, name='%', most_recent=most_recent) - - def get_lcmsruns_df(self, most_recent=True): - """Returns a pandas DataFrame with lcmsrun matching self.experiment""" - files = dp.get_metatlas_files(experiment=self.experiment, name='%', most_recent=most_recent) - logger.info("Number of LCMS output files matching '%s' is: %d.", self.experiment, len(files)) - return metob.to_dataframe(files) - - lcmsruns_df = property(get_lcmsruns_df) - - def get_lcmsruns_short_names(self, fields=None): - """ - Querys DB for lcms filenames from self.experiment and returns - a pandas DataFrame containing identifiers for each file - inputs: - fields: optional dict with column names as key - and list of lcms filename metadata fields positions as value - """ - if fields is None: - fields = {'full_filename': range(16), - 'sample_treatment': [12], - 'short_filename': [0, 2, 4, 5, 7, 9, 14], - 'short_samplename': [0, 2, 4, 5, 7, 9, 14], - } - out = pd.DataFrame(columns=fields.keys()) - for i, lcms_file in enumerate(self.query_lcmsruns()): - tokens = lcms_file.name.split('.')[0].split('_') - for name, idxs in fields.items(): - out.loc[i, name] = "_".join([tokens[n] for n in idxs]) - out.loc[i, 'last_modified'] = pd.to_datetime(lcms_file.last_modified, unit='s') - out.sort_values(by='last_modified', inplace=True) - out.drop(columns=['last_modified'], inplace=True) - out.drop_duplicates(subset=['full_filename'], keep='last', inplace=True) - out.set_index('full_filename', inplace=True) - return out - - lcmsruns_short_names = property(get_lcmsruns_short_names) - - def get_group_name(self, base_filename): - """Returns dict with keys group and short_name corresponding to base_filename""" - indices = [i for i, s in enumerate(self.controlled_vocab) if s.lower() in 
base_filename.lower()] - tokens = base_filename.split('_') - prefix = '_'.join(tokens[:11]) - suffix = self.controlled_vocab[indices[0]].lstrip('_') if indices else tokens[12] - group_name = f"{prefix}_{self.analysis_id}_{suffix}" - short_name = f"{tokens[9]}_{suffix}" # Prepending POL to short_name - return {'group': group_name, 'short_name': short_name} - - @property - def _get_files_dict(self): - """ - Queries DB for all lcmsruns matching the class properties. - Returns a dict of dicts where keys are filenames minus extensions and values are - dicts with keys: name (filename with extension), group, and short_name - """ - file_dict = {} - for lcms_file in self.query_lcmsruns(): - if not any(map(lcms_file.name.__contains__, self.exclude_files)): - base_name = lcms_file.name.split('.')[0] - file_dict[base_name] = {'name': lcms_file.name, **self.get_group_name(base_name)} - return file_dict - - @property - def df(self): # pylint: disable=invalid-name - """Returns pandas Dataframe with one group per row""" - out = pd.DataFrame(self._get_files_dict).T - out.index.name = 'filename' - out.reset_index(inplace=True) - out.drop(columns=['name'], inplace=True) - return out - - @property - def group_objects(self): - """Returns a list of Group objects""" - file_dict = self._get_files_dict - out = [] - for group_name, values in self.df.to_dict('index'): - out.append(metob.Group(name=group_name, - short_name=values['short_name'], - items=[file_value['name'] - for file_value in file_dict.values() - if file_value['group'] == group_name])) - return out diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 9e963979..947ef352 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -1,20 +1,118 @@ """ object oriented interface to metatlas_dataset """ import datetime import getpass +import glob import logging import multiprocessing +import numbers +import os +import shutil import humanize import pandas as pd from metatlas.datastructures import metatlas_objects as metob from metatlas.io import metatlas_get_data_helper_fun as ma_data +from metatlas.io import targeted_output +from metatlas.io import write_utils from metatlas.plots import dill2plots as dp MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" logger = logging.getLogger(__name__) +class AnalysisIdentifiers: + """Names used in generating an analysis""" + + # pylint: disable=too-many-arguments + def __init__( + self, experiment, output_type, polarity, analysis_number, project_directory, atlas=None, username=None + ): + self._experiment = experiment + self._output_type = output_type + self._polarity = polarity + self._analysis_number = analysis_number + self._atlas = atlas + self._username = getpass.getuser() if username is None else username + self.project_directory = project_directory + self.validate() + logger.info( + "IDs: atlas=%s, short_experiment_analysis=%s, output_dir=%s", + self.atlas, + self.short_experiment_analysis, + self.output_dir, + ) + + def validate(self): + """Validate class inputs""" + if len(self.experiment.split("_")) != 9: + raise ValueError('Parameter experiment does not contain 9 fields when split on "_".') + if self.output_type not in ["ISTDsEtc", "FinalEMA-HILIC"]: + raise ValueError('Parameter output_type is not one of "ISTDsEtc" or "FinalEMA-HILIC".') + if self.polarity not in ["positive", "negative"]: + raise ValueError('Parameter polarity is not one of "positive" or 
"negative".') + if not isinstance(self.analysis_number, numbers.Integral): + raise TypeError("Parameter analysis_number is not an integer.") + if self.analysis_number < 0: + raise ValueError("Parameter analysis_number cannot be negative.") + + @property + def experiment(self): + """Returns experiment identifier""" + return self._experiment + + @property + def output_type(self): + """Returns output type identifier""" + return self._output_type + + @property + def polarity(self): + """Returns polarity identifier""" + return self._polarity + + @property + def analysis_number(self): + """Returns analysis number""" + return self._analysis_number + + @property + def atlas(self): + """Atlas identifier (name)""" + if self._atlas is None: + exp_tokens = self.experiment.split("_") + return f"{'_'.join(exp_tokens[3:6])}_{self.short_polarity}_{self.analysis_number}" + return self._atlas + + @property + def username(self): + """Returns username identifier""" + return self._username + + @property + def analysis(self): + """Analysis identifier""" + return f"{self.username}{self.analysis_number}" + + @property + def short_experiment_analysis(self): + """Short experiment analysis identifier""" + exp_tokens = self.experiment.split("_") + return f"{exp_tokens[0]}_{exp_tokens[3]}_{self.analysis}" + + @property + def short_polarity(self): + """Short polarity identifier: first 3 letters, upper case""" + return self.polarity[:3].upper() + + @property + def output_dir(self): + """Creates the output directory and returns the path as a string""" + out = os.path.join(self.project_directory, self.experiment, self.analysis, self.output_type) + os.makedirs(out, exist_ok=True) + return out + + class MetatlasDataset: """ Like the non-object oriented metatlas_dataset, you can index into this class by file_idx and compound_idx: @@ -35,8 +133,10 @@ class MetatlasDataset: # pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-public-methods def __init__( self, - atlas, - groups, + ids, + atlas=None, + groups_controlled_vocab=None, + exclude_files=None, extra_time=0.75, extra_mz=0, keep_nonmatches=True, @@ -44,26 +144,85 @@ def __init__( msms_refs_loc=MSMS_REFS_PATH, max_cpus=1, ): - self._atlas = atlas + """ + inputs: + ids: AnalysisIdentifiers instance defining the analysis + groups_controlled_vocab: array of strings that will group together when creating groups + application of groups_controlled_vocab is case insensitive + exclude_files: array of strings that will exclude files if they are substrings of the filename + """ + self.ids = ids + self._atlas = self._get_atlas() if atlas is None else atlas self._atlas_df = None self._atlas_df_valid = False + self._runs = None + self._runs_valid = False self._data = None self._data_valid = False self._hits = None self._hits_valid = False - self._groups = groups + self._groups_controlled_vocab = [] if groups_controlled_vocab is None else groups_controlled_vocab + self._exclude_files = [] if exclude_files is None else exclude_files self._extra_time = extra_time self._extra_mz = extra_mz self._keep_nonmatches = keep_nonmatches self._frag_mz_tolerance = frag_mz_tolerance self._msms_refs_loc = msms_refs_loc self.max_cpus = max_cpus + self.write_data_source_files() + self.write_lcmsruns_short_names() + + def write_data_source_files(self): + """Write the data source files if they don't already exist""" + data_sources_dir = os.path.join(self.ids.output_dir, f"{self.ids.short_polarity}_data_sources") + if len(glob.glob(os.path.join(data_sources_dir, "*"))) >= 4: + 
logger.warning( + ( + "Data sources directory already populated from previous work on this analysis. " + "Not overwriting." + ) + ) + else: + shutil.rmtree(data_sources_dir, ignore_errors=True) + logger.info("Writing data source files to %s.", data_sources_dir) + ma_data.make_data_sources_tables( + self.groups, self.atlas, self.ids.output_dir, self.ids.short_polarity + ) + + def write_lcmsruns_short_names(self): + """Write short names and raise error if exists and differs from current data""" + write_utils.export_dataframe_die_on_diff( + self.lcmsruns_short_names, + os.path.join(self.ids.output_dir, "short_names.csv"), + "LCMS runs short names", + # index=True, + ) + + def _get_atlas(self): + """Load atlas from database""" + name_query = f"%_{self.ids.short_polarity}_{self.ids.short_experiment_analysis}" + atlases = metob.retrieve("atlases", name=name_query, username=self.ids.username) + if len(atlases) == 0: + logger.error( + 'Database does not contain an atlas named "%s" and owned by %s.', + name_query, + self.ids.username, + ) + raise ValueError("Atlas not found in database") + if len(atlases) > 1: + logger.error( + 'Database contains more than one atlas named "%s" and owned by %s.', + name_query, + self.ids.username, + ) + raise ValueError("Too many matching atlases found in database") + return atlases[0] def _build(self): """Populate self._data from database and h5 files.""" start_time = datetime.datetime.now() files = [] - for group in self._groups: + for group in self.groups: for h5_file in group.items: files.append( ( @@ -215,7 +374,7 @@ def _set_and_invalidate_properties(self, attribute_name, new_value, property_nam inputs: attribute_name: name of the class attribute being modified new_value: value to assign to attribute - propert_names: list of names of the class propertys that are dependent on the attribute's value + property_names: list of names of the class properties that are dependent on the attribute's value side effects: If the property is valid and new_value is different from previous value, then invalidate. And set attribute to new_value @@ -263,16 +422,6 @@ def atlas(self, atlas): raise TypeError("Cannot set atlas to contain a non-Atlas object") self._set_and_invalidate_properties("atlas", atlas, ["atlas_df", "data"]) - @property - def groups(self): - """groups getter""" - return self._groups - - @groups.setter - def groups(self, groups): - """groups setter, invalidate data""" - self._set_and_invalidate_properties("groups", groups, ["data"]) - @property def polarity(self): """ @@ -373,11 +522,10 @@ def set_data(self, ids, value): @property def rts(self): """ Allow Rt_Reference objects to be accessed - Because this returns a list, the return value is mutable add will modify - data internal to this class, but atlas_df and data would get out of sync with atlas. - So don't modify the values returned by this property! + Returns cloned RtReference objects, so modifying them will not impact data in this class. + Use set_rt() if you want to modify the RT values held by this class. """
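The read/write split described in the rts docstring above is easy to get wrong, so here is a minimal usage sketch. It is not part of the patch: it assumes an already-built MetatlasDataset instance named metatlas_dataset and that "rt_max" is an accepted value for set_rt()'s which parameter.

    # rts returns clones, so edits to the returned objects are discarded
    rt_ref = metatlas_dataset.rts[0]
    rt_ref.rt_max += 0.5  # does NOT touch atlas, atlas_df, or data
    # set_rt() is the supported write path; it keeps all three in sync
    metatlas_dataset.set_rt(0, "rt_max", rt_ref.rt_max)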
""" - return [cid.rt_references[0] for cid in self.atlas.compound_identifications] + return tuple(cid.rt_references[0].clone() for cid in self.atlas.compound_identifications) def set_rt(self, compound_idx, which, time): """ @@ -417,6 +565,169 @@ def compound_indices_marked_remove(self): ids = ["identification", "ms1_notes"] return [i for i, j in enumerate(self.data[0]) if _is_remove(ma_data.extract(j, ids))] + @property + def lcmsruns(self): + """Get LCMS runs from DB matching experiment""" + if self._runs_valid: + return self._runs + self._runs = dp.get_metatlas_files(experiment=self.ids.experiment, name="%") + self._runs_valid = True + logger.info("Number of LCMS output files matching '%s' is: %d.", self.ids.experiment, len(self._runs)) + return self._runs + + @property + def existing_groups(self): + """Get your own groups that are prefixed by self.experiment""" + return metob.retrieve("Groups", name=f"{self.ids.experiment}%", username=self.ids.username) + + @property + def lcmsruns_dataframe(self): + """Returns a pandas DataFrame with lcmsrun matching self.experiment""" + return metob.to_dataframe(self.lcmsruns) + + def get_lcmsruns_short_names(self, fields=None): + """ + Querys DB for lcms filenames from self.experiment and returns + a pandas DataFrame containing identifiers for each file + inputs: + fields: optional dict with column names as key + and list of lcms filename metadata fields positions as value + """ + if fields is None: + fields = { + "full_filename": range(16), + "sample_treatment": [12], + "short_filename": [0, 2, 4, 5, 7, 9, 14], + "short_samplename": [9, 12, 13, 14], + } + out = pd.DataFrame(columns=fields.keys()) + for i, lcms_file in enumerate(self.lcmsruns): + tokens = lcms_file.name.split(".")[0].split("_") + for name, idxs in fields.items(): + out.loc[i, name] = "_".join([tokens[n] for n in idxs]) + out.loc[i, "last_modified"] = pd.to_datetime(lcms_file.last_modified, unit="s") + out.sort_values(by="last_modified", inplace=True) + out.drop(columns=["last_modified"], inplace=True) + out.drop_duplicates(subset=["full_filename"], keep="last", inplace=True) + out.set_index("full_filename", inplace=True) + return out + + lcmsruns_short_names = property(get_lcmsruns_short_names) + + def group_name(self, base_filename): + """Returns dict with keys group and short_name corresponding to base_filename""" + indices = [ + i for i, s in enumerate(self._groups_controlled_vocab) if s.lower() in base_filename.lower() + ] + tokens = base_filename.split("_") + prefix = "_".join(tokens[:11]) + suffix = self._groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12] + group_name = f"{prefix}_{self.ids.analysis}_{suffix}" + short_name = f"{tokens[9]}_{suffix}" # Prepending POL to short_name + return {"group": group_name, "short_name": short_name} + + @property + def _files_dict(self): + """ + Queries DB for all lcmsruns matching the class properties. 
+ + @property + def _files_dict(self): + """ + Queries DB for all lcmsruns matching the class properties. + Returns a dict of dicts where keys are filenames minus extensions and values are + dicts with keys: object, group, and short_name + """ + file_dict = {} + for lcms_file in self.lcmsruns: + if not any(map(lcms_file.name.__contains__, self._exclude_files)): + base_name = lcms_file.name.split(".")[0] + file_dict[base_name] = {"object": lcms_file, **self.group_name(base_name)} + return file_dict + + @property + def groups_dataframe(self): + """Returns pandas DataFrame with one group per row""" + out = pd.DataFrame(self._files_dict).T + out.drop(columns=["object"], inplace=True) + out.index.name = "filename" + return out.reset_index() + + @property + def groups(self): + """Returns a list of Group objects""" + file_dict = self._files_dict + out = [] + for values in self.groups_dataframe.to_dict("index").values(): + out.append( + metob.Group( + name=values["group"], + short_name=values["short_name"], + items=[ + file_value["object"] + for file_value in file_dict.values() + if file_value["group"] == values["group"] + ], + ) + ) + return out + + def store_groups(self, exist_ok=False): + """ + Save self.groups to DB + inputs: + exist_ok: if False, store nothing and raise ValueError if any of the group names + have already been saved to the DB by you. + """ + if not exist_ok: + db_names = {group.name for group in self.existing_groups} + new_names = set(self.groups_dataframe["group"].to_list()) + overlap = db_names.intersection(new_names) + if overlap: + logger.error( + "Not saving groups as you have already saved groups with these names: %s.", + ", ".join(overlap), + ) + raise ValueError("Existing group has same name.") + metob.store(self.groups) + + def compound_idxs_not_evaluated(self): + """NOT YET IMPLEMENTED""" + for compound_idx, _ in enumerate(self.data[0]): + print(compound_idx) + return [] + + def annotation_gui(self, compound_idx=0, width=15, height=3, alpha=0.5, colors=""): + """ + Opens the interactive GUI for setting RT bounds and annotating peaks + inputs: + compound_idx: index of compound-adduct pair to start at + width: width of interface in inches + height: height of each plot in inches + alpha: (0-1] controls transparency of lines on EIC plot + colors: list (color_id, search_string) for coloring lines on EIC plot + based on search_string occurring in LCMS run filename + """ + return dp.adjust_rt_for_selected_compound( + self, + msms_hits=self.hits, + color_me=colors, + compound_idx=compound_idx, + alpha=alpha, + width=width, + height=height, + ) + + def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): + """ + Generates the default set of outputs for a targeted experiment + inputs: + msms_fragment_ions: if True, generate msms fragment ions report + overwrite: if False, throw error if any output files already exist + """ + targeted_output.write_atlas_to_spreadsheet(self, overwrite) + targeted_output.write_stats_table(self, overwrite) + targeted_output.write_chromatograms(self, overwrite) + targeted_output.write_identification_figure(self, overwrite) + targeted_output.write_metrics_and_boxplots(self, overwrite) + if msms_fragment_ions: + targeted_output.write_msms_fragment_ions(self, overwrite) + class MetatlasSample: """ diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index a2e9a8dc..2009a62d 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -7,6 +7,7 @@ import copy import tables from metatlas.datastructures import metatlas_objects as metob
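As a reading aid for the recursive extract() rewrite a few lines below: the ids argument is a path that mixes list indices, dict keys, and attribute names, and plain strings fall back to getattr() when indexing raises TypeError. A hypothetical call, assuming metatlas_dataset is an already-built dataset:

    from metatlas.io import metatlas_get_data_helper_fun as ma_data
    # walks metatlas_dataset[0][0]["identification"].ms1_notes,
    # returning "" instead of raising if any step along the path is missing
    note = ma_data.extract(metatlas_dataset, [0, 0, "identification", "ms1_notes"], default="")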
+from metatlas.io import write_utils import pandas as pd from textwrap import wrap import matplotlib.pyplot as plt @@ -205,20 +206,20 @@ def extract(data, ids, default=None): as: ('attribute_name',). If you want to make it more explict to the reader, you can add a second member to the tuple, which will not be used, such as ('attribute_name', 'as attribute') """ - current = data + if len(ids) == 0: + return data try: - for i in ids: - if isinstance(i, tuple): - current = getattr(current, i[0]) - elif isinstance(current, list): - current = current[i] - elif isinstance(current, dict) and i in current.keys(): - current = current[i] - else: - current = getattr(current, i) + if isinstance(ids[0], tuple): + sub_data = getattr(data, ids[0][0]) + else: + try: + sub_data = data[ids[0]] + except TypeError: + sub_data = getattr(data, ids[0]) except (AttributeError, IndexError, KeyError): return default - return current + else: + return extract(sub_data, ids[1:], default) def set_nested(data, ids, value): @@ -874,33 +875,36 @@ def get_compound_names(data,use_labels=False): return (compound_names, compound_objects) -def make_data_sources_tables(groups, myatlas, output_loc, polarity=None): + +def make_data_sources_tables(groups, myatlas, output_loc, polarity=None, overwrite=True): """ polarity must be one of None, 'POS', 'NEG' or will throw ValueError """ - if polarity and not polarity in ['POS', 'NEG']: - raise ValueError - prefix = polarity + '_' if polarity else '' - if not os.path.exists(output_loc): - os.mkdir(output_loc) - output_dir = os.path.join(output_loc,'data_sources') - if not os.path.exists(output_dir): - os.mkdir(output_dir) - - metob.to_dataframe([myatlas]).to_csv(os.path.join(output_dir, prefix+'atlas_metadata.tab'), sep='\t') - metob.to_dataframe(groups).to_csv(os.path.join(output_dir, prefix+'groups_metadata.tab'), sep='\t') + if polarity not in [None, 'POS', 'NEG']: + raise ValueError("Polarity parameter must be one of None, 'POS', or 'NEG'.") + prefix = f"{polarity}_" if polarity else "" + output_dir = os.path.join(output_loc, f"{prefix}data_sources") + atlas_path = os.path.join(output_dir, f"{prefix}atlas_metadata.tab") + write_utils.export_dataframe(metob.to_dataframe([myatlas]), atlas_path, "atlas metadata", + overwrite, sep='\t') + groups_path = os.path.join(output_dir, f"{prefix}groups_metadata.tab") + write_utils.export_dataframe(metob.to_dataframe(groups), groups_path, "groups metadata", + overwrite, sep='\t') atlas_df = make_atlas_df(myatlas) atlas_df['label'] = [cid.name for cid in myatlas.compound_identifications] - atlas_df.to_csv(os.path.join(output_dir,prefix+myatlas.name+'_originalatlas.tab'), sep='\t') + atlas_df_path = os.path.join(output_dir, myatlas.name+'_originalatlas.tab') + write_utils.export_dataframe(atlas_df, atlas_df_path, "atlas dataframe", overwrite, sep='\t') - group_path_df = pd.DataFrame(columns=['group_name','group_path','file_name']) + group_path_df = pd.DataFrame(columns=['group_name', 'group_path', 'file_name']) loc_counter = 0 - for g in groups: - for f in g.items: - group_path_df.loc[loc_counter, 'group_name'] = g.name - group_path_df.loc[loc_counter, 'group_path'] = os.path.dirname(f.mzml_file) - group_path_df.loc[loc_counter, 'file_name'] = f.mzml_file + for group in groups: + for run in group.items: + group_path_df.loc[loc_counter, 'group_name'] = group.name + group_path_df.loc[loc_counter, 'group_path'] = os.path.dirname(run.mzml_file) + group_path_df.loc[loc_counter, 'file_name'] = run.mzml_file loc_counter += 1 - 
group_path_df.to_csv(os.path.join(output_dir,prefix+'groups.tab'), sep='\t', index=False) + group_path_path = os.path.join(output_dir, f"{prefix}groups.tab") + write_utils.export_dataframe(group_path_df, group_path_path, "group-file mapping", + overwrite, sep='\t', index=False) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py new file mode 100644 index 00000000..322cc3ad --- /dev/null +++ b/metatlas/io/targeted_output.py @@ -0,0 +1,275 @@ +"""Generate standardized outputs for targeted analysis""" +# pylint: disable=too-many-arguments + +import logging +import os + +from collections import namedtuple + +import numpy as np +import pandas as pd + +from metatlas.io import write_utils +from metatlas.plots import dill2plots as dp +from metatlas.tools import fastanalysis as fa + +logger = logging.getLogger(__name__) + + +def write_atlas_to_spreadsheet(metatlas_dataset, overwrite=False): + """Save atlas as csv file. Will not overwrite existing file unless overwrite is True""" + export_atlas_filename = os.path.join( + metatlas_dataset.ids.output_dir, + f"{metatlas_dataset.ids.short_polarity}_{metatlas_dataset.atlas.name}_export", + ) + write_utils.check_existing_file(export_atlas_filename, overwrite) + dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas, export_atlas_filename) + logger.info("Exported atlas to file: %s.", export_atlas_filename) + + +def write_stats_table( + metatlas_dataset, + min_intensity=1e4, + rt_tolerance=0.5, + mz_tolerance=20, + min_msms_score=0.6, + min_num_frag_matches=1, + min_relative_frag_intensity=0.001, + allow_no_msms=True, + overwrite=False, +): + """ + inputs: + metatlas_dataset: a MetatlasDataset instance + min_intensity: intensity threshold; 1e5 is strict, 1e3 is loose + rt_tolerance: RT tolerance threshold + shift of median RT across all files for given compound to reference + mz_tolerance: MZ tolerance threshold + ppm of median mz across all files for given compound relative to reference + 5 is strict, 25 is loose + min_msms_score: score threshold + max dot-product score across all files for given compound relative to reference + Score values in [0-1]; 0.6 is strict, 0.3 is loose + min_num_frag_matches: threshold of number of frag matches between sample and reference + min_relative_frag_intensity: threshold ratio of second highest to first highest intensity + of matching sample mzs + allow_no_msms: if True evaluate only on MS1 thresholds if no MSMS data is found, + if False filter out row if MSMS thresholds are not passing + overwrite: if True, will write over existing files + """ + scores_df = fa.make_scores_df(metatlas_dataset, metatlas_dataset.hits) + scores_df["passing"] = fa.test_scores_df( + scores_df, + min_intensity, + rt_tolerance, + mz_tolerance, + min_msms_score, + allow_no_msms, + min_num_frag_matches, + min_relative_frag_intensity, + ) + prefix = f"{metatlas_dataset.ids.short_polarity}_" + scores_path = os.path.join( + metatlas_dataset.ids.output_dir, f"{prefix}stats_tables", f"{prefix}compound_scores.csv" + ) + write_utils.export_dataframe(scores_df, scores_path, "scores", overwrite) + fa.make_stats_table( + input_dataset=metatlas_dataset, + msms_hits=metatlas_dataset.hits, + output_loc=metatlas_dataset.ids.output_dir, + output_sheetname="Draft_Final_Identifications.xlsx", + min_peak_height=1e5, + use_labels=True, + min_msms_score=0.01, + min_num_frag_matches=1, + include_lcmsruns=[], + exclude_lcmsruns=["QC"], + polarity=metatlas_dataset.ids.short_polarity, + overwrite=overwrite, + )
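Stepping back from the individual writers: together with MetatlasDataset.generate_all_outputs() added earlier in this patch, the intended notebook flow is roughly the sketch below. Everything here is illustrative, not from the patch; the experiment string, project directory, and analysis number are made-up values that merely satisfy AnalysisIdentifiers.validate().

    from metatlas.datastructures import metatlas_dataset as mads
    ids = mads.AnalysisIdentifiers(
        "20210510_AA_BB_506588_CAR_QE139_HILICZ_USDAY12345_run1",  # 9 "_"-fields
        "FinalEMA-HILIC", "positive", 0, "/tmp/projects")
    data = mads.MetatlasDataset(ids, max_cpus=4)  # loads atlas, groups, h5 data
    data.store_groups(exist_ok=True)
    data.generate_all_outputs(msms_fragment_ions=True, overwrite=False)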
+ + +def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwrite=False): + """ + inputs: + metatlas_dataset: a MetatlasDataset instance + group_by: 'index', 'page', or None for grouping of plots + share_y: use a common y-axis scaling + overwrite: if False raise error if file already exists + """ + # logging and overwrite checks done within dp.make_chromatograms + dp.make_chromatograms( + input_dataset=metatlas_dataset, + include_lcmsruns=[], + exclude_lcmsruns=["InjBl", "QC", "Blank", "blank"], + group=group_by, + share_y=share_y, + save=True, + output_loc=metatlas_dataset.ids.output_dir, + short_names_df=metatlas_dataset.lcmsruns_short_names, + short_names_header="short_samplename", + polarity=metatlas_dataset.ids.short_polarity, + overwrite=overwrite, + ) + + +def write_identification_figure(metatlas_dataset, overwrite=False): + """Save identification figure. Will not overwrite existing file unless overwrite is True""" + # logging and overwrite checks done within dp.make_identification_figure_v2 + dp.make_identification_figure_v2( + input_dataset=metatlas_dataset, + msms_hits=metatlas_dataset.hits, + use_labels=True, + include_lcmsruns=[], + exclude_lcmsruns=["InjBl", "QC", "Blank", "blank"], + output_loc=metatlas_dataset.ids.output_dir, + short_names_df=metatlas_dataset.lcmsruns_short_names, + polarity=metatlas_dataset.ids.short_polarity, + overwrite=overwrite, + ) + + +def write_metrics_and_boxplots(metatlas_dataset, overwrite=False): + """ + Save metrics dataframes as csv and boxplots as PDF. + Will not overwrite existing file unless overwrite is True + """ + config = [ + {"name": "peak_height", "label": "Peak Height"}, + {"name": "peak_area", "label": None}, + {"name": "mz_peak", "label": None}, + {"name": "rt_peak", "label": "RT Peak"}, + {"name": "mz_centroid", "label": "MZ Centroid"}, + {"name": "rt_centroid", "label": None}, + ] + prefix = f"{metatlas_dataset.ids.short_polarity}_" + for fields in config: + df_dir = os.path.join(metatlas_dataset.ids.output_dir, f"{prefix}data_sheets") + dataframe = dp.make_output_dataframe( + fieldname=fields["name"], + input_dataset=metatlas_dataset, + output_loc=df_dir, + short_names_df=metatlas_dataset.lcmsruns_short_names, + polarity=metatlas_dataset.ids.short_polarity, + use_labels=True, + overwrite=overwrite, + ) + if fields["label"] is not None: + plot_dir = os.path.join( + metatlas_dataset.ids.output_dir, + f"{prefix}boxplot_{fields['name']}", + ) + dp.make_boxplot_plots(dataframe, output_loc=plot_dir, ylabel=fields["label"], overwrite=overwrite) + + +Max = namedtuple("Max", ["file_idx", "pre_intensity_idx", "pre_intensity", "precursor_mz"]) + + +def write_msms_fragment_ions( + data, intensity_fraction=0.01, min_mz=450, max_mz_offset=5, scale_intensity=1e5, overwrite=False +): + """ + inputs: + data: metatlas_dataset + intensity_fraction: intensity threshold as fraction of max_msms_intensity (0-1] + min_mz: minimum threshold MSMS mz value + max_mz_offset: maximum threshold MSMS mz value, 
relative to precursor mz with highest intensity + scale_intensity: If not None, normalize output intensity to maximum of scale_intensity + """ + out = [] + for compound_idx, _ in enumerate(data[0]): + max_vars = get_max_precursor_intensity(data, compound_idx) + out.append( + get_spectra_strings( + data[max_vars.file_idx][compound_idx], + max_vars.pre_intensity, + min_mz, + max_mz_offset + max_vars.precursor_mz, + intensity_fraction, + scale_intensity, + ) + ) + out_df = pd.DataFrame(out) + path = os.path.join(data.ids.output_dir, f"spectra_{intensity_fraction:.2f}pct_{int(min_mz)}cut.csv") + write_utils.export_dataframe(out_df, path, "MSMS fragment ions", overwrite) + return out_df + + +def get_max_precursor_intensity(data, compound_idx): + """ + inputs: + data: metatlas_dataset + compound_idx: index of compound to search over + returns Max object with file index of highest precursor intensity, associated intensity value, and mz + """ + max_pre_intensity = max_precursor_mz = 0 + max_file_idx = max_pre_intensity_idx = None + for file_idx, _ in enumerate(data): + try: + msms = data[file_idx][compound_idx]["data"]["msms"]["data"] + if len(msms["precursor_intensity"]) == 0: + continue + pre_intensity_idx = msms["precursor_intensity"].argmax() + pre_intensity = msms["precursor_intensity"][pre_intensity_idx] + precursor_mz = msms["precursor_MZ"][pre_intensity_idx] + rts = msms["rt"][pre_intensity_idx] + rt_ref = data[file_idx][compound_idx]["identification"].rt_references[-1] + if pre_intensity > max_pre_intensity and rt_ref.rt_min < rts < rt_ref.rt_max: + max_file_idx = file_idx + max_pre_intensity_idx = pre_intensity_idx + max_pre_intensity = pre_intensity + max_precursor_mz = precursor_mz + except (AttributeError, IndexError): + pass + return Max(max_file_idx, max_pre_intensity_idx, max_pre_intensity, max_precursor_mz) + + +def get_spectra_strings(data, max_pre_intensity, min_mz, max_mz, intensity_fraction, scale_intensity): + """ + inputs: + data: metatlas_dataset[sample_idx][compound_idx], a single sample-compound entry + max_pre_intensity: highest msms precursor intensity for this compound across all samples + min_mz: minimum threshold MSMS mz value + max_mz: maximum threshold MSMS mz value + intensity_fraction: intensity threshold as fraction of max_msms_intensity (0-1] + scale_intensity: If not None, normalize output intensity to maximum of scale_intensity + returns a dict containing compound name and string representations of the spectra + """ + mz_list, intensity_list = get_spectra( + data, max_pre_intensity, min_mz, max_mz, intensity_fraction, scale_intensity + ) + mz_str = str(["%.2f" % x for x in mz_list]).replace("'", "") + intensity_str = str(["%d" % x for x in intensity_list]).replace("'", "") + spectra_str = str([mz_str, intensity_str]).replace("'", "") + name = data["identification"].name + return {"name": name, "spectrum": spectra_str, "mz": mz_str, "intensity": intensity_str}
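get_spectra() below keeps only the scan with the highest precursor intensity and then thresholds its fragments. A toy numpy rendering of that filter, with made-up numbers (intensity_fraction=0.01, min_mz=450, max_mz=465); this is a sketch of the logic, not code from the patch:

    import numpy as np
    intensity = np.array([1.0e5, 5.0e2, 3.0e4])
    msms_mz = np.array([455.1, 470.2, 900.3])
    cutoff = 0.01 * intensity.max()  # intensity_fraction * max_msms_intensity
    keep = (intensity > cutoff) & (450 < msms_mz) & (msms_mz < 465)
    print(msms_mz[keep], intensity[keep])  # [455.1] [100000.]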
+ + +def get_spectra(data, max_pre_intensity, min_mz, max_mz, intensity_fraction, scale_intensity): + """ + inputs: + data: metatlas_dataset[i][j] + max_pre_intensity: highest msms precursor intensity for this compound across all samples + min_mz: minimum threshold MSMS mz value + max_mz: maximum threshold MSMS mz value + intensity_fraction: intensity threshold as fraction of max_msms_intensity (0-1] + scale_intensity: If not None, normalize output intensity to maximum of scale_intensity + returns a tuple containing a list of mz values and a list of intensity values that make a spectrum + returns None, None if no spectra meet the filtering thresholds + """ + if max_pre_intensity != 0: + msms = data["data"]["msms"]["data"] + idx = np.argwhere(msms["precursor_intensity"] == max_pre_intensity).flatten() + msms_mz = msms["mz"][idx] + intensity = msms["i"][idx] + max_msms_intensity = intensity.max() + cutoff = intensity_fraction * max_msms_intensity + conditions = (intensity > cutoff) & (min_mz < msms_mz) & (msms_mz < max_mz) + if any(conditions): + keep_idx = np.argwhere(conditions).flatten() + if scale_intensity is not None: + intensity = (intensity / max_msms_intensity * scale_intensity).astype(int) + return msms_mz[keep_idx], intensity[keep_idx] + return None, None diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py new file mode 100644 index 00000000..06e6c7b8 --- /dev/null +++ b/metatlas/io/write_utils.py @@ -0,0 +1,80 @@ +""" Utility functions used in writing files""" + +import logging +import os + +import pandas as pd + +logger = logging.getLogger(__name__) + + +def make_dir_for(file_path): + """makes directories for file_path if they don't already exist""" + directory = os.path.dirname(file_path) + if directory != "": + os.makedirs(directory, exist_ok=True) + + +def check_existing_file(file_path, overwrite=False): + """Creates directories as needed and throws an error if file exists and overwrite is False""" + make_dir_for(file_path) + try: + if not overwrite and os.path.exists(file_path): + raise FileExistsError(f"Not overwriting {file_path}.") + except FileExistsError as err: + logger.exception(err) + raise + + +def export_dataframe(dataframe, file_path, description, overwrite=False, **kwargs): + """ + inputs: + dataframe: pandas DataFrame to save + file_path: string with path of file to create + description: free string for logging + overwrite: if False, raise error if file already exists + remaining arguments are passed through to to_csv() + """ + check_existing_file(file_path, overwrite) + dataframe.to_csv(file_path, **kwargs) + logger.info("Exported %s to %s.", description, file_path) + + +def raise_on_diff(dataframe, file_path, description, **kwargs): + """ + inputs: + dataframe: pandas DataFrame to compare + file_path: string with path of file to compare against + description: free string for logging + remaining arguments are passed through to read_csv() + + If file_path exists and does not match dataframe then raise ValueError + """ + if not os.path.exists(file_path): + return + existing_df = pd.read_csv(file_path, **kwargs) + if dataframe.equals(existing_df): + logger.info("Data in %s is the same as %s.", description, file_path) + else: + try: + raise ValueError("Data in %s is not the same as %s." 
% (description, file_path)) + except ValueError as err: + logger.exception(err) + raise + + +def export_dataframe_die_on_diff(dataframe, file_path, description, **kwargs): + """ + inputs: + dataframe: pandas DataFrame to save + file_path: string with path of file to create + description: free string for logging + remaining arguments are passed through to to_csv() + + If file_path does not exist then save the dataframe there + If file_path exists and matches data in dataframe then do nothing + If file_path exists and does not match dataframe then raise ValueError + """ + raise_on_diff(dataframe, file_path, description, **kwargs) + if not os.path.exists(file_path): + export_dataframe(dataframe, file_path, description, **kwargs) diff --git a/metatlas/plots/chromplotplus.py b/metatlas/plots/chromplotplus.py index 77c64431..927991ea 100644 --- a/metatlas/plots/chromplotplus.py +++ b/metatlas/plots/chromplotplus.py @@ -1,14 +1,20 @@ -from __future__ import absolute_import +import logging +from textwrap import wrap + from matplotlib import pyplot as plt from matplotlib.backends.backend_pdf import PdfPages from matplotlib import collections as mc import numpy as np from scipy.interpolate import interp1d -from textwrap import wrap from six.moves import range from six.moves import zip +from metatlas.io import write_utils + +logger = logging.getLogger(__name__) + + def chromplotplus(kwargs): ChromPlotPlus(**kwargs) @@ -39,6 +45,7 @@ def __init__(self, data, shortname, x_scale = .8, y_scale = .75, x_ratio = 13.0, y_ratio=11.0, num_x_hashes=4, num_y_hashes=4, + overwrite=False, **kwargs): assert len(data) > 0 @@ -78,6 +85,7 @@ def __init__(self, data, shortname, self.y_ratio = y_ratio self.num_x_hashes = num_x_hashes self.num_y_hashes = num_y_hashes + self.overwrite = overwrite plt.ioff() @@ -218,13 +226,14 @@ def __make_figure(self): self.ax.annotate(self.compound_eics[i], subplot_xy[0], ha='center', va='center', size = 100./(num_cols+.25), weight='bold') + write_utils.check_existing_file(self.file_name, self.overwrite) with PdfPages(self.file_name) as pdf: plt.rcParams['pdf.fonttype'] = 42 plt.rcParams['pdf.use14corefonts'] = True plt.rcParams['text.usetex'] = False pdf.savefig(self.fig) plt.close() - + logger.info("Exported chromatogram to %s.", self.file_name) @staticmethod def __yield_label(): diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index e6223207..970d443b 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -5,12 +5,14 @@ import os import os.path import multiprocessing as mp +import warnings # os.environ['R_LIBS_USER'] = '/project/projectdirs/metatlas/r_pkgs/' # curr_ld_lib_path = '' from metatlas.datastructures import metatlas_objects as metob from metatlas.io import metatlas_get_data_helper_fun as ma_data +from metatlas.io import write_utils from metatlas.tools import spectralprocessing as sp from metatlas.plots import chromplotplus as cpp from metatlas.io.metatlas_get_data_helper_fun import extract @@ -277,7 +279,7 @@ def __init__(self, self.file_names = ma_data.get_file_names(self.data) self.configure_flags() - self.filter_runs(include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) + self.data = filter_runs(self.data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) self.similar_rects = [] # only the atlas owner can change RT limits or flags @@ -424,21 +426,6 @@ def display_eic_data(self): self.ax.plot(x, y, '-', zorder=zorder, linewidth=2, alpha=self.alpha, picker=True, pickradius=5, color=color, 
label=label) - def filter_runs(self, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups): - # filter runs from the metatlas dataset - if include_lcmsruns: - self.data = filter_lcmsruns_in_dataset_by_include_list(self.data, 'lcmsrun', - include_lcmsruns) - if include_groups: - self.data = filter_lcmsruns_in_dataset_by_include_list(self.data, 'group', - include_groups) - if exclude_lcmsruns: - self.data = filter_lcmsruns_in_dataset_by_exclude_list(self.data, 'lcmsrun', - exclude_lcmsruns) - if exclude_groups: - self.data = filter_lcmsruns_in_dataset_by_exclude_list(self.data, 'group', - exclude_groups) - def configure_flags(self): default_peak = ('keep', 'remove', 'unresolvable isomers', 'poor peak shape') default_msms = ('no selection', @@ -801,18 +788,7 @@ def __init__(self, self.slider_color = slider_color self.y_max = y_max self.y_min = y_min - - # filter runs from the metatlas dataset - if include_lcmsruns: - data = filter_lcmsruns_in_dataset_by_include_list(data,'lcmsrun',include_lcmsruns) - - if include_groups: - data = filter_lcmsruns_in_dataset_by_include_list(data,'group',include_groups) - if exclude_lcmsruns: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'lcmsrun',exclude_lcmsruns) - if exclude_groups: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'group',exclude_groups) - self.data = data + self.data = filter_runs(data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) # create figure and first axes self.fig,self.ax = plt.subplots(figsize=(width, height)) @@ -1042,25 +1018,14 @@ def plot_all_compounds_for_each_file(input_dataset = [], input_fname = '', inclu data = ma_data.get_dill_data(os.path.expandvars(input_fname)) else: data = input_dataset - - # filter runs from the metatlas dataset - if include_lcmsruns: - data = filter_lcmsruns_in_dataset_by_include_list(data,'lcmsrun',include_lcmsruns) - data = filter_lcmsruns_in_dataset_by_include_list(data,'group',include_lcmsruns) - - if exclude_lcmsruns: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'lcmsrun',exclude_lcmsruns) - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'group',exclude_lcmsruns) - + data = filter_runs(data, include_lcmsruns, include_lcmsruns, exclude_lcmsruns, exclude_lcmsruns) compound_names = ma_data.get_compound_names(data)[0] file_names = ma_data.get_file_names(data) - output_loc = os.path.expandvars('output_loc') nRows = int(np.ceil(len(compound_names)/float(nCols))) - xmin = 0 xmax = 210 subrange = float(xmax-xmin)/float(nCols) # scale factor for the x-axis @@ -1158,15 +1123,7 @@ def plot_all_files_for_each_compound(input_dataset = [], input_fname = '', inclu data = ma_data.get_dill_data(os.path.expandvars(input_fname)) else: data = input_dataset - - # filter runs from the metatlas dataset - if include_lcmsruns: - data = filter_lcmsruns_in_dataset_by_include_list(data,'lcmsrun',include_lcmsruns) - data = filter_lcmsruns_in_dataset_by_include_list(data,'group',include_lcmsruns) - - if exclude_lcmsruns: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'lcmsrun',exclude_lcmsruns) - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'group',exclude_lcmsruns) + data = filter_runs(data, include_lcmsruns, include_lcmsruns, exclude_lcmsruns, exclude_lcmsruns) compound_names = ma_data.get_compound_names(data)[0] file_names = ma_data.get_file_names(data) @@ -1393,21 +1350,8 @@ def get_ion_from_fragment(frag_info,spectrum): def calculate_median_of_internal_standards(dataset_for_median,atlas,include_lcmsruns = [],exclude_lcmsruns = 
[], include_groups = [],exclude_groups = []): - """ - - """ - - # filter runs from the metatlas dataset - - # dataset_for_median = copy.deepcopy(dataset_for_median) - if include_lcmsruns: - dataset_for_median = filter_lcmsruns_in_dataset_by_include_list(dataset_for_median,'lcmsrun',include_lcmsruns) - if include_groups: - dataset_for_median = filter_lcmsruns_in_dataset_by_include_list(dataset_for_median,'group',include_groups) - if exclude_lcmsruns: - dataset_for_median = filter_lcmsruns_in_dataset_by_exclude_list(dataset_for_median,'lcmsrun',exclude_lcmsruns) - if exclude_groups: - dataset_for_median = filter_lcmsruns_in_dataset_by_exclude_list(dataset_for_median,'group',exclude_groups) + dataset_for_median = filter_runs(dataset_for_median, include_lcmsruns, include_groups, + exclude_lcmsruns, exclude_groups) internal_standard_vals = [] for i,dd in enumerate(dataset_for_median): #loop through files for j,d in enumerate(dd): #loop through compounds @@ -1478,7 +1422,21 @@ def normalize_peaks_by_internal_standard(metatlas_dataset,atlas,include_lcmsruns #print all chromatograms #structure -def make_output_dataframe(input_fname = '',input_dataset = [],include_lcmsruns = [],exclude_lcmsruns = [], include_groups = [],exclude_groups = [], output_loc = [], fieldname = 'peak_height', use_labels=False, short_names_df=pd.DataFrame(), summarize=False, polarity=''): + +def filter_runs(data, include_lcmsruns=None, include_groups=None, exclude_lcmsruns=None, exclude_groups=None): + """filter runs from the metatlas dataset""" + if include_lcmsruns: + data = filter_lcmsruns_in_dataset_by_include_list(data, 'lcmsrun', include_lcmsruns) + if include_groups: + data = filter_lcmsruns_in_dataset_by_include_list(data, 'group', include_groups) + if exclude_lcmsruns: + data = filter_lcmsruns_in_dataset_by_exclude_list(data, 'lcmsrun', exclude_lcmsruns) + if exclude_groups: + data = filter_lcmsruns_in_dataset_by_exclude_list(data, 'group', exclude_groups) + return data + + +def make_output_dataframe(input_fname='', input_dataset=None, include_lcmsruns=None, exclude_lcmsruns=None, include_groups=None, exclude_groups=None, output_loc="", fieldname='peak_height', use_labels=False, short_names_df=None, summarize=False, polarity='', overwrite=True): """ fieldname can be: peak_height, peak_area, mz_centroid, rt_centroid, mz_peak, rt_peak """ @@ -1486,73 +1444,55 @@ def make_output_dataframe(input_fname = '',input_dataset = [],include_lcmsruns = data = ma_data.get_dill_data(os.path.expandvars(input_fname)) else: data = input_dataset + data = filter_runs(data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) - # filter runs from the metatlas dataset - if include_lcmsruns: - data = filter_lcmsruns_in_dataset_by_include_list(data,'lcmsrun',include_lcmsruns) - if include_groups: - data = filter_lcmsruns_in_dataset_by_include_list(data,'group',include_groups) - - if exclude_lcmsruns: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'lcmsrun',exclude_lcmsruns) - if exclude_groups: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'group',exclude_groups) - - compound_names = ma_data.get_compound_names(data,use_labels=use_labels)[0] + compound_names = ma_data.get_compound_names(data, use_labels=use_labels)[0] file_names = ma_data.get_file_names(data) group_names = ma_data.get_group_names(data) group_shortnames = ma_data.get_group_shortnames(data) output_loc = os.path.expandvars(output_loc) - fieldname = fieldname + out = pd.DataFrame(index=compound_names, columns=file_names, dtype=float) - df = 
pd.DataFrame( index=compound_names, columns=file_names, dtype=float) - - # peak_height['compound'] = compound_list - # peak_height.set_index('compound',drop=True) - for i,dd in enumerate(data): - for j,d in enumerate(dd): - if (not d['data']['ms1_summary']) or (not d['data']['ms1_summary'][fieldname]): - df.loc[compound_names[j],file_names[i]] = 0 - else: - df.loc[compound_names[j],file_names[i]] = d['data']['ms1_summary'][fieldname] + for i, sample in enumerate(data): + for j, compound in enumerate(sample): + ids = ['data', 'ms1_summary', fieldname] + out.loc[compound_names[j], file_names[i]] = ma_data.extract(compound, ids, 0) columns = [] - if short_names_df.empty: - for i,f in enumerate(file_names): - columns.append((group_names[i],f)) - df.columns = pd.MultiIndex.from_tuples(columns,names=['group', 'file']) + if short_names_df is None: + short_names_df = pd.DataFrame() + for i, name in enumerate(file_names): + columns.append((group_names[i], name)) + out.columns = pd.MultiIndex.from_tuples(columns, names=['group', 'file']) else: - for i,f in enumerate(file_names): - temp = [group_names[i],f, group_shortnames[i]] - temp.extend(short_names_df.loc[f.split('.')[0]].values.tolist()) + for i, name in enumerate(file_names): + temp = [group_names[i], name, group_shortnames[i]] + temp.extend(short_names_df.loc[name.split('.')[0]].values.tolist()) columns.append(tuple(temp)) - df.columns = pd.MultiIndex.from_tuples(columns,names=['group', 'file', 'short groupname', 'sample treatment', 'short filename','short samplename']) - + out.columns = pd.MultiIndex.from_tuples(columns, names=['group', 'file', 'short groupname', 'sample treatment', 'short filename', 'short samplename']) + out = out.reindex(sorted(out.columns), axis=1) + if summarize: + out.columns = out.columns.droplevel() + out = append_stats_columns(out) if output_loc: - if not os.path.exists(output_loc): - os.makedirs(output_loc) - if polarity == '': - df.to_csv(os.path.join(output_loc, fieldname + '.tab'),sep='\t') - else: - df.to_csv(os.path.join(output_loc, polarity+'_'+fieldname + '.tab'),sep='\t') + prefix = f"{polarity}_" if polarity != '' else '' + df_path = os.path.join(output_loc, f"{prefix}{fieldname}.tab") + write_utils.check_existing_file(df_path, overwrite) + out.to_csv(df_path, sep="\t") + logger.info('Exported %s to %s.', fieldname, df_path) + return out - if summarize: - df.columns = df.columns.droplevel() - df_mean = df.mean(numeric_only=True, axis=1) - df_median = df.median(numeric_only=True, axis=1) - df_min = df.min(numeric_only=True, axis=1) - df_max = df.max(numeric_only=True, axis=1) - df_std = df.std(numeric_only=True, axis=1) - df_sem = df.sem(numeric_only=True, axis=1) - df_nan = df.isin(['NaN']).sum(axis=1) - df['mean'] = df_mean - df['median'] = df_median - df['max'] = df_max - df['min'] = df_min - df['standard deviation'] = df_std - df['standard error'] = df_sem - df['#NaNs'] = df_nan - return df +def append_stats_columns(in_df): + stats = pd.DataFrame(dtype=float) + stats['mean'] = in_df.mean(numeric_only=True, axis=1) + stats['median'] = in_df.median(numeric_only=True, axis=1) + stats['min'] = in_df.min(numeric_only=True, axis=1) + stats['max'] = in_df.max(numeric_only=True, axis=1) + stats['standard deviation'] = in_df.std(numeric_only=True, axis=1) + stats['standard error'] = in_df.sem(numeric_only=True, axis=1) + stats['#NaNs'] = in_df.isin(['NaN']).sum(axis=1) + return pd.concat([in_df, stats], axis=1) + def file_with_max_precursor_intensity(data,compound_idx): idx = None @@ -1648,12 +1588,9 @@ def 
plot_errorbar_plots(df,output_loc='', use_shortnames=True, ylabel=""): #f.clear() plt.close(f)#f.clear() -def make_boxplot_plots(df,output_loc='', use_shortnames=True, ylabel=""): +def make_boxplot_plots(df, output_loc='', use_shortnames=True, ylabel="", overwrite=True): output_loc = os.path.expandvars(output_loc) - if not os.path.exists(output_loc): - os.makedirs(output_loc) - plt.ioff() for compound in df.index: f, ax = plt.subplots(1, 1,figsize=(12,12)) @@ -1673,9 +1610,13 @@ def make_boxplot_plots(df,output_loc='', use_shortnames=True, ylabel=""): if ylabel != "": plt.ylabel(ylabel) plt.tight_layout() - f.savefig(os.path.join(output_loc, compound + '_boxplot.pdf')) + fig_path = os.path.join(output_loc, compound + '_boxplot.pdf') + write_utils.check_existing_file(fig_path, overwrite) + f.savefig(fig_path) #f.clear() plt.close(f)#f.clear() + logger.info('Exported box plot of %s for %s at %s.', ylabel, compound, fig_path) + def frag_refs_to_json(json_dir = '/project/projectdirs/metatlas/projects/sharepoint/', name = 'frag_refs', save = True): ids = metob.retrieve('CompoundIdentification',username='*') @@ -1725,20 +1666,7 @@ def make_identification_figure(frag_json_dir = '/project/projectdirs/metatlas/pr data = ma_data.get_dill_data(os.path.expandvars(input_fname)) else: data = input_dataset - - # filter runs from the metatlas dataset - if include_lcmsruns: - data = filter_lcmsruns_in_dataset_by_include_list(data,'lcmsrun',include_lcmsruns) - if include_groups: - data = filter_lcmsruns_in_dataset_by_include_list(data,'group',include_groups) - - if exclude_lcmsruns: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'lcmsrun',exclude_lcmsruns) - if exclude_groups: - data = filter_lcmsruns_in_dataset_by_exclude_list(data,'lcmsrun',exclude_groups) - #data = filter_lcmsruns_in_dataset_by_exclude_list(data,'group',exclude_lcmsruns) - - + data = filter_runs(data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) compound_names = ma_data.get_compound_names(data,use_labels=use_labels)[0] file_names = ma_data.get_file_names(data) # print(len(data),len(data[0]),len(compound_names)) @@ -2288,19 +2216,10 @@ def get_msms_hits(metatlas_dataset, use_labels=False, extra_time=False, keep_non return pd.DataFrame(columns=ref_df.index.names+['file_name', 'msms_scan', 'score', 'num_matches','inchi_key','precursor_mz','adduct','score'] ).set_index(ref_df.index.names+['file_name', 'msms_scan']) -def make_chromatograms(input_dataset = [], include_lcmsruns = [], exclude_lcmsruns = [], include_groups = [], exclude_groups = [], group='index', share_y = True, save=True, output_loc=[], short_names_df=pd.DataFrame(), short_names_header=None, polarity=''): - - #Filter runs from the metatlas dataset - if include_lcmsruns: - input_dataset = filter_lcmsruns_in_dataset_by_include_list(input_dataset, 'lcmsrun', include_lcmsruns) - if exclude_lcmsruns: - input_dataset = filter_lcmsruns_in_dataset_by_exclude_list(input_dataset, 'lcmsrun', exclude_lcmsruns) - - if include_groups: - input_dataset = filter_lcmsruns_in_dataset_by_include_list(input_dataset, 'group', include_groups) - if exclude_groups: - input_dataset = filter_lcmsruns_in_dataset_by_exclude_list(input_dataset, 'group', exclude_groups) +def make_chromatograms(input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], include_groups=[], exclude_groups=[], group='index', share_y=True, save=True, output_loc=[], short_names_df=pd.DataFrame(), short_names_header=None, polarity='', overwrite=False): + input_dataset = filter_runs(input_dataset, 
include_lcmsruns, include_groups, + exclude_lcmsruns, exclude_groups) file_names = ma_data.get_file_names(input_dataset) if short_names_df.empty: @@ -2317,196 +2236,123 @@ def make_chromatograms(input_dataset = [], include_lcmsruns = [], exclude_lcmsru short_names_df = short_names_df[[short_names_header]] short_names_df.columns=['shortname'] - - - if not os.path.exists(output_loc): - os.makedirs(output_loc) + os.makedirs(output_loc, exist_ok=True) compound_names = ma_data.get_compound_names(input_dataset,use_labels=True)[0] args_list = [] - chromatogram_str = 'compound_EIC_chromatograms' - if polarity != '': - chromatogram_str = polarity+'_'+chromatogram_str - - if not os.path.exists(os.path.join(output_loc,chromatogram_str)): - os.makedirs(os.path.join(output_loc,chromatogram_str)) - + prefix = f"{polarity}_" if polarity != '' else '' + chromatogram_dir = os.path.join(output_loc, f"{prefix}compound_EIC_chromatograms") for compound_idx, my_compound in enumerate(compound_names): - my_data = list() - for file_idx, my_file in enumerate(file_names): - my_data.append(input_dataset[file_idx][compound_idx]) + my_data = [input_dataset[file_idx][compound_idx] for file_idx, _ in enumerate(file_names)] kwargs = {'data': my_data, - 'file_name': os.path.join(output_loc, chromatogram_str, my_compound+'.pdf'), - 'group': group, - 'save': save, - 'share_y': share_y, - 'names': file_names, - #'shortname':findcommonstart(file_names)} - 'shortname':short_names_df} + 'file_name': os.path.join(chromatogram_dir, my_compound+'.pdf'), + 'group': group, + 'save': save, + 'share_y': share_y, + 'names': file_names, + 'shortname': short_names_df, + 'overwrite': overwrite} args_list.append(kwargs) max_processes = 4 - pool = mp.Pool(processes=min(max_processes, len(input_dataset[0]))) - pool.map(cpp.chromplotplus, args_list) - pool.close() - pool.terminate() - -def make_identification_figure_v2( - input_fname = '', input_dataset = [], include_lcmsruns = [], exclude_lcmsruns = [], include_groups = [], - exclude_groups = [], output_loc = [], msms_hits = None, use_labels=False,intensity_sorted_matches=False, short_names_df=pd.DataFrame(), polarity=''): - #empty can look like this: - # {'eic': {'rt': [], 'intensity': [], 'mz': []}, 'ms1_summary': {'num_ms1_datapoints': 0.0, 'rt_centroid': nan, 'mz_peak': nan, 'peak_height': nan, 'rt_peak': nan, 'peak_area': nan, 'mz_centroid': nan}, - #'msms': {'data': {'rt': array([], dtype=float64), 'collision_energy': array([], dtype=float64), 'i': array([], dtype=float64), 'precursor_intensity': array([], dtype=float64), 'precursor_MZ': array([], dtype=float64), 'mz': array([], dtype=float64)}}} - #or empty can look like this: - # {'eic': None, 'ms1_summary': None, 'msms': {'data': []}} - if not os.path.exists(output_loc): - os.makedirs(output_loc) - if polarity == '': - output_loc = os.path.join(output_loc,'msms_mirror_plots') - else: - output_loc = os.path.join(output_loc, polarity+'_msms_mirror_plots') - if not os.path.exists(output_loc): - os.makedirs(output_loc) + with mp.Pool(processes=min(max_processes, len(input_dataset[0]))) as pool: + pool.map(cpp.chromplotplus, args_list) + + +def make_identification_figure_v2(input_fname='', input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], + include_groups=[], exclude_groups=[], output_loc=[], msms_hits=None, + use_labels=False, intensity_sorted_matches=False, + short_names_df=pd.DataFrame(), polarity='', overwrite=True): + prefix = '' if polarity == '' else f"{polarity}_" + output_loc = os.path.join(output_loc, 
f"{prefix}msms_mirror_plots") if not input_dataset: data = ma_data.get_dill_data(os.path.expandvars(input_fname)) else: data = input_dataset + data = filter_runs(data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) - #Filter runs from the metatlas dataset - if include_lcmsruns: - data = filter_lcmsruns_in_dataset_by_include_list(data, 'lcmsrun', include_lcmsruns) - if include_groups: - data = filter_lcmsruns_in_dataset_by_include_list(data, 'group', include_groups) - - if exclude_lcmsruns: - data = filter_lcmsruns_in_dataset_by_exclude_list(data, 'lcmsrun', exclude_lcmsruns) - if exclude_groups: - data = filter_lcmsruns_in_dataset_by_exclude_list(data, 'group', exclude_groups) - - #msms_hits_df = get_msms_hits(data, use_labels, ref_index=['database', 'id', 'inchi_key', 'precursor_mz']) - msms_hits_df = msms_hits.copy() - - if msms_hits_df is not None: - #if 'inchi_key' in msms_hits_df.columns: - # msms_hits_df.rename(columns={'inchi_key':'inchi_key_2'},inplace=True) - #msms_hits_df.reset_index(['inchi_key', 'precursor_mz'], inplace=True) - msms_hits_df.reset_index(inplace = True) - msms_hits_df.sort_values('score', ascending=False, inplace=True) - # msms_hits_df.drop_duplicates(['inchi_key', 'file_name'], keep='first', inplace=True) - # msms_hits_df = msms_hits_df.groupby(['inchi_key']).head(5).sort_values(['inchi_key'], kind='mergesort') - - #Obtain compound and file names - compound_names = ma_data.get_compound_names(data,use_labels)[0] + if msms_hits is not None: + msms_hits_df = msms_hits.reset_index().sort_values('score', ascending=False) + compound_names = ma_data.get_compound_names(data, use_labels)[0] file_names = ma_data.get_file_names(data) - - df = pd.DataFrame() - #Turn off interactive plotting + match = pd.DataFrame() plt.ioff() plt.clf() - #Iterate over compounds - for compound_idx in range(len(compound_names)): - sys.stdout.write('\r'+'Making Identification Figure for: {} / {} compounds.'.format(compound_idx+1,len(compound_names))) - sys.stdout.flush() + for compound_idx, _ in enumerate(compound_names): file_idxs, scores, msv_sample_list, msv_ref_list, rt_list = [], [], [], [], [] - - if len(data[0][compound_idx]['identification'].compound) > 0 and hasattr(data[0][compound_idx]['identification'].compound[0],"inchi_key"): - inchi_key = data[0][compound_idx]['identification'].compound[0].inchi_key - else: - inchi_key = "" - - #Find 5 best file and reference pairs by score + inchi_key = extract(data, [0, compound_idx, "identification", "compound", 0, "inchi_key"], "") + # Find 5 best file and reference pairs by score try: - comp_msms_hits = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) \ - & (msms_hits_df['msms_scan'] >= data[0][compound_idx]['identification'].rt_references[0].rt_min) \ - & (msms_hits_df['msms_scan'] <= data[0][compound_idx]['identification'].rt_references[0].rt_max) \ - & ((abs(msms_hits_df['precursor_mz'].values.astype(float) - data[0][compound_idx]['identification'].mz_references[0].mz)/data[0][compound_idx]['identification'].mz_references[0].mz) \ - <= data[0][compound_idx]['identification'].mz_references[0].mz_tolerance*1e-6)].drop_duplicates('file_name').head(5) - # Dont need assert anymore, keep_nonmatch in get_msms_hits should replace the assert - #assert len(comp_msms_hits) > 0 + rt_ref = data[0][compound_idx]['identification'].rt_references[0] + mz_ref = data[0][compound_idx]['identification'].mz_references[0] + comp_msms_hits = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) + & (msms_hits_df['msms_scan'] >= 
rt_ref.rt_min)
+                                          & (msms_hits_df['msms_scan'] <= rt_ref.rt_max)
+                                          & within_tolerance(
+                                                msms_hits_df['precursor_mz'].values.astype(float),
+                                                mz_ref.mz,
+                                                mz_ref.mz_tolerance*1e-6
+                                          )
+                                          ].drop_duplicates('file_name').head(5)
             comp_msms_hits = comp_msms_hits[comp_msms_hits['file_name'].isin(file_names)]
             file_idxs = [file_names.index(f) for f in comp_msms_hits['file_name']]
             scores = comp_msms_hits['score'].values.tolist()
             msv_sample_list = comp_msms_hits['msv_query_aligned'].values.tolist()
             msv_ref_list = comp_msms_hits['msv_ref_aligned'].values.tolist()
             rt_list = comp_msms_hits['msms_scan'].values.tolist()
-
-        #except (IndexError, AssertionError, TypeError) as e:
-        except (IndexError, TypeError) as e:
+        except (IndexError, TypeError):
-            file_idx = None
+            max_file_idx = None
             max_intensity = 0
-
-            for fi in range(len(data)):
+            for file_idx, _ in enumerate(data):
                 try:
-                    temp = max(data[fi][compound_idx]['data']['eic']['intensity'])
+                    temp = max(data[file_idx][compound_idx]['data']['eic']['intensity'])
                     if temp > max_intensity:
-                        file_idx = fi
+                        max_file_idx = file_idx
                         max_intensity = temp
-                except (ValueError,TypeError):
+                except (ValueError, TypeError):
                     continue
-            file_idxs = [file_idx]
+            file_idxs = [max_file_idx]
             msv_sample_list = [np.array([0, np.nan]).T]
             msv_ref_list = [np.array([0, np.nan]).T]
             scores = [np.nan]
-
-        #Plot if compound yields any scores
+        # Plot if compound yields any scores
         if file_idxs and file_idxs[0] is not None:
-            #Top 5 MSMS Spectra
-            ax1 = plt.subplot2grid((24, 24), (0, 0), rowspan=12, colspan=12)
-            ax2a = plt.subplot2grid((24, 24), (0, 12), rowspan=3, colspan=3)
-            ax2a.tick_params(axis='both', length=2)
-            ax2a.set_xticklabels([])
-            ax2a.set_yticklabels([])
-            ax2b = plt.subplot2grid((24, 24), (3, 12), rowspan=3, colspan=3)
-            ax2b.tick_params(axis='both', length=2)
-            ax2b.set_xticklabels([])
-            ax2b.set_yticklabels([])
-            ax2c = plt.subplot2grid((24, 24), (6, 12), rowspan=3, colspan=3)
-            ax2c.tick_params(axis='both', length=2)
-            ax2c.set_xticklabels([])
-            ax2c.set_yticklabels([])
-            ax2d = plt.subplot2grid((24, 24), (9, 12), rowspan=3, colspan=3)
-            ax2d.tick_params(axis='both', length=2)
-            ax2d.set_xticklabels([])
-            ax2d.set_yticklabels([])
-
-
-            for i,(score,ax) in enumerate(zip(scores,[ax1, ax2a, ax2b, ax2c, ax2d])):
-                plot_msms_comparison(i, score, ax, msv_sample_list[i], msv_ref_list[i])
-
-
-
-            #Next Best Scores and Filenames
-            ax4a = plt.subplot2grid((24, 24), (0, 15), rowspan=3, colspan=1)
-            ax4a.axis('off')
-            ax4b = plt.subplot2grid((24, 24), (3, 15), rowspan=3, colspan=1)
-            ax4b.axis('off')
-            ax4c = plt.subplot2grid((24, 24), (6, 15), rowspan=3, colspan=1)
-            ax4c.axis('off')
-            ax4d = plt.subplot2grid((24, 24), (9, 15), rowspan=3, colspan=1)
-            ax4d.axis('off')
-
+            # Top 5 MSMS Spectra
+            top_5_axis = [plt.subplot2grid((24, 24), (0, 0), rowspan=12, colspan=12)]
+            for i in [0, 3, 6, 9]:
+                top_5_axis.append(plt.subplot2grid((24, 24), (i, 12), rowspan=3, colspan=3))
+                top_5_axis[-1].tick_params(axis='both', length=2)
+                top_5_axis[-1].set_xticklabels([])
+                top_5_axis[-1].set_yticklabels([])
+            for i, (score, axis) in enumerate(zip(scores, top_5_axis)):
+                plot_msms_comparison(i, score, axis, msv_sample_list[i], msv_ref_list[i])
+
+            def no_axis_plot(i):
+                axis = plt.subplot2grid((24, 24), (i, 15), rowspan=3, colspan=1)
+                axis.axis('off')
+                return axis
+
+            # Next Best Scores and Filenames
+            next_best = [no_axis_plot(i) for i in [0, 3, 6, 9]]
             if short_names_df.empty:
-                for i,(score,ax) in enumerate(zip(scores[1:],[ax4a, ax4b, ax4c, ax4d])):
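`within_tolerance`, used in the hit-filtering expression above, is likewise not defined in the hunks shown here. A sketch consistent with its call site (element-wise over the precursor m/z array, mirroring the inline ppm comparison it replaces) might be:

```python
def within_tolerance(measured, theoretical, tolerance):
    """Return an element-wise mask that is True where the relative
    difference from theoretical is at most tolerance."""
    # Sketch only; equivalent to the replaced inline expression
    # abs(precursor_mz - mz) / mz <= mz_tolerance * 1e-6.
    return abs(measured - theoretical) / theoretical <= tolerance
```

-                    plot_score_and_ref_file(ax, score, rt_list[i+1], 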
os.path.basename(data[file_idxs[i+1]][compound_idx]['lcmsrun'].hdf5_file)) + for i, (score, axis) in enumerate(zip(scores[1:], next_best)): + plot_score_and_ref_file(axis, score, rt_list[i+1], os.path.basename(data[file_idxs[i+1]][compound_idx]['lcmsrun'].hdf5_file)) else: - for i,(score,ax) in enumerate(zip(scores[1:],[ax4a, ax4b, ax4c, ax4d])): - short_samplename = short_names_df.loc[os.path.basename(data[file_idxs[i+1]][compound_idx]['lcmsrun'].hdf5_file).split('.')[0], 'short_samplename'][0] + for i, (score, ax) in enumerate(zip(scores[1:], next_best)): + short_samplename = short_names_df.loc[os.path.basename(data[file_idxs[i+1]][compound_idx]['lcmsrun'].hdf5_file).split('.')[0], 'short_samplename'][0] plot_score_and_ref_file(ax, score, rt_list[i+1], short_samplename) - - #EMA Compound Info + # EMA Compound Info if file_idxs and file_idxs[0] is not None: ax3 = plt.subplot2grid((24, 24), (0, 16), rowspan=6, colspan=8) - plot_ema_compound_info(ax3, data[file_idxs[0]][compound_idx]['identification'])#, - # ma_data.get_compound_names(data,use_labels=True)[0][compound_idx]) + plot_ema_compound_info(ax3, data[file_idxs[0]][compound_idx]['identification']) else: ax3 = plt.subplot2grid((24, 24), (0, 0), rowspan=6, colspan=8) - plot_ema_compound_info(ax3, data[0][compound_idx]['identification'])#, - + plot_ema_compound_info(ax3, data[0][compound_idx]['identification']) - #Structure + # Structure if file_idxs and file_idxs[0] is not None: ax5 = plt.subplot2grid((24, 24), (13, 0), rowspan=6, colspan=6) plot_structure(ax5, data[file_idxs[0]][compound_idx]['identification'].compound, 100) @@ -2514,7 +2360,7 @@ def make_identification_figure_v2( ax5 = plt.subplot2grid((24, 24), (13, 0), rowspan=6, colspan=6) plot_structure(ax5, data[0][compound_idx]['identification'].compound, 100) - #EIC + # EIC if file_idxs and file_idxs[0] is not None: ax6 = plt.subplot2grid((21, 21), (6, 15), rowspan=5, colspan=6) plot_eic(ax6, data, compound_idx) @@ -2522,11 +2368,7 @@ def make_identification_figure_v2( ax6 = plt.subplot2grid((21, 21), (6, 0), rowspan=5, colspan=6) plot_eic(ax6, data, compound_idx) -# #Reference and Sample Info -# ax10 = plt.subplot2grid((24, 24), (14, 6), rowspan=10, colspan=20) -# plot_ref_sample_info(ax10, 1, 1) - - #Old code + # Old code if file_idxs and file_idxs[0] is not None: ax7 = plt.subplot2grid((24, 24), (15, 6), rowspan=9, colspan=20) mz_theoretical = data[file_idxs[0]][compound_idx]['identification'].mz_references[0].mz @@ -2560,20 +2402,26 @@ def make_identification_figure_v2( fill('Matching M/Zs above 1E-3*max: ' + ', '.join(['%5.3f'%m for m in threshold_mz_sample_matches]), width=90) + '\n\n' + fill('All Matching M/Zs: ' + ', '.join(['%5.3f'%m for m in mz_sample_matches]), width=90), fontsize=6, verticalalignment='top') - df.loc[compound_idx, 'label'] = compound_names[compound_idx] - df.loc[compound_idx, 'file name'] = file_names[file_idxs[0]] - df.loc[compound_idx, 'RT'] = rt_list[0] - df.loc[compound_idx, 'score'] = scores[0] - df.loc[compound_idx, 'Matching M/Zs above 1E-3*max'] =', '.join(['%5.3f'%m for m in threshold_mz_sample_matches]) - df.loc[compound_idx, 'All matching M/Zs'] = ','.join(['%5.3f'%m for m in mz_sample_matches]) + match.loc[compound_idx, 'label'] = compound_names[compound_idx] + match.loc[compound_idx, 'file name'] = file_names[file_idxs[0]] + match.loc[compound_idx, 'RT'] = rt_list[0] + match.loc[compound_idx, 'score'] = scores[0] + match.loc[compound_idx, 'Matching M/Zs above 1E-3*max'] = ', '.join(['%5.3f' % m for m in threshold_mz_sample_matches]) + 
match.loc[compound_idx, 'All matching M/Zs'] = ','.join(['%5.3f' % m for m in mz_sample_matches]) ax7.set_ylim(.5,1.1) ax7.axis('off') - plt.tight_layout() - plt.savefig(os.path.join(output_loc, compound_names[compound_idx] + '.pdf')) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="tight_layout not applied: number of rows in subplot specifications must be multiples of one another.") + plt.tight_layout() + fig_path = os.path.join(output_loc, compound_names[compound_idx] + '.pdf') + write_utils.check_existing_file(fig_path, overwrite) + plt.savefig(fig_path) plt.close() - df.to_csv(os.path.join(output_loc, 'MatchingMZs.tab'),sep='\t') + logger.info('Exported identification figures for %s to %s.', compound_names[compound_idx], fig_path) + match_path = os.path.join(output_loc, 'MatchingMZs.tab') + write_utils.export_dataframe(match, match_path, 'matching MZs', overwrite, sep='\t') def plot_ms1_spectra(polarity = None, mz_min = 5, mz_max = 5, input_fname = '', input_dataset = [], compound_names = [], include_lcmsruns = [], exclude_lcmsruns = [], include_groups = [], exclude_groups = [], output_loc = []): @@ -2588,16 +2436,7 @@ def plot_ms1_spectra(polarity = None, mz_min = 5, mz_max = 5, input_fname = '', data = ma_data.get_dill_data(os.path.expandvars(input_fname)) else: data = input_dataset - - if include_lcmsruns: - data = filter_lcmsruns_in_dataset_by_include_list(data, 'lcmsrun', include_lcmsruns) - if include_groups: - data = filter_lcmsruns_in_dataset_by_include_list(data, 'group', include_groups) - - if exclude_lcmsruns: - data = filter_lcmsruns_in_dataset_by_exclude_list(data, 'lcmsrun', exclude_lcmsruns) - if exclude_groups: - data = filter_lcmsruns_in_dataset_by_exclude_list(data, 'group', exclude_groups) + data = filter_runs(data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) #Make sure there is data assert(len(data) != 0) @@ -3265,43 +3104,59 @@ def filter_empty_metatlas_objects(object_list,field): pass return filtered_list -def filter_metatlas_objects_by_list(object_list,field,filter_list): - filtered_list = [] - for i,g in enumerate(object_list): - if any(ext in getattr(g,field) for ext in filter_list): - filtered_list.append(g) - return filtered_list -def remove_metatlas_objects_by_list(object_list,field,filter_list): - filtered_list = [] - for i,g in enumerate(object_list): - if not any(ext in getattr(g,field) for ext in filter_list): - filtered_list.append(g) - return filtered_list +def filter_metatlas_objects_by_list(object_list, field, filter_list): + """ + inputs: + object_list: iterable to be filtered by its attribute values + field: name of attribute to filter on + filter_list: strings that are tested to see if they are substrings of the attribute value + returns filtered list of objects that have a match in filter_list + """ + return filter_by_list(object_list, lambda x: getattr(x, field), filter_list) + + +def remove_metatlas_objects_by_list(object_list, field, filter_list): + """ + inputs: + object_list: iterable to be filtered by its attribute values + field: name of attribute to filter on + filter_list: strings that are tested to see if they are substrings of the attribute value + returns filtered list of objects that do not have matches to filter_list + """ + return filter_by_list(object_list, lambda x: getattr(x, field), filter_list, include=False) + + +def filter_by_list(data, key_func, term_list, include=True): + """ + inputs: + data: iterable to be filtered + key_func: function that takes a member of d and 
returns string to compare with term_list
+    term_list: strings that are tested to see if they are substrings of key_func return value
+    include: if True, then matches are included in output, else matches are excluded
+    """
+    allow = any if include else lambda x: not any(x)
+    return [d for d in data if allow(ext in key_func(d) for ext in term_list)]
 
-def filter_lcmsruns_in_dataset_by_include_list(metatlas_dataset,selector,include_list):
+
+def filter_lcmsruns_in_dataset_by_include_list(metatlas_dataset, selector, include_list):
     """
-    Returns a metatlas dataset containing LCMS runs or groups (denoted by selector) that have substrings listed in the include list
+    Returns a metatlas dataset containing LCMS runs or groups (denoted by selector) that have substrings
+    listed in the include list.
     selector can be 'lcmsrun' or 'group'
     include_list will look something like this: ['QC','Blank']
     """
-    filtered_dataset = []
-    for d in metatlas_dataset:
-        if any(ext in d[0][selector].name for ext in include_list):
-            filtered_dataset.append(d)
-    return filtered_dataset
+    return filter_by_list(metatlas_dataset, lambda x: x[0][selector].name, include_list)
+
 
-def filter_lcmsruns_in_dataset_by_exclude_list(metatlas_dataset,selector,exclude_list):
+def filter_lcmsruns_in_dataset_by_exclude_list(metatlas_dataset, selector, exclude_list):
     """
-    Returns a metatlas dataset containing LCMS runs or groups (denoted by selector) that have substrings not listed in the include list
+    Returns a metatlas dataset containing only those LCMS runs or groups (denoted by selector) whose
+    names do not contain any of the substrings in the exclude list.
     selector can be 'lcmsrun' or 'group'
     exclude_list will look something like this: ['QC','Blank']
     """
-    filtered_dataset = []
-    for d in metatlas_dataset:
-        if not any(ext in d[0][selector].name for ext in exclude_list):
-            filtered_dataset.append(d)
-    return filtered_dataset
+    return filter_by_list(metatlas_dataset, lambda x: x[0][selector].name, exclude_list, include=False)
 
 
 def filter_compounds_in_dataset_by_exclude_list(metatlas_dataset,exclude_list):
diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py
index f80ad3d3..de9a4cdc 100644
--- a/metatlas/tools/fastanalysis.py
+++ b/metatlas/tools/fastanalysis.py
@@ -1,19 +1,21 @@
 from __future__ import absolute_import
 from __future__ import print_function
-import sys
+import logging
 import os
 import multiprocessing as mp
 import pprint
+from six.moves import range
+
+import numpy as np
+import pandas as pd
 
 from metatlas.io import metatlas_get_data_helper_fun as ma_data
-from metatlas.datastructures import metatlas_objects as metob
+from metatlas.io import write_utils
 from metatlas.plots import dill2plots as dp
 from metatlas.plots import chromplotplus as cpp
 from metatlas.tools import spectralprocessing as sp
-import numpy as np
-import pandas as pd
-from six.moves import range
+logger = logging.getLogger(__name__)
 
 loose_param = {'min_intensity': 1e3,
                'rt_tolerance': .25,
@@ -47,7 +49,8 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
                                       'mz_centroid': ['peak_height', 'mz_ppm'],
                                       'mz_ppm': ['peak_height'],
                                       'msms_score': ['peak_height', 'num_frag_matches'],
-                                      'num_frag_matches': ['peak_height', 'msms_score']}):
+                                      'num_frag_matches': ['peak_height', 'msms_score']},
+                     overwrite=False):
 
     assert output_loc is not None or return_all
 
@@ -56,43 +59,21 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
     else:
         metatlas_dataset = input_dataset
 
-    if output_loc is not None and not 
os.path.exists(output_loc): - os.mkdir(output_loc) - if output_loc is not None and not os.path.exists(os.path.join(output_loc,'data_sheets')): - os.mkdir(os.path.join(output_loc,'data_sheets')) - if output_loc is not None and not os.path.exists(os.path.join(output_loc,'stats_tables')): - os.mkdir(os.path.join(output_loc,'stats_tables')) - - if polarity != '': - output_sheetname = polarity+'_'+output_sheetname - - # filter runs from the metatlas dataset - if include_lcmsruns: - metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list(metatlas_dataset,'lcmsrun',include_lcmsruns) - if include_groups: - metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list(metatlas_dataset,'group',include_groups) - - if exclude_lcmsruns: - metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list(metatlas_dataset,'lcmsrun',exclude_lcmsruns) - if exclude_groups: - metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list(metatlas_dataset,'group',exclude_groups) - + metrics = ['msms_score', 'num_frag_matches', 'mz_centroid', 'mz_ppm', 'rt_peak', 'rt_delta', + 'peak_height', 'peak_area', 'num_data_points'] + dfs = {m: None for m in metrics} + for metric in ['peak_height', 'peak_area', 'rt_peak', 'mz_centroid']: + dfs[metric] = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname=metric, use_labels=use_labels, output_loc=os.path.join(output_loc, 'data_sheets'), polarity=polarity) + metatlas_dataset = dp.filter_runs(metatlas_dataset, include_lcmsruns, include_groups, + exclude_lcmsruns, exclude_groups) final_df = pd.DataFrame(columns=['index']) file_names = ma_data.get_file_names(metatlas_dataset) - compound_names = ma_data.get_compound_names(metatlas_dataset,use_labels=use_labels)[0] - - metrics = ['msms_score', 'num_frag_matches', 'mz_centroid', 'mz_ppm', 'rt_peak', 'rt_delta', 'peak_height', 'peak_area', 'num_data_points'] - - dfs = {m:None for m in metrics} - passing = {m:np.ones((len(compound_names), len(file_names))).astype(float) for m in metrics} - - for metric in ['peak_height', 'peak_area', 'rt_peak', 'mz_centroid']: - dfs[metric] = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname=metric, use_labels=use_labels,output_loc=os.path.join(output_loc,'data_sheets'), polarity=polarity) + compound_names = ma_data.get_compound_names(metatlas_dataset, use_labels=use_labels)[0] + passing = {m: np.ones((len(compound_names), len(file_names))).astype(float) for m in metrics} dfs['mz_ppm'] = dfs['peak_height'].copy() dfs['mz_ppm'] *= np.nan - - dfs['num_data_points'] = pd.DataFrame([[len(metatlas_dataset[i][j]['data']['eic']['intensity']) if metatlas_dataset[i][j]['data']['eic'] != None else 0 + dfs['num_data_points'] = pd.DataFrame([[len(ma_data.extract(metatlas_dataset, [i, j, 'data', 'eic', 'intensity'], default=[])) for i in range(len(metatlas_dataset))] for j in range(len(metatlas_dataset[0]))]) dfs['num_data_points'].index = dfs['mz_ppm'].index @@ -160,7 +141,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, avg_mz_measured = np.mean(avg_mz_measured) avg_rt_measured = np.mean(avg_rt_measured) - + delta_mz = abs(mz_theoretical - avg_mz_measured) delta_ppm = delta_mz / mz_theoretical * 1e6 @@ -172,7 +153,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, else: cid_label = cid.compound[0].name final_df.loc[compound_idx, 'label'] = cid_label - + overlapping_compounds = [] cpd_labels = [] inchi_key_map = {} @@ -181,7 +162,7 @@ def make_stats_table(input_fname = '', input_dataset = [], 
msms_hits_df = None, cpd_labels.append(metatlas_dataset[0][compound_iterator]['identification'].name) else: cpd_labels.append(metatlas_dataset[0][compound_iterator]['identification'].compound[0].name) - + if(len(cid.compound) != 0): #Loop through compounds to identify overlapping compounds for compound_iterator in range(len(compound_names)): @@ -237,7 +218,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'polarity'] = cid.mz_references[0].detected_polarity final_df.loc[compound_idx, 'exact_mass'] = cid.compound[0].mono_isotopic_molecular_weight final_df.loc[compound_idx, 'inchi_key'] = cid.compound[0].inchi_key - + if file_idxs != []: if len(mz_sample_matches) == 1: final_df.loc[compound_idx, 'msms_quality'] = 0 @@ -247,7 +228,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'msms_quality'] = "" else: final_df.loc[compound_idx, 'msms_quality'] = 0 - + if delta_ppm <= 5 or delta_mz <= 0.001: final_df.loc[compound_idx, 'mz_quality'] = 1 elif delta_ppm >= 5 and delta_ppm <= 10 and delta_mz > 0.001: @@ -256,7 +237,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'mz_quality'] = 0 else: final_df.loc[compound_idx, 'mz_quality'] = "" - + rt_error = abs(cid.rt_references[0].rt_peak - avg_rt_measured) if rt_error <= 0.5: final_df.loc[compound_idx, 'rt_quality'] = 1 @@ -266,7 +247,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'rt_quality'] = 0 else: final_df.loc[compound_idx, 'rt_quality'] = "" - + final_df.loc[compound_idx, 'total_score'] = "" final_df.loc[compound_idx, 'msi_level'] = "" final_df.loc[compound_idx, 'isomer_details'] = "" @@ -331,11 +312,15 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, passing['msms_score'] = (np.nan_to_num(dfs['msms_score'].values) >= min_msms_score).astype(float) passing['num_frag_matches'] = (np.nan_to_num(dfs['num_frag_matches'].values) >= min_num_frag_matches).astype(float) + prefix = f"{polarity}_" if polarity != '' else '' + output_sheetname = f"{prefix}{output_sheetname}" if not output_sheetname.endswith('.xlsx'): output_sheetname = output_sheetname + '.xlsx' - writer = pd.ExcelWriter(os.path.join(output_loc,output_sheetname), engine='xlsxwriter') + excel_path = os.path.join(output_loc, output_sheetname) + write_utils.check_existing_file(excel_path, overwrite) + writer = pd.ExcelWriter(excel_path, engine='xlsxwriter') final_df.to_excel(writer, sheet_name='Final_Identifications', index=False, startrow=3) - + #set format workbook = writer.book f_blue = workbook.add_format({'bg_color': '#DCFFFF'}) @@ -365,7 +350,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, worksheet.write(1,i, header, cell_format) HEADER3 = ['Unique for study','Some isomers are not chromatographically or spectrally resolvable.','Name of standard reference compound in library match.','compound with similar mz (abs difference <= 0.005) or monoisotopic molecular weight (abs difference <= 0.005) and RT (min or max within the RT-min-max-range of similar compound)','List of inchi keys that correspond to the compounds listed in the previous column','','','monoisotopic mass (neutral except for permanently charged molecules)','neutralized version','1 (MSMS matches ref. 
std.), 0.5 (possible match), 0 (no MSMS collected or no appropriate ref available), -1 (bad match)','1 (delta ppm <= 5 or delta mz <= 0.001), 0.5 (delta ppm 5-10 and delta mz > 0.001), 0 (delta ppm > 10) mz_quality','1 (delta RT <= 0.5), 0.5 (delta RT 0.5-2), 0 (delta RT > 2 min)','sum of m/z, RT and MSMS score','Level 1 = Two independent and orthogonal properties match authentic standard; else = putative [Metabolomics. 2007 Sep; 3(3): 211-221. doi: 10.1007/s11306-007-0082-2]','Isomers have same formula (and m/z) and similar RT - MSMS spectra may be used to differentiate (exceptions) or RT elution order','','','','','','','','','mean # of fragment ions matching between compound in sample and reference compound / standard; may include parent and isotope ions and very low intensity background ions (these do not contribute to score)','','MSMS score (highest across all samples), scale of 0 to 1 based on an algorithm. 0 = no match, 1 = perfect match. If no score, then no MSMS was acquired for that compound (@ m/z & RT window).','More than one may be detectable; the one evaluated is listed','theoretical m/z for a given compound / adduct pair','average m/z within 20ppm of theoretical detected across all samples @ RT peak','absolute difference between theoretical and detected m/z','ppm difference between theoretical and detected m/z','','','theoretical retention time for a compound based upon reference standard at highest intensity point of peak','average retention time for a detected compound at highest intensity point of peak across all samples','absolute difference between theoretical and detected RT peak']
-    
+
     for i, header in enumerate(HEADER3):
         worksheet.write(2,i, header, cell_format)
 
@@ -378,7 +363,8 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
     worksheet.conditional_format('AF1:AI'+str(len(final_df)+4),{ 'type':'no_errors', 'format':f_yellow})
     worksheet.conditional_format('AJ1:AJ'+str(len(final_df)+4),{ 'type':'no_errors', 'format':f_rose})
     writer.save()
-    
+    logger.info('Exported Draft Identifications spreadsheet to %s.', excel_path)
+
     #final_df.to_csv(os.path.join(output_loc, 'Draft_Final_Idenfications.tab'), sep='\t')
 
     for metric in metrics:
@@ -408,14 +394,12 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
     stats_table = pd.concat(stats_table, axis=1)
 
     if output_loc is not None:
-        if polarity == '':
-            readme_file = 'stats_table.readme'
-            stats_table.to_csv(os.path.join(output_loc, 'stats_tables', 'stats_table.tab'), sep='\t')
-        else:
-            readme_file = polarity+'_stats_table.readme'
-            stats_table.to_csv(os.path.join(output_loc, 'stats_tables', polarity+'_stats_table.tab'), sep='\t')
-
-        with open(os.path.join(output_loc, 'stats_tables/', readme_file), 'w') as readme:
+        stats_tables_dir = os.path.join(output_loc, f"{prefix}stats_tables")
+        stats_path = os.path.join(stats_tables_dir, f"{prefix}stats_table.tab")
+        write_utils.export_dataframe(stats_table, stats_path, 'stats table', overwrite, sep='\t')
+        readme_path = os.path.join(stats_tables_dir, f"{prefix}stats_table.readme")
+        write_utils.check_existing_file(readme_path, overwrite)
+        with open(readme_path, 'w') as readme:
             for var in ['dependencies', 'min_peak_height', 'rt_tolerance', 'ppm_tolerance', 'min_msms_score', 'min_num_frag_matches']:
                 readme.write('%s\n'%var)
                 try:
@@ -426,7 +410,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
             except TypeError:
                 pprint.pprint(eval(var), readme)
             readme.write('\n')
-    
+    logger.info('Exported stats table readme to %s.', readme_path)
     if return_all:
         return stats_table, dfs, passing
 
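Both dill2plots and fastanalysis now route file output through `metatlas.io.write_utils`, which is imported above but not included in this patch. Judging from the call sites, minimal versions of the two helpers could look like the sketch below (names and behavior inferred, not confirmed by this patch):

```python
import logging
import os

logger = logging.getLogger(__name__)


def check_existing_file(file_path, overwrite=False):
    """Raise FileExistsError if file_path exists and overwrite is False."""
    # Sketch only -- inferred from call sites such as
    # write_utils.check_existing_file(df_path, overwrite).
    if os.path.exists(file_path) and not overwrite:
        raise FileExistsError(f"Not overwriting {file_path}.")


def export_dataframe(dataframe, file_path, description, overwrite=False, **kwargs):
    """Write dataframe to file_path (kwargs pass through to to_csv) and log it."""
    # Sketch only -- inferred from call sites such as
    # write_utils.export_dataframe(stats_table, stats_path, 'stats table', overwrite, sep='\t').
    check_existing_file(file_path, overwrite)
    dir_name = os.path.dirname(file_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    dataframe.to_csv(file_path, **kwargs)
    logger.info('Exported %s to %s.', description, file_path)
```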
diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py
new file mode 100644
index 00000000..56bad65d
--- /dev/null
+++ b/metatlas/tools/notebook.py
@@ -0,0 +1,81 @@
+"""Jupyter notebook helper functions"""
+
+import logging
+import os
+import shutil
+import sys
+
+from pathlib import Path
+import pandas as pd
+from IPython.core.display import display, HTML
+from metatlas.tools.logging import activate_logging
+
+logger = logging.getLogger(__name__)
+
+
+def configure_environment(log_level):
+    """
+    Sets environment variables and configures logging
+    inputs:
+        log_level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
+    """
+    activate_logging(console_level=log_level)
+    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
+
+
+def validate_kernel():
+    """
+    Raise error if problem with kernel
+    When on NERSC, this will install the correct kernel if needed
+    """
+    allowed_exe = [
+        "/global/common/software/m2650/metatlas-targeted-20210521/bin/python",
+    ]
+    error_msg = "Invalid kernel setting in Jupyter Notebook."
+    on_nersc = "METATLAS_LOCAL" not in os.environ
+    if on_nersc and sys.executable not in allowed_exe:
+        install_kernel()
+        logger.critical('Please check that the kernel is set to "Metatlas Targeted".')
+        raise ValueError(error_msg)
+    try:
+        # pylint: disable=import-outside-toplevel,unused-import
+        import dataset  # noqa: F401
+    except ModuleNotFoundError as module_error:
+        logger.critical(
+            'Could not find dataset module. Please check that the kernel is set to "Metatlas Targeted".'
+        )
+        raise ModuleNotFoundError from module_error
+
+
+def install_kernel():
+    """
+    Copies kernel.json from repo to active location under home directory.
+    Only for use on NERSC!
+    """
+    logger.info('Installing kernel.json for "Metatlas Targeted".')
+    repo_path = Path(__file__).resolve().parent.parent.parent
+    source = repo_path / "notebooks" / "kernels" / "metatlas-targeted.kernel.json"
+    dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted"
+    os.makedirs(dest_dir, exist_ok=True)
+    shutil.copyfile(source, dest_dir / "kernel.json")
+
+
+def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100):
+    """Set pandas display options"""
+    pd.set_option("display.max_rows", max_rows)
+    pd.set_option("display.max_columns", max_columns)
+    pd.set_option("display.max_colwidth", max_colwidth)
+
+
+def configure_notebook_display():
+    """Configure output from Jupyter"""
+    # set notebook to have minimal side margins
+    display(HTML("<style>.container { width:100% !important; }</style>"))
+
+
+def setup(log_level):
+    """High level function to prepare the metatlas notebook"""
+    validate_kernel()
+    configure_environment(log_level)
+    configure_notebook_display()
+    configure_pandas_display()
diff --git a/notebooks/kernels/metatlas-targeted.kernel.json b/notebooks/kernels/metatlas-targeted.kernel.json
new file mode 100644
index 00000000..6126bcc4
--- /dev/null
+++ b/notebooks/kernels/metatlas-targeted.kernel.json
@@ -0,0 +1,11 @@
+{
+    "argv": [
+        "/global/common/software/m2650/metatlas-targeted-20210521/bin/python",
+        "-m",
+        "ipykernel_launcher",
+        "-f",
+        "{connection_file}"
+    ],
+    "display_name": "Metatlas Targeted",
+    "language": "python"
+}
diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb
index bacf1264..f61666c4 100644
--- a/notebooks/reference/Targeted.ipynb
+++ b/notebooks/reference/Targeted.ipynb
@@ -11,7 +11,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 1,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
+    "# 
pylint: disable=invalid-name,missing-module-docstring\n", + "\n", "# one of 'positive' or 'negative'\n", - "polarity = 'positive'\n", + "polarity = \"positive\"\n", "\n", "# one of 'ISTDsEtc' or 'FinalEMA-HILIC'\n", - "output_type = 'FinalEMA-HILIC'\n", + "output_type = \"FinalEMA-HILIC\"\n", "\n", "# an integer, increment if you need to redo your analysis\n", "# will be appended to your username to create analysis_id\n", @@ -31,25 +33,25 @@ "\n", "# experiment ID that must match the parent folder containing the LCMS output files\n", "# An example experiment ID is '20201116_JGI-AK_LH_506489_SoilWarm_final_QE-HF_HILICZ_USHXG01530'\n", - "experiment = 'REPLACE ME'\n", + "experiment = \"REPLACE ME\"\n", "\n", "# Exclude files with names containing any of the substrings in this list. Eg., ['peas', 'beans']\n", "exclude_files = []\n", "\n", "# Exclude groups with names containing any of the substrings in this list.\n", "# 'POS' or 'NEG' will be auto-appended later, so you shouldn't use them here.\n", - "exclude_groups = ['QC','InjBl']\n", + "exclude_groups = [\"QC\", \"InjBl\"]\n", "\n", "# thresholds for filtering out compounds with weak MS1 signals\n", - "num_points_passing = 5\n", - "peak_height_passing = 4e5\n", + "num_points = 5\n", + "peak_height = 4e5\n", "\n", "# include MSMS fragment ions in the output documents?\n", "export_msms_fragment_ions = False\n", "\n", "# list of substrings that will group together when creating groups\n", "# this provides additional grouping beyond the default grouping on field #12\n", - "groups_controlled_vocab = ['QC','InjBl','ISTD']\n", + "groups_controlled_vocab = [\"QC\", \"InjBl\", \"ISTD\"]\n", "\n", "# list of tuples contain string with color name and substring pattern.\n", "# Lines in the EIC plot will be colored by the first substring pattern\n", @@ -58,17 +60,15 @@ "# (first is front, last is back). Named colors available in matplotlib\n", "# are here: https://matplotlib.org/3.1.0/gallery/color/named_colors.html\n", "# or use hexadecimal values '#000000'. Lines default to black.\n", - "rt_adjuster_color_list = [('red','ExCtrl'), \n", - " ('green','TxCtrl'),\n", - " ('blue','InjBl')]\n", + "line_colors = [(\"red\", \"ExCtrl\"), (\"green\", \"TxCtrl\"), (\"blue\", \"InjBl\")]\n", "\n", "# The rest of this block contains project independent parameters\n", "\n", "# Full path to the directory where you have cloned the metatlas git repo.\n", - "# If you ran the 'git clone ...' command in your home directory on Cori, \n", + "# If you ran the 'git clone ...' command in your home directory on Cori,\n", "# then you'll want '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas'\n", "# where the uppercase letters are replaced based on your NERSC username.\n", - "metatlas_repo_path = '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas'\n", + "metatlas_repo_path = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas\"\n", "\n", "\n", "# Full path to the directory where you want this notebook to store data.\n", @@ -76,7 +76,7 @@ "# You can place this anywhere on cori's filesystem, but placing it within your\n", "# global home directory is recommended so that you do not need to worry about\n", "# your data being purged. 
Each project will take on the order of 100 MB.\n", - "project_directory = '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects'\n", + "project_directory = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects\"\n", "\n", "# maximum number of CPUs to use\n", "# when running on jupyter.nersc.gov, you are not allowed to set this above 4\n", @@ -84,7 +84,7 @@ "\n", "# Threshold for how much status information metatlas functions print in the notebook\n", "# levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'\n", - "log_level = 'INFO'" + "log_level = \"INFO\"" ] }, { @@ -100,205 +100,36 @@ "metadata": {}, "outputs": [], "source": [ - "%matplotlib notebook\n", - "\n", - "import sys, os\n", - "os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'\n", + "# pylint: disable=wrong-import-position,import-error\n", + "import logging # noqa: E402\n", + "import os # noqa: E402\n", + "import sys # noqa: E402\n", "\n", "sys.path.insert(0, metatlas_repo_path)\n", - "try:\n", - " import dataset\n", - "except ModuleNotFoundError:\n", - " print('Could not find dataset module. Please check that the kernel is set to \"metatlas py3\".')\n", - " raise ValueError('Invalid kernel setting in Jupyter Notebook.')\n", + "logger = logging.getLogger(\"metatlas.jupyter\")\n", + "logger.debug(\"sys.executable=%s\", sys.executable)\n", + "logger.debug(\"sys.path=%s.\", sys.path)\n", + "logger.debug(\"metatlas_repo_path=%s.\", metatlas_repo_path)\n", "if not os.path.exists(metatlas_repo_path):\n", - " print('Directory set for metatlas_repo_path parameter does not exists.')\n", - " raise ValueError('Invalid metatlas_repo_path parameter in Jupyter Notebook.')\n", + " logging.critical(\n", + " \"Directory set for metatlas_repo_path parameter (%s) does not exist or is not accessible.\",\n", + " metatlas_repo_path,\n", + " )\n", + " raise ValueError(\"Invalid metatlas_repo_path parameter in Jupyter Notebook.\")\n", "try:\n", - " from metatlas.tools import fastanalysis as fa\n", - "except ModuleNotFoundError:\n", - " print('Could not find metatlas module. 
In the Parameters block, please check the value of metatlas_repo_path.')\n", - " raise ValueError('Invalid metatlas_repo_path parameter in Jupyter Notebook.')\n", - "from metatlas.plots import dill2plots as dp\n", - "from metatlas.io import metatlas_get_data_helper_fun as ma_data\n", - "from metatlas.datastructures import metatlas_objects as metob\n", - "from metatlas.datastructures import metatlas_dataset as mads\n", - "from metatlas.tools.logging import activate_logging\n", - "\n", - "import getpass\n", - "import logging\n", - "import numpy as np\n", - "import pandas as pd\n", - "import pickle\n", - "import time\n", - "from functools import partial\n", - "from importlib import reload\n", - "from pathlib import Path\n", - "from IPython.core.display import display, HTML\n", - "\n", - "if polarity not in ['positive', 'negative']:\n", - " raise ValueError('Parameter polarity is not one of \"positive\" or \"negative\".')\n", - "\n", - "if output_type == 'ISTDsEtc':\n", - " pass\n", - "elif output_type == 'FinalEMA-HILIC':\n", - " num_data_points_passing = 5\n", - " peak_height_passing = 4e5\n", - "else:\n", - " raise ValueError('Parameter output_type is not one of \"ISTDsEtc\" or \"FinalEMA-HILIC\".')\n", - "\n", - "if experiment == 'Replace me':\n", - " raise ValueError('Parameter experiment has not been set.')\n", - "if len(experiment.split('_')) != 9:\n", - " raise ValueError('Parameter experiment does contain 9 fields when split on \"_\".')\n", - "\n", - "activate_logging(console_level=log_level)\n", - "logger = logging.getLogger('metatlas.jupyter')\n", - "\n", - "username = getpass.getuser()\n", - "analysis_id = f\"{username}{analysis_number}\"\n", - "output_dir = os.path.join(project_directory, experiment, analysis_id, output_type)\n", - "short_experiment_analysis_id = experiment.split('_')[0]+'_'+experiment.split('_')[3]+'_'+analysis_id\n", - "\n", - "os.makedirs(project_directory, exist_ok=True)\n", - "os.makedirs(output_dir, exist_ok=True)\n", - "\n", - "# set notebook to have minimal side margins\n", - "display(HTML(\"\"))\n", - "\n", - "pd.set_option('display.max_rows', 5000)\n", - "pd.set_option('display.max_columns', 500)\n", - "pd.set_option('display.max_colwidth', 100)\n", - "\n", - "logger.info(\"experiment=%s, analysis_id=%s, short_experiment_analysis_id=%s\", experiment, analysis_id, short_experiment_analysis_id)\n", - "logger.info(\"output_dir=%s\", output_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LCMS filenaming convention\n", - "\n", - "### You must assign your raw files into experimental groups for analysis. These are used for downstream statistics and for selection of specific groups for filtering to subsets of files for analysis (Ex. just pos or just neg).\n", - "\n", - "The groups are created from common file headers and the unique group names. The convention our lab group uses for filenames is as follows: \n", - "***\n", - "DATE_NORTHENLABINITIALS_COLLABINITIALS_PROJ_EXP_SAMPSET_SYSTEM_COLUMN-method_SERIAL_POL_ACQ_SAMPLENUMBER_ SAMPLEGROUP_REP_OPTIONAL_SEQ \n", - "\n", - "Ex.:20180105_SK_AD_ENIGMA_PseudoInt_R2ADec2017_QE119_50454_123456_POS_MSMS_001_Psyringae-R2A-30C-20hr_Rep01_NA_Seq001.raw\n", - "***\n", - "The common header consists of the fields 0-10: DATE_NORTHENLABINITIALS_COLLABINITIALS_PROJ_EXP_SAMPSET_SYSTEM_COLUMN-method_SERIAL_POL_ACQ \n", - "\n", - "The sample group name is commonly field # 12 (between underscore 11 and 12) -0 indexed-\n", - "# Find your files\n", - "1. 
On the first line of the block below, set the 'experiment' and 'name' variables to find your files. These fields require wildcards for partial string searches\n", - "2. 'Experiment' is the folder name within global/project/projectdirs/metatlas/raw_data, that will be emailed to you when the files are uploaded to NERSC. You can also look in the raw_data directory for the NERSC user who uploaded your files; your experiment folder should be in there.\n", - "3. 'name' is string that will match a subset of your files within that folder. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files = dp.get_metatlas_files(experiment = experiment,name = '%',most_recent = True)\n", - "df = metob.to_dataframe(files)\n", - "logger.info(\"Number of LCMS output files matching '%s' is: %d.\", experiment, len(files))\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Make Groups\n", - "This will attempt to create groups in an automated fashion (rather than filling out a spreadsheet with a list of files and group names). If your files are all in one folder at nersc, you can use this options. If not, use option B below.\n", - "\n", - "A long group name consisting of the common header + either controlled vocab value or field #12 along with a short group name (just controlled vocab or field #12) will be stored in a local variable. The short group names can be used on plots.\n", - "\n", - "\n", - "1. STEP 1: View the groups\n", - " 1. Pick an experiment folder to look for files in on the metob.retrieve function\n", - " 2. Enter controlled vocabulary for control files to put select files into groups when control string may be in a different field (not #12) or as a randomly placed substring within a field (ex. if 'InjBl' is included in your controlled vocab list, files like _InjBl-MeOH_ and _StartInjBl_ will group together)\n", - " 3. If your group name is not between _ 11 and 12 you can adjust those values in the split commands below. All other (non-controlledvocab) groups will be created from that field.\n", - "2. STEP 2: Create the groups variable after checking the output from STEP 1\n", - "3. STEP 3:
\n", - " Option A: If everything looks fine the group names and short names, Store groups once you know you have files in correct groups by running and checking the output of STEPS 1 and 2.
\n", - " Option B (optional): If you would like to edit the groups, uncomment the options B-I and B-II. Run Option B-I to export a prefilled tab infosheet. Edit the file and then run Option B-II to import the new groups and save it. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#STEP 1: View the groups\n", - "files = dp.get_metatlas_files(experiment = experiment,name = '%',most_recent = True)\n", - "file_dict = {}\n", - "groups_dict = {}\n", - "for f in files:\n", - " if not any(map(f.name.__contains__, exclude_files)):\n", - " k = f.name.split('.')[0]\n", - " # get index if any controlled vocab in filename\n", - " indices = [i for i, s in enumerate(groups_controlled_vocab) if s.lower() in k.lower()]\n", - " prefix = '_'.join(k.split('_')[:11])\n", - " if len(indices)>0:\n", - " short_name = groups_controlled_vocab[indices[0]].lstrip('_')\n", - " group_name = '%s_%s_%s'%(prefix,analysis_id,short_name)\n", - " short_name = k.split('_')[9]+'_'+short_name # Prepending POL to short_name\n", - " else:\n", - " short_name = k.split('_')[12]\n", - " group_name = '%s_%s_%s'%(prefix,analysis_id,short_name)\n", - " short_name = k.split('_')[9]+'_'+k.split('_')[12] # Prepending POL to short_name\n", - " file_dict[k] = {'file':f,'group':group_name,'short_name':short_name}\n", - " groups_dict[group_name] = {'items':[],'name':group_name,'short_name':short_name}\n", - "df = pd.DataFrame(file_dict).T\n", - "df.index.name = 'filename'\n", - "df.reset_index(inplace=True)#['group'].unique()\n", - "df.drop(columns=['file'],inplace=True)\n", - "for ug in groups_dict.keys():\n", - " for file_key,file_value in file_dict.items():\n", - " if file_value['group'] == ug:\n", - " groups_dict[ug]['items'].append(file_value['file'])\n", - "df.head(100)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#STEP 2: create the groups variable, if the above looks OK\n", - "groups = []\n", - "for group_key,group_values in groups_dict.items():\n", - " g = metob.Group(name=group_key,items=group_values['items'],short_name=group_values['short_name'])\n", - " groups.append(g) \n", - " for item in g.items:\n", - " print(g.name,g.short_name,item.name)\n", - " print('')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# STEP 3 Option A: store the groups variable content in the DB (currently only the long group name is stored)\n", - "metob.store(groups)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Make data frame of short filenames and samplenames\n", - "Uncomment the below 2 blocks to make short file names and smaple names.
\n", - "This creates a dataframe and a csv file which can be edited, exported and imported. " + " from metatlas.datastructures import metatlas_dataset as mads # noqa: E402\n", + " from metatlas.tools import notebook # noqa: E402\n", + "except ModuleNotFoundError as err:\n", + " logging.critical(\n", + " (\n", + " \"Could not find metatlas module at %s. \"\n", + " \"In the Parameters block, please check the value of metatlas_repo_path.\"\n", + " ),\n", + " metatlas_repo_path,\n", + " )\n", + " raise ModuleNotFoundError from err\n", + "%matplotlib widget\n", + "notebook.setup(log_level)" ] }, { @@ -307,33 +138,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Make short_filename and short_samplename \n", - "files = metob.retrieve('lcmsruns',experiment=experiment,username='*')\n", - "short_filename_delim_ids = [0,2,4,5,7,9,14]\n", - "short_samplename_delim_ids = [9,12,13,14]\n", - "short_names_df = pd.DataFrame(columns=['sample_treatment','short_filename','short_samplename'])\n", - "ctr = 0\n", - "for f in files:\n", - " short_filename = []\n", - " short_samplename = []\n", - " tokens = f.name.split('.')[0].split('_')\n", - " for id in short_filename_delim_ids:\n", - " short_filename.append(str(tokens[id]))\n", - " for id in short_samplename_delim_ids:\n", - " short_samplename.append(str(tokens[id]))\n", - " short_filename = \"_\".join(short_filename)\n", - " short_samplename = \"_\".join(short_samplename)\n", - " short_names_df.loc[ctr, 'full_filename'] = f.name.split('.')[0]\n", - " short_names_df.loc[ctr, 'sample_treatment'] = str(tokens[12]) # delim 12\n", - " short_names_df.loc[ctr, 'short_filename'] = short_filename\n", - " short_names_df.loc[ctr, 'short_samplename'] = short_samplename\n", - " short_names_df.loc[ctr, 'last_modified'] = pd.to_datetime(f.last_modified,unit='s')\n", - " ctr +=1\n", - "short_names_df.sort_values(by='last_modified', inplace=True)\n", - "short_names_df.drop(columns=['last_modified'], inplace=True)\n", - "short_names_df.drop_duplicates(subset=['full_filename'], keep='last', inplace=True)\n", - "short_names_df.set_index('full_filename', inplace=True)\n", - "short_names_df.to_csv(os.path.join(output_dir, 'short_names.csv'), sep=',', index=True)" + "ids = mads.AnalysisIdentifiers(experiment, output_type, polarity, analysis_number, project_directory)" ] }, { @@ -349,26 +154,6 @@ "### Typically, you will run one polarity at a time." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "exclude_groups.append('NEG' if polarity=='positive' else 'POS')\n", - "groups = dp.select_groups_for_analysis(name = experiment+'%',\n", - " most_recent = True,\n", - " remove_empty = True,\n", - " include_list = [], exclude_list = exclude_groups)\n", - "print(\"sorted groups\")\n", - "groups = sorted(groups, key=lambda x: x.name)\n", - "for i,a in enumerate(groups):\n", - " print(i, a.name)\n", - "metob.to_dataframe(groups)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -385,21 +170,10 @@ "metadata": {}, "outputs": [], "source": [ - "name_query = f\"%_{polarity[:3].upper()}_{short_experiment_analysis_id}%\"\n", - "atlases = metob.retrieve('atlases', name=f\"%_{polarity[:3].upper()}_{short_experiment_analysis_id}%\", username=username)\n", - "for i,a in enumerate(atlases):\n", - " print(i,a.name,pd.to_datetime(a.last_modified,unit='s'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "atlas_idx = 0\n", - "metatlas_dataset = mads.MetatlasDataset(atlases[atlas_idx], groups, max_cpus=max_cpus)\n", - "ma_data.make_data_sources_tables(groups, metatlas_dataset.atlas, output_dir) " + "metatlas_dataset = mads.MetatlasDataset(\n", + " ids, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files\n", + ")\n", + "metatlas_dataset.lcmsruns_dataframe[\"name\"].tolist()" ] }, { @@ -415,7 +189,8 @@ "metadata": {}, "outputs": [], "source": [ - "metatlas_dataset.filter_compounds_by_signal(num_points=num_points_passing, peak_height=peak_height_passing)" + "if metatlas_dataset.ids.output_type in [\"FinalEMA-HILIC\"]:\n", + " metatlas_dataset.filter_compounds_by_signal(num_points=num_points, peak_height=peak_height)" ] }, { @@ -426,9 +201,7 @@ }, "outputs": [], "source": [ - "a = dp.adjust_rt_for_selected_compound(metatlas_dataset, msms_hits=metatlas_dataset.hits,\n", - " color_me=rt_adjuster_color_list,\n", - " compound_idx=0, alpha=0.5, width=18, height=3)" + "agui = metatlas_dataset.annotation_gui(compound_idx=0, width=15, height=3, colors=line_colors)" ] }, { @@ -452,156 +225,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Export Atlas to a Spreadsheet\n", - "\n", - "The peak flags that you set and selected from the rt adjuster radio buttons will be saved in a column called id_notes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "export_atlas_filename = os.path.join(output_dir, f\"{polarity[:3].upper()}_{metatlas_dataset.atlas.name}_export\")\n", - "atlas_identifications = dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas, export_atlas_filename)\n", - "logger.info(\"Exported atlas to file: %s.\", export_atlas_filename)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Export MSMS match scores, stats sheets, and final identification table\n", - "\n", - "This block creates a number of files:\n", - "\n", - "1. compound_scores.csv\n", - "2. stats_table.tab\n", - "3. filtered and unfiltered peak heights, areas, msms scores, mz centroid, mz ppm error, num of fragment matches, rt delta, rt peak\n", - "4. final identification sheet that is formatted for use as a supplemental table for manuscript submission. You will need to manually complete some columns. 
Please discuss with Ben, Katherine, Daniel or Suzie before using for the first time.\n", - "\n", - "THe kwargs below will set the filtering points for the parameters indicated." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kwargs = {'min_intensity': 1e4, # strict = 1e5, loose = 1e3\n", - " 'rt_tolerance': .5, #>= shift of median RT across all files for given compound to reference\n", - " 'mz_tolerance': 20, # strict = 5, loose = 25; >= ppm of median mz across all files for given compound relative to reference\n", - " 'min_msms_score': .6, 'allow_no_msms': True, # strict = 0.6, loose = 0.3 <= highest compound dot-product score across all files for given compound relative to reference\n", - " 'min_num_frag_matches': 1, 'min_relative_frag_intensity': .001} # strict = 3 and 0.1, loose = 1, 0.01 number of matching mzs when calculating max_msms_score and ratio of second highest to first highest intensity of matching sample mzs\n", - "scores_df = fa.make_scores_df(metatlas_dataset, metatlas_dataset.hits)\n", - "scores_df['passing'] = fa.test_scores_df(scores_df, **kwargs)\n", - "\n", - "pass_atlas_df, fail_atlas_df, pass_dataset, fail_dataset = fa.filter_atlas_and_dataset(scores_df, metatlas_dataset.atlas_df, metatlas_dataset, column='passing')\n", - "\n", - "fa.make_stats_table(input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, output_loc=output_dir, min_peak_height=1e5, use_labels=True, min_msms_score=0.01, min_num_frag_matches=1, include_lcmsruns=[], exclude_lcmsruns=['QC'], polarity=polarity[:3].upper())\n", - "scores_df.to_csv(os.path.join(output_dir,'stats_tables', polarity[:3].upper()+'_compound_scores.csv'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Export EIC chromatograms as individual pdfs for each compound\n", - "\n", - "1. There are three options for formatting your EIC output using the \"group =\" line below:\n", - " 1. 'page' will print each sample group on a new page of a pdf file\n", - " 2. 'index' will label each group with a letter\n", - " 3. None will print all of the groups on one page with very small subplot labels\n", - "2. The Y axis scale can be shared across all files using share_y = True or set to the max within each file using share_y = False\n", - "3. To use short names for plots, short_names_df should be provided as input. Additionally the header column to be used for short names should be provided as follows (short_names_df=short_names_df, short_names_header='short_samplename'). Header options are sample_treatment, short_filename, short_samplename. These are optional parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "group = 'index' # 'page' or 'index' or None\n", - "save = True\n", - "share_y = True\n", - "\n", - "dp.make_chromatograms(input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=['InjBl','QC','Blank','blank'], group=group, share_y=share_y, save=save, output_loc=output_dir, short_names_df=short_names_df, short_names_header='short_samplename', polarity=polarity[:3].upper())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Export MSMS mirror plots as individual pdfs for each compound\n", - "\n", - "1. 
use_labels = True will use the compound names you provided in your atlas, if you set it to false, the compounds will be named with the first synonym available from pubchem which could be a common name, iupac name, cas number, vendor part number, etc. \n", - "2. The include and exclude lists will match partial strings in filenames, do not use wildcards.\n", - "3. If short_names_df is provided as input, short_samplename is used for plots." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "dp.make_identification_figure_v2(input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, use_labels=True, include_lcmsruns=[], exclude_lcmsruns=['InjBl', 'QC', 'Blank', 'blank'], output_loc=output_dir, short_names_df=short_names_df, polarity=polarity[:3].upper())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Sheets\n", - "1. To include short names in the output, short_names_df should be provided as input to make_output_dataframe. \n", - "2. ylabel is optional" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_dataframe = partial(dp.make_output_dataframe, input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], output_loc=os.path.join(output_dir,polarity[:3].upper()+'_data_sheets'), short_names_df=short_names_df, polarity=polarity[:3].upper(), use_labels=True)\n", - "peak_height = output_dataframe(fieldname='peak_height')\n", - "peak_area = output_dataframe(fieldname='peak_area')\n", - "mz_peak = output_dataframe(fieldname='mz_peak')\n", - "rt_peak = output_dataframe(fieldname='rt_peak')\n", - "mz_centroid = output_dataframe(fieldname='mz_centroid')\n", - "rt_centroid = output_dataframe(fieldname='rt_centroid')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Box plots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dp.make_boxplot_plots(rt_peak, output_loc=os.path.join(output_dir, polarity[:3].upper()+'_boxplot_rt_peak'), ylabel=\"RT Peak\")\n", - "dp.make_boxplot_plots(peak_height, output_loc=os.path.join(output_dir, polarity[:3].upper()+'_boxplot_peak_height'), ylabel=\"Peak Height\")\n", - "dp.make_boxplot_plots(mz_centroid, output_loc=os.path.join(output_dir, polarity[:3].upper()+'_boxplot_mz_centroid'), ylabel=\"MZ Centroid\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Export MSMS fragment Ions" + "### Export Outputs" ] }, { @@ -610,58 +234,7 @@ "metadata": {}, "outputs": [], "source": [ - "if export_msms_fragment_ions:\n", - " intensity_fraction = 0.01\n", - " min_mz = 450.0 #minimum m/z to export in msms\n", - " max_mz = -40.0 # distance from precurosor to export (0.5 is a good number. 
crazy people use negative numbers)\n", - " scale_intensity = True\n", - " data = []\n", - " for compound_index in range(len(metatlas_dataset[0])):\n", - " max_intensity = 0\n", - " d = {}\n", - " for file_index in range(len(metatlas_dataset)):\n", - " try:\n", - " pk_idx = metatlas_dataset[file_index][compound_index]['data']['msms']['data']['precursor_intensity'].argmax()\n", - " pk = metatlas_dataset[file_index][compound_index]['data']['msms']['data']['precursor_intensity'][pk_idx]\n", - " precursor_mz = metatlas_dataset[file_index][compound_index]['data']['msms']['data']['precursor_MZ'][pk_idx]\n", - " rt = metatlas_dataset[file_index][compound_index]['data']['msms']['data']['rt'][pk_idx]\n", - " if (pk>max_intensity) & (rt>metatlas_dataset[file_index][compound_index]['identification'].rt_references[-1].rt_min) & (rt0:\n", - " msms = metatlas_dataset[good_index][compound_index]['data']['msms']['data']\n", - " idx = np.argwhere(msms['precursor_intensity']==max_intensity).flatten()\n", - " mz = msms['mz'][idx]\n", - " intensity = msms['i'][idx]\n", - " max_msms_intensity = intensity.max()\n", - " cutoff = intensity_fraction * max_msms_intensity\n", - " conditions = (intensity>cutoff) & (mz>min_mz) & (mz<(final_mz+max_mz))\n", - " if sum(conditions)>0:\n", - " keep_idx = np.argwhere(conditions).flatten()\n", - " mz = str(['%.2f'%x for x in list(mz[keep_idx])]).replace('\\'','')\n", - " if scale_intensity==True:\n", - " intensity = intensity / intensity.max()\n", - " intensity = intensity * 1e5\n", - " intensity = intensity.astype(int)\n", - " intensity = str(['%d'%x for x in list(intensity[keep_idx])]).replace('\\'','')\n", - " spectra = str([mz,intensity]).replace('\\'','')\n", - " else:\n", - " mz = None\n", - " intensity = None\n", - " spectra = None\n", - " else:\n", - " mz = None\n", - " intensity = None\n", - " spectra = None\n", - " data.append({'name':metatlas_dataset[file_index][compound_index]['identification'].name,'spectrum':spectra,'mz':mz,'intensity':intensity})\n", - " data = pd.DataFrame(data)\n", - " data[['name','mz','intensity']].to_csv(os.path.join(output_dir,'spectra_1pct_450cut.csv'),index=None)\n", - " # to look at it type this:\n", - " data.head(20)" + "metatlas_dataset.generate_all_outputs()" ] } ], diff --git a/notebooks/reference/Workflow_Notebook_Metatlas_Stable_v0.1.0_20210303.ipynb b/notebooks/reference/Workflow_Notebook_Metatlas_Stable_v0.1.0_20210303.ipynb index 341387bb..09e02e6a 100644 --- a/notebooks/reference/Workflow_Notebook_Metatlas_Stable_v0.1.0_20210303.ipynb +++ b/notebooks/reference/Workflow_Notebook_Metatlas_Stable_v0.1.0_20210303.ipynb @@ -1162,21 +1162,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "mass spec cori", + "display_name": "Python 3", "language": "python", - "name": "mass_spec_cori" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.14" + "pygments_lexer": "ipython3", + "version": "3.9.4" } }, "nbformat": 4, diff --git a/noxfile.py b/noxfile.py index 0e4d2415..080e95de 100644 --- a/noxfile.py +++ b/noxfile.py @@ -2,156 +2,182 @@ import nox -py_versions = ['3.8', '3.9'] +py_versions = ["3.8", "3.9"] nox.options.sessions = [ - "flake8_diff", - "flake8", - "black", - "pylint-3.8", - "unit_tests-3.8", - "system_tests-3.8", - "update_git_hooks", - ] + "flake8_diff", + "flake8", + "black", + 
"pylint-3.8", + "unit_tests-3.8", + "flake8_nb", + "black_nb", + "pylint_nb-3.8", + "system_tests-3.8", + "update_git_hooks", +] # files we can run all the checks on, as they don't contain legacy code that # has not yet been updated to pass all checks. more_checks = [ - 'metatlas/datastructures/metatlas_dataset.py', - 'metatlas/tools/logging.py', - 'tests' - ] - + "metatlas/io/targeted_output.py", + "metatlas/io/write_utils.py", + "metatlas/datastructures/metatlas_dataset.py", + "metatlas/tools/logging.py", + "metatlas/tools/notebook.py", + "tests", +] + +# notebooks we can run all the checks on, as they don't contain legacy code that +# has not yet been updated to pass all checks. notebooks = [ - 'notebooks/reference/Targeted.ipynb', - ] + "notebooks/reference/Targeted.ipynb", +] pytest_deps = [ - 'attrs==21.2.0', - 'coverage==5.5', - 'iniconfig==1.1.1', - 'packaging==20.9', - 'pluggy==0.13.1', - 'py==1.10.0', - 'pyparsing==2.4.7', - 'pytest==6.2.4', - 'pytest-cov==2.11.1', - 'pytest-mock==3.6.1', - 'toml==0.10.2', - ] + "attrs==21.2.0", + "coverage==5.5", + "iniconfig==1.1.1", + "packaging==20.9", + "pluggy==0.13.1", + "py==1.10.0", + "pyparsing==2.4.7", + "pytest==6.2.4", + "pytest-cov==2.11.1", + "pytest-mock==3.6.1", + "toml==0.10.2", +] pylint_deps = [ - 'pylint==2.8.2', - ] + "pylint==2.8.2", + "pytest==6.2.4", # so "import pytest" doesn't get reported +] nbqa_deps = [ - 'nbqa==0.8.1', - 'tokenize-rt==4.1.0', - 'importlib-metadata==4.0.1', - 'astroid==2.5.6', - 'wrapt==1.12.1', - 'lazy_object_proxy==1.6.0', - 'isort==5.8.0', - ] + "nbqa==0.8.1", + "tokenize-rt==4.1.0", + "importlib-metadata==4.0.1", + "astroid==2.5.6", + "wrapt==1.12.1", + "lazy_object_proxy==1.6.0", + "isort==5.8.0", +] flake8_deps = [ - 'flake8==3.9.2', - 'flake8-bugbear==21.4.3', - 'flake8-builtins==1.5.3', - 'flake8-comprehensions==3.5.0', - ] + "flake8==3.9.2", + "flake8-bugbear==21.4.3", + "flake8-builtins==1.5.3", + "flake8-comprehensions==3.5.0", +] nox.options.error_on_external_run = True +REUSE_LARGE_VENV = True @nox.session(python=py_versions[0]) def flake8_diff(session): session.install(*flake8_deps) - session.run('sh', '-c', 'git diff -U0 -w --staged HEAD | flake8 --diff', external=True) + session.run("sh", "-c", "git diff -U0 -w --staged HEAD | flake8 --diff", external=True) @nox.session(python=py_versions[0]) def flake8_all(session): session.install(*flake8_deps) - session.run('flake8', 'metatlas', 'tests') + session.run("flake8", "metatlas", "tests") @nox.session(python=py_versions[0]) def flake8(session): session.install(*flake8_deps) - session.run('flake8', *more_checks) + session.run("flake8", *more_checks) @nox.session(python=py_versions[0]) def black_all(session): - session.install('black') - session.run('black', '--check', '--diff', '--color', 'metatlas', 'tests') + session.install("black") + session.run("black", "--check", "--diff", "--color", "metatlas", "tests") @nox.session(python=py_versions[0]) def black(session): - session.install('black') - session.run('black', '--check', '--diff', '--color', *more_checks) + session.install("black") + session.run("black", "--check", "--diff", "--color", *more_checks) @nox.session(python=py_versions[0]) def blacken(session): """this modifies the files to meet black's requirements""" - session.install('black') - session.run('black', *more_checks) + session.install("black") + session.run("black", *more_checks) -@nox.session(python=py_versions, reuse_venv=True) +@nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV) def pylint(session): - 
session.install('-r', 'docker/requirements.txt', *pylint_deps) - session.run('pylint', *more_checks) + session.install("-r", "docker/requirements.txt", *pylint_deps) + session.run("pylint", *more_checks) -@nox.session(python=py_versions, reuse_venv=True) +@nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV) def pylint_nb(session): - session.install('-r', 'docker/requirements.txt', *nbqa_deps, *pylint_deps) - session.run('nbqa', 'pylint', *notebooks) + session.install("-r", "docker/requirements.txt", *nbqa_deps, *pylint_deps) + session.run("nbqa", "pylint", *notebooks) @nox.session(python=py_versions[0]) def flake8_nb(session): session.install(*nbqa_deps, *flake8_deps) - session.run('nbqa', 'flake8', *notebooks) + session.run("nbqa", "flake8", *notebooks) @nox.session(python=py_versions[0]) def black_nb(session): - session.install('black', *nbqa_deps) - session.run('nbqa', 'black', '--check', *notebooks) + session.install("black", *nbqa_deps) + session.run("nbqa", "black", "--check", *notebooks) @nox.session(python=py_versions[0]) def blacken_nb(session): """this modifies notebook files to meet black's requirements""" - session.install('black', *nbqa_deps) - session.run('nbqa', 'black', '--nbqa-mutate', *notebooks) + session.install("black", *nbqa_deps) + session.run("nbqa", "black", "--nbqa-mutate", *notebooks) -@nox.session(python=py_versions, reuse_venv=True) +@nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV) def unit_tests(session): - session.install('-r', 'docker/requirements.txt', *pytest_deps) - session.run('pytest', '-vv', *session.posargs, '--cov', 'metatlas', 'tests/unit/', - env={'METATLAS_LOCAL': 'TRUE'}) + session.install("-r", "docker/requirements.txt", *pytest_deps) + session.run( + "pytest", "-vv", *session.posargs, "--cov", "metatlas", "tests/unit/", env={"METATLAS_LOCAL": "TRUE"} + ) + + +@nox.session(python=py_versions[0], reuse_venv=REUSE_LARGE_VENV) +def cov_report(session): + session.install("-r", "docker/requirements.txt", *pytest_deps) + session.run( + "pytest", + *session.posargs, + "--cov", + "metatlas", + "--cov-report", + "term-missing", + "tests/unit/", + env={"METATLAS_LOCAL": "TRUE"} + ) @nox.session(python=py_versions[0]) def system_tests(session): session.install(*pytest_deps) - session.run('pytest', '-vv', *session.posargs, 'tests/system/') + session.run("pytest", "-vv", *session.posargs, "tests/system/") @nox.session(python=py_versions[0]) def install_git_hooks(session): - session.install('pre-commit') - session.run('pre-commit', 'install') + session.install("pre-commit") + session.run("pre-commit", "install") @nox.session(python=py_versions[0]) def update_git_hooks(session): - session.install('pre-commit') - session.run('pre-commit', 'autoupdate') + session.install("pre-commit") + session.run("pre-commit", "autoupdate") diff --git a/tests/fixtures/analysis_identifiers_fixtures.py b/tests/fixtures/analysis_identifiers_fixtures.py new file mode 100644 index 00000000..90d305f2 --- /dev/null +++ b/tests/fixtures/analysis_identifiers_fixtures.py @@ -0,0 +1,18 @@ +# pylint: disable=missing-function-docstring, missing-module-docstring + +import pytest + +from metatlas.datastructures import metatlas_dataset as mads + + +@pytest.fixture(name="analysis_ids") +def fixture_analysis_ids(tmp_path): + return mads.AnalysisIdentifiers( + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "FinalEMA-HILIC", + "positive", + 0, + str(tmp_path), + atlas="HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + username="root", 
+ ) diff --git a/tests/fixtures/metatlas_dataset_fixtures.py b/tests/fixtures/metatlas_dataset_fixtures.py index b8e5c5b6..f97bd5ff 100644 --- a/tests/fixtures/metatlas_dataset_fixtures.py +++ b/tests/fixtures/metatlas_dataset_fixtures.py @@ -34,20 +34,29 @@ def fixture_msms(): } +@pytest.fixture(name="groups_controlled_vocab") +def fixture_groups_controlled_vocab(): + return ["QC", "InjBl", "ISTD"] + + @pytest.fixture(name="metatlas_dataset") -def fixture_metatlas_dataset(mocker, df_container, atlas, group): +def fixture_metatlas_dataset(mocker, df_container, analysis_ids, groups_controlled_vocab, atlas, lcmsrun): mocker.patch( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) - return mads.MetatlasDataset(atlas, [group]) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) + return mads.MetatlasDataset(analysis_ids, atlas, groups_controlled_vocab) @pytest.fixture(name="metatlas_dataset_with_2_cids") -def fixture_metatlas_dataset_with_2_cids(mocker, df_container, atlas_with_2_cids, group): +def fixture_metatlas_dataset_with_2_cids( + mocker, df_container, analysis_ids, groups_controlled_vocab, atlas_with_2_cids, lcmsrun +): mocker.patch( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) - return mads.MetatlasDataset(atlas_with_2_cids, [group]) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) + return mads.MetatlasDataset(analysis_ids, atlas_with_2_cids, groups_controlled_vocab) @pytest.fixture(name="eic") @@ -321,5 +330,5 @@ def fixture_eic(): @pytest.fixture(name="atlas_df") -def fixture_atlas_df(atlas, group): - return mads.MetatlasDataset(atlas, [group]).atlas_df +def fixture_atlas_df(metatlas_dataset): + return metatlas_dataset.atlas_df diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index e8e84344..328decf4 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -35,20 +35,20 @@ def test_targeted_by_line01_with_remove(tmp_path): """\ jq -M '(.cells[] | select(.source[] | contains("compound_idx=0")).source) \ += ["\\n", \ - "a.compound_idx = 0\\n", \ - "a.set_msms_flag(\\"1, co-isolated precursor but all reference ions are in sample spectrum\\")\\n", \ - "a.data.set_rt(0, \\"rt_min\\", 2.1245)\\n", \ - "a.data.set_rt(0, \\"rt_max\\", 2.4439)\\n", \ - "a.compound_idx = 1\\n", \ - "a.set_peak_flag(\\"remove\\")\\n", \ - "a.compound_idx = 2\\n", \ - "a.set_msms_flag(\\"1, perfect match to internal reference library\\")\\n", \ - "a.data.set_rt(2, \\"rt_min\\", 2.4361)\\n", \ - "a.data.set_rt(2, \\"rt_max\\", 2.8608)\\n", \ - "a.compound_idx = 3\\n", \ - "a.set_msms_flag(\\"1, perfect match to internal reference library\\")\\n", \ - "a.data.set_rt(3, \\"rt_min\\", 2.8428)\\n", \ - "a.data.set_rt(3, \\"rt_max\\", 3.3081)\\n" \ + "agui.compound_idx = 0\\n", \ + "agui.set_msms_flag(\\"1, co-isolated precursor but all reference ions are in sample spectrum\\")\\n", \ + "agui.data.set_rt(0, \\"rt_min\\", 2.1245)\\n", \ + "agui.data.set_rt(0, \\"rt_max\\", 2.4439)\\n", \ + "agui.compound_idx = 1\\n", \ + "agui.set_peak_flag(\\"remove\\")\\n", \ + "agui.compound_idx = 2\\n", \ + "agui.set_msms_flag(\\"1, perfect match to internal reference library\\")\\n", \ + "agui.data.set_rt(2, \\"rt_min\\", 2.4361)\\n", \ + "agui.data.set_rt(2, \\"rt_max\\", 2.8608)\\n", \ + "agui.compound_idx = 3\\n", \ + "agui.set_msms_flag(\\"1, perfect match to internal reference 
library\\")\\n", \ + "agui.data.set_rt(3, \\"rt_min\\", 2.8428)\\n", \ + "agui.data.set_rt(3, \\"rt_max\\", 3.3081)\\n" \ ]' /src/notebooks/reference/Targeted.ipynb > /out/Remove.ipynb && \ papermill \ -p experiment 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 \ @@ -61,6 +61,10 @@ def test_targeted_by_line01_with_remove(tmp_path): ], check=True, ) + num_files_created = int( + subprocess.check_output(f"find {str(tmp_path)} -type f | wc -l", shell=True, text=True).strip() + ) + assert num_files_created == 38 with open(out_file, "r") as handle: for num, line in enumerate(handle.readlines()): clean_line = line.rstrip("\n") diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 3b7b286e..643cadd9 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -2,10 +2,13 @@ # pylint: disable=missing-function-docstring, protected-access import datetime +import glob +import logging import pandas as pd import pytest from metatlas.datastructures import metatlas_dataset as mads from metatlas.datastructures import metatlas_objects as metob +from metatlas.io import metatlas_get_data_helper_fun as ma_data def test_metatlas_dataset_build01(metatlas_dataset): @@ -269,13 +272,6 @@ def test_groups01(metatlas_dataset): assert metatlas_dataset.groups[0].short_name == "POS_Cone-S1" -def test_set_groups01(metatlas_dataset): - metatlas_dataset.data # pylint: disable=pointless-statement - metatlas_dataset.groups = None - assert not metatlas_dataset._data_valid - assert metatlas_dataset.groups is None - - def test_set_extra_mz_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) metatlas_dataset.data # pylint: disable=pointless-statement @@ -323,3 +319,195 @@ def test_store_atlas01(metatlas_dataset, sqlite): # pylint: disable=unused-argu metatlas_dataset.store_atlas(even_if_exists=True) with pytest.raises(ValueError): metatlas_dataset.store_atlas() + + +def test_analysis_identifiers01(): + with pytest.raises(ValueError): + mads.AnalysisIdentifiers( + "experiment_not_valid", + "output_type_not_valid", + "polarity_not_valid", + "analysis_number_not_valid", + "/foo/bar", + ) + + +def test_analysis_identifiers02(): + with pytest.raises(ValueError): + mads.AnalysisIdentifiers( + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "output_type_not_valid", + "polarity_not_valid", + "analysis_number_not_valid", + "/foo/bar", + ) + + +def test_analysis_identifiers03(): + with pytest.raises(ValueError): + mads.AnalysisIdentifiers( + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "FinalEMA-HILIC", + "polarity_not_valid", + "analysis_number_not_valid", + "/foo/bar", + ) + + +def test_analysis_identifiers04(): + with pytest.raises(TypeError): + mads.AnalysisIdentifiers( + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "FinalEMA-HILIC", + "positive", + "analysis_number_not_valid", + "/foo/bar", + ) + + +def test_analysis_identifiers05(): + with pytest.raises(TypeError): + mads.AnalysisIdentifiers( + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "FinalEMA-HILIC", + "positive", + "1", + "/foo/bar", + ) + + +def test_analysis_identifiers06(): + with pytest.raises(ValueError): + mads.AnalysisIdentifiers( + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "FinalEMA-HILIC", + "positive", + -9, + "/foo/bar", + ) + + +def 
test_analysis_identifiers_atlas01(analysis_ids): + assert analysis_ids.atlas == "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0" + + +def test_analysis_identifiers_atlas02(analysis_ids): + # call .atlas twice to get cached value + analysis_ids.atlas # pylint: disable=pointless-statement + assert analysis_ids.atlas == "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0" + + +def test_write_data_source_files01(metatlas_dataset, mocker, caplog): + mocker.patch("glob.glob", return_value=range(10)) + metatlas_dataset.write_data_source_files() + assert "Data sources directory already populated" in caplog.text + + +def test_write_data_source_files02(metatlas_dataset, mocker, caplog): + mocker.patch("glob.glob", return_value=range(3)) + mocker.patch("shutil.rmtree") + mocker.patch("metatlas.io.metatlas_get_data_helper_fun.make_data_sources_tables") + caplog.set_level(logging.INFO) + metatlas_dataset.write_data_source_files() + assert "Writing data source files to" in caplog.text + assert ma_data.make_data_sources_tables.called # pylint: disable=no-member + + +def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[0]) + mocker.patch("glob.glob", return_value=range(10)) + metatlas_dataset = mads.MetatlasDataset(analysis_ids) + assert metatlas_dataset.atlas == 0 + + +def test_get_atlas02(mocker, analysis_ids, caplog): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[]) + caplog.set_level(logging.INFO) + with pytest.raises(ValueError): + mads.MetatlasDataset(analysis_ids) + assert "Database does not contain an atlas named" in caplog.text + + +def test_get_atlas03(mocker, analysis_ids, caplog): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[0, 0]) + with pytest.raises(ValueError): + mads.MetatlasDataset(analysis_ids) + assert "Database contains more than one atlas named" in caplog.text + + +def test_existing_groups(mocker, metatlas_dataset): + """This test has little value, but is needed for coverage""" + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[]) + assert metatlas_dataset.existing_groups == [] + + +def test_lcmsruns_dataframe(metatlas_dataset): + assert metatlas_dataset.lcmsruns_dataframe.shape == (1, 15) + + +def test_store_groups01(metatlas_dataset, mocker): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[]) + mocker.patch("metatlas.datastructures.metatlas_objects.store") + metatlas_dataset.store_groups() + assert metob.store.called # pylint: disable=no-member + + +def test_store_groups02(metatlas_dataset, mocker): + def group(): + pass + + group.name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S1" + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[group]) + with pytest.raises(ValueError): + metatlas_dataset.store_groups() + + +def test_annotation_gui01(metatlas_dataset, mocker): + mocker.patch( + "metatlas.plots.dill2plots.get_msms_hits", + return_value=pd.DataFrame( + {"score": [], "inchi_key": [], "measured_precursor_mz": []}, + index=pd.MultiIndex.from_tuples([], names=["msms_scan"]), + ), + ) + agui = metatlas_dataset.annotation_gui() + agui.compound_idx 
= 0 + agui.set_msms_flag("1, co-isolated precursor but all reference ions are in sample spectrum") + agui.set_peak_flag("remove") + agui.data.set_rt(0, "rt_min", 2.1245) + agui.data.set_rt(0, "rt_max", 2.4439) + assert metatlas_dataset.rts[0].rt_min == 2.1245 + assert metatlas_dataset.rts[0].rt_max == 2.4439 + assert metatlas_dataset.data[0][0]["identification"].ms1_notes == "remove" + assert ( + metatlas_dataset.data[0][0]["identification"].ms2_notes + == "1, co-isolated precursor but all reference ions are in sample spectrum" + ) + + +def test_generate_all_outputs01(metatlas_dataset, mocker): + mocker.patch( + "metatlas.plots.dill2plots.get_msms_hits", + return_value=pd.DataFrame( + { + "score": [], + "inchi_key": [], + "measured_precursor_mz": [], + "precursor_mz": [], + "file_name": [], + "msv_query_aligned": [], + "msv_ref_aligned": [], + }, + index=pd.MultiIndex.from_tuples([], names=["msms_scan"]), + ), + ) + metatlas_dataset.generate_all_outputs() + assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*")) == 12 + assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*/*")) == 23 + + +# 49, 51, 53, 55, 57, 79-80, 174, 198-214, 233-234, 576, 581, 672-682, 686-688, 691, 702-708 diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index 22178377..a2475d86 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -53,3 +53,61 @@ def test_set_nested_term_dict(metatlas_dataset): def test_set_nested_term_no_ids(metatlas_dataset): with pytest.raises(ValueError): gdhf.set_nested(metatlas_dataset, [], "foobar") + + +def test_extract_list01(): + assert gdhf.extract([1, 2], [0]) == 1 + + +def test_extract_list02(): + assert gdhf.extract([1, 2], [5]) is None + + +def test_extract_list03(): + assert gdhf.extract([1, 2], [5], 99) == 99 + + +def test_extract_list04(): + assert gdhf.extract([], [0], 99) == 99 + + +def test_extract_list05(): + assert gdhf.extract("okay", [], 99) == "okay" + + +def test_extract_tuple01(): + assert gdhf.extract(("foo", "bar"), [1]) == "bar" + + +def test_extract_tuple02(): + assert gdhf.extract(("foo", "bar"), [-1]) == "bar" + + +def test_extract_tuple03(): + assert gdhf.extract(("foo", "bar"), [9], "zoop") == "zoop" + + +def test_extract_dict01(): + assert gdhf.extract({"foo": 1, "bar": 2}, ["bar"]) == 2 + + +def test_extract_dict02(): + assert gdhf.extract({"foo": 1, "bar": 2}, ["blah"], "zoop") == "zoop" + + +def test_extract_default01(): + assert gdhf.extract({"foo": 1, "bar": 2}, ["foo", "bar"], "zoop") == "zoop" + + +def test_extract_metatlas_dataset01(metatlas_dataset): + ids = [0, 0, "identification", ("compound",), 0, ("inchi_key",)] + assert gdhf.extract(metatlas_dataset, ids, "zoop") == "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" + + +def test_extract_metatlas_dataset02(metatlas_dataset): + ids = [0, 0, "identification", "compound", 0, "inchi_key"] + assert gdhf.extract(metatlas_dataset, ids, "zoop") == "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" + + +def test_extract_metatlas_dataset03(metatlas_dataset): + assert gdhf.extract(metatlas_dataset, ["foo"], "zoop") == "zoop" diff --git a/tests/unit/test_targeted_output.py b/tests/unit/test_targeted_output.py new file mode 100644 index 00000000..2e84743c --- /dev/null +++ b/tests/unit/test_targeted_output.py @@ -0,0 +1,9 @@ +""" unit testing of targeted_output functions """ +# pylint: disable=missing-function-docstring + +from metatlas.io import targeted_output + + +def 
test_write_msms_fragment_ions01(metatlas_dataset): + out = targeted_output.write_msms_fragment_ions(metatlas_dataset, min_mz=100, max_mz_offset=0.5) + assert out.loc[0, "spectrum"] == "[[252.11, 252.16], [100000, 7912]]" diff --git a/tests/unit/test_write_utils.py b/tests/unit/test_write_utils.py new file mode 100644 index 00000000..a7521241 --- /dev/null +++ b/tests/unit/test_write_utils.py @@ -0,0 +1,86 @@ +""" unit testing of write_utils functions """ +# pylint: disable=missing-function-docstring +import os +import pytest + +import pandas +from metatlas.io import write_utils + + +def test_make_dir_for01(mocker): + mocker.patch("os.makedirs") + write_utils.make_dir_for("foo/bar") + os.makedirs.assert_called_with("foo", exist_ok=True) # pylint: disable=no-member + + +def test_make_dir_for02(mocker): + mocker.patch("os.makedirs") + write_utils.make_dir_for("bar") + assert not os.makedirs.called # pylint: disable=no-member + + +def test_check_existing_file01(mocker): + mocker.patch("os.path.exists", return_value=True) + with pytest.raises(FileExistsError): + write_utils.check_existing_file("exists_file.txt") + + +def test_check_existing_file02(mocker): + mocker.patch("os.path.exists", return_value=False) + write_utils.check_existing_file("does_not_exist_file.txt") + # Should not raise an error. No assert needed. + + +def test_export_dataframe01(mocker): + mocker.patch("pandas.DataFrame.to_csv") + dataframe = pandas.DataFrame({1: [10], 2: [20]}) + write_utils.export_dataframe(dataframe, "foo/bar", "test") + assert pandas.DataFrame.to_csv.called # pylint: disable=no-member + + +def test_raise_on_diff01(mocker): + mocker.patch("os.path.exists", return_value=False) + dataframe = pandas.DataFrame({1: [10], 2: [20]}) + write_utils.raise_on_diff(dataframe, "foo/bar", "test") + # Should not raise an error. No assert needed. + + +def test_raise_on_diff02(mocker): + mocker.patch("os.path.exists", return_value=True) + dataframe = pandas.DataFrame({1: [10], 2: [20]}) + mocker.patch("pandas.read_csv", return_value=dataframe) + write_utils.raise_on_diff(dataframe, "foo/bar", "test") + # Should not raise an error. No assert needed. + + +def test_raise_on_diff03(mocker): + mocker.patch("os.path.exists", return_value=True) + existing = pandas.DataFrame({1: [10], 2: [20]}) + mocker.patch("pandas.read_csv", return_value=existing) + to_write = pandas.DataFrame({1: [10], 2: [99]}) + with pytest.raises(ValueError): + write_utils.raise_on_diff(to_write, "foo/bar", "test") + + +def test_export_dataframe_die_on_diff01(mocker): + mocker.patch("os.path.exists", return_value=False) + dataframe = pandas.DataFrame({1: [10], 2: [20]}) + write_utils.export_dataframe_die_on_diff(dataframe, "foo/bar", "test") + # Should not raise an error. No assert needed. + + +def test_export_dataframe_die_on_diff02(mocker): + mocker.patch("os.path.exists", return_value=True) + dataframe = pandas.DataFrame({1: [10], 2: [20]}) + mocker.patch("pandas.read_csv", return_value=dataframe) + write_utils.export_dataframe_die_on_diff(dataframe, "foo/bar", "test") + # Should not raise an error. No assert needed. 
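The write-or-verify pattern these tests pin down is easiest to see in use. A minimal sketch, with call signatures taken from the test invocations above; the CSV path is hypothetical, and the behavior noted in the comments follows the test expectations rather than the (unshown) implementation:

    import pandas
    from metatlas.io import write_utils

    dataframe = pandas.DataFrame({"name": ["adenine"], "peak_height": [1.0e5]})
    # saves via DataFrame.to_csv under the hood (per the mocked test above)
    write_utils.export_dataframe(dataframe, "out/peak_height.csv", "peak height data")
    # identical content on a second export passes silently; differing content raises ValueError
    write_utils.export_dataframe_die_on_diff(dataframe, "out/peak_height.csv", "peak height data")
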
+ + +def test_export_dataframe_die_on_diff03(mocker): + mocker.patch("os.path.exists", return_value=True) + existing = pandas.DataFrame({1: [10], 2: [20]}) + mocker.patch("pandas.read_csv", return_value=existing) + to_write = pandas.DataFrame({1: [10], 2: [99]}) + with pytest.raises(ValueError): + write_utils.export_dataframe_die_on_diff(to_write, "foo/bar", "test") From 9c48aa7a366a4b37ef5991bcb487525531a60593 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 2 Jun 2021 20:28:47 -0700 Subject: [PATCH 009/177] Tell user to reload page on kernel install --- 1 | 82 ++++++++++++++++++++++++++++++++++++++ metatlas/tools/notebook.py | 1 + 2 files changed, 83 insertions(+) create mode 100644 1 diff --git a/1 b/1 new file mode 100644 index 00000000..de6d57b8 --- /dev/null +++ b/1 @@ -0,0 +1,82 @@ +"""Jupyter notebook helper functions""" + +import logging +import os +import shutil +import sys + +from pathlib import Path +import pandas as pd +from IPython.core.display import display, HTML +from metatlas.tools.logging import activate_logging + +logger = logging.getLogger(__name__) + + +def configure_environment(log_level): + """ + Sets environment variables and configures logging + inputs: + log_level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' + """ + activate_logging(console_level=log_level) + os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" + + +def validate_kernel(): + """ + Raise error if problem with kernel + When on NERSC, this will install the correct kernel if needed + """ + allowed_exe = [ + "/global/common/software/m2650/metatlas-targeted-20210521/bin/python", + ] + error_msg = "Invalid kernel setting in Jupyter Notebook." + on_nersc = "METATLAS_LOCAL" not in os.environ + if on_nersc and sys.executable not in allowed_exe: + install_kernel() + logger.critical('Please check that the kernel is set to "Metatlas Targeted".') + raise ValueError(error_msg) + try: + # pylint: disable=import-outside-toplevel,unused-import + import dataset # noqa: F401 + except ModuleNotFoundError as module_error: + logger.critical( + 'Could not find dataset module. Please check that the kernel is set to "Metatlas Targeted".' + ) + raise ModuleNotFoundError from module_error + + +def install_kernel(): + """ + Copies kernel.json from repo to active location under home directory. + Only for use on NERC! 
+    """
+    logger.info('Installing kernel.json for "Metatlas Targeted".')
+    repo_path = Path(__file__).resolve().parent.parent.parent
+    source = repo_path / "notebooks" / "kernels" / "metatlas-targeted.kernel.json"
+    dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted"
+    os.makedirs(dest_dir, exist_ok=True)
+    shutil.copyfile(source, dest_dir / "kernel.json")
+    logger.info('Reload the page and change kernel to "Metatlas Targeted".')
+
+
+def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100):
+    """Set pandas display options"""
+    pd.set_option("display.max_rows", max_rows)
+    pd.set_option("display.max_columns", max_columns)
+    pd.set_option("display.max_colwidth", max_colwidth)
+
+
+def configure_notebook_display():
+    """Configure output from Jupyter"""
+    # set notebook to have minimal side margins
+    display(HTML(""))
+
+
+def setup(log_level):
+    """High level function to prepare the metatlas notebook"""
+    validate_kernel()
+    configure_environment(log_level)
+    configure_notebook_display()
+    configure_pandas_display()
diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py
index 56bad65d..de6d57b8 100644
--- a/metatlas/tools/notebook.py
+++ b/metatlas/tools/notebook.py
@@ -58,6 +58,7 @@ def install_kernel():
     dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted"
     os.makedirs(dest_dir, exist_ok=True)
     shutil.copyfile(source, dest_dir / "kernel.json")
+    logger.info('Reload the page and change kernel to "Metatlas Targeted".')

From 6910160e80d445253a807d2d1f96c6a88b1d79c0 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 2 Jun 2021 20:31:15 -0700
Subject: [PATCH 010/177] Remove accidentally committed junk file

---
 1 | 82 --------------------------------------------------------------
 1 file changed, 82 deletions(-)
 delete mode 100644 1

diff --git a/1 b/1
deleted file mode 100644
index de6d57b8..00000000
--- a/1
+++ /dev/null
@@ -1,82 +0,0 @@
-"""Jupyter notebook helper functions"""
-
-import logging
-import os
-import shutil
-import sys
-
-from pathlib import Path
-import pandas as pd
-from IPython.core.display import display, HTML
-from metatlas.tools.logging import activate_logging
-
-logger = logging.getLogger(__name__)
-
-
-def configure_environment(log_level):
-    """
-    Sets environment variables and configures logging
-    inputs:
-        log_level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
-    """
-    activate_logging(console_level=log_level)
-    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
-
-
-def validate_kernel():
-    """
-    Raise error if problem with kernel
-    When on NERSC, this will install the correct kernel if needed
-    """
-    allowed_exe = [
-        "/global/common/software/m2650/metatlas-targeted-20210521/bin/python",
-    ]
-    error_msg = "Invalid kernel setting in Jupyter Notebook."
-    on_nersc = "METATLAS_LOCAL" not in os.environ
-    if on_nersc and sys.executable not in allowed_exe:
-        install_kernel()
-        logger.critical('Please check that the kernel is set to "Metatlas Targeted".')
-        raise ValueError(error_msg)
-    try:
-        # pylint: disable=import-outside-toplevel,unused-import
-        import dataset  # noqa: F401
-    except ModuleNotFoundError as module_error:
-        logger.critical(
-            'Could not find dataset module. Please check that the kernel is set to "Metatlas Targeted".'
-        )
-        raise ModuleNotFoundError from module_error
-
-
-def install_kernel():
-    """
-    Copies kernel.json from repo to active location under home directory.
- Only for use on NERC! - """ - logger.info('Installing kernel.json for "Metatlas Targeted".') - repo_path = Path(__file__).resolve().parent.parent.parent - source = repo_path / "notebooks" / "kernels" / "metatlas-targeted.kernel.json" - dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" - os.makedirs(dest_dir, exist_ok=True) - shutil.copyfile(source, dest_dir / "kernel.json") - logger.info('Reload the page and change kernel to "Metatlas Targeted".') - - -def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100): - """Set pandas display options""" - pd.set_option("display.max_rows", max_rows) - pd.set_option("display.max_columns", max_columns) - pd.set_option("display.max_colwidth", max_colwidth) - - -def configure_notebook_display(): - """Configure output from Jupyter""" - # set notebook to have minimal side margins - display(HTML("")) - - -def setup(log_level): - """High level function to prepare the metatlas notebook""" - validate_kernel() - configure_environment(log_level) - configure_notebook_display() - configure_pandas_display() From ef8dbc91e5c4cbf5ecd524b25250dfa0125fea82 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 2 Jun 2021 20:46:46 -0700 Subject: [PATCH 011/177] Change order of environment setup --- metatlas/tools/notebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index de6d57b8..ebe9a536 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -76,7 +76,7 @@ def configure_notebook_display(): def setup(log_level): """High level function to prepare the metatlas notebook""" - validate_kernel() configure_environment(log_level) + validate_kernel() configure_notebook_display() configure_pandas_display() From 9845a0566e1c96ae72ce0ac6438e8ddd9f6db053 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 2 Jun 2021 20:54:20 -0700 Subject: [PATCH 012/177] Add better instructions on changing kernels --- metatlas/tools/notebook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index ebe9a536..ef3b5ab5 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -58,7 +58,9 @@ def install_kernel(): dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" os.makedirs(dest_dir, exist_ok=True) shutil.copyfile(source, dest_dir / "kernel.json") - logger.info('Reload the page and change kernel to "Metatlas Targeted".') + logger.info(('Reload the page and then change kernel to "Metatlas Targeted". ' + "On the menu bar at the top of this page select 'Kernel'>'Change Kernel..' 
" + "then find 'Metatlas Targeted' in the drop down list.")) def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100): From 21c26dda67550718d0d9f2fc3b78514584e9ad58 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 4 Jun 2021 11:25:50 -0700 Subject: [PATCH 013/177] WIP - fix raise_on_diff and logging --- metatlas/datastructures/metatlas_dataset.py | 18 ++++++++++-------- metatlas/io/write_utils.py | 18 ++++++++++++------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 947ef352..082c106e 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -179,7 +179,7 @@ def write_data_source_files(self): logger.warning( ( "Data sources directory already populated from previous work on this analysis. " - "Not overwritting." + "Not overwriting." ) ) else: @@ -191,11 +191,13 @@ def write_data_source_files(self): def write_lcmsruns_short_names(self): """Write short names and raise error if exists and differs from current data""" + short_names = self.lcmsruns_short_names + short_names['full_filename'] = short_names.index write_utils.export_dataframe_die_on_diff( - self.lcmsruns_short_names, + short_names, os.path.join(self.ids.output_dir, "short_names.csv"), "LCMS runs short names", - # index=True, + index=False, ) def _get_atlas(self): @@ -522,10 +524,9 @@ def set_data(self, ids, value): def rts(self): """ Allow Rt_Reference objects to be accessed - Returns cloned RtReference objects, so modifing them will not impact data in this class. use set_rt() if you want to modify the RT values held by this class. """ - return tuple(cid.rt_references[0].clone() for cid in self.atlas.compound_identifications) + return tuple(cid.rt_references[0] for cid in self.atlas.compound_identifications) def set_rt(self, compound_idx, which, time): """ @@ -600,7 +601,7 @@ def get_lcmsruns_short_names(self, fields=None): "short_filename": [0, 2, 4, 5, 7, 9, 14], "short_samplename": [9, 12, 13, 14], } - out = pd.DataFrame(columns=fields.keys()) + out = pd.DataFrame(columns=fields.keys(), dtype="string") for i, lcms_file in enumerate(self.lcmsruns): tokens = lcms_file.name.split(".")[0].split("_") for name, idxs in fields.items(): @@ -610,7 +611,7 @@ def get_lcmsruns_short_names(self, fields=None): out.drop(columns=["last_modified"], inplace=True) out.drop_duplicates(subset=["full_filename"], keep="last", inplace=True) out.set_index("full_filename", inplace=True) - return out + return out.sort_values(by="full_filename") lcmsruns_short_names = property(get_lcmsruns_short_names) @@ -679,7 +680,7 @@ def store_groups(self, exist_ok=False): new_names = set(self.groups_dataframe["group"].to_list()) overlap = db_names.intersection(new_names) if overlap: - logging.error( + logger.error( "Not saving groups as you have already saved groups with these names: %s.", ", ".join(overlap), ) @@ -795,6 +796,7 @@ def _set_nested(data, ids, value): def _error_if_bad_idxs(dataframe, test_idx_list): + """Raise IndexError if any members of of test_idx_list are not in dataframe's index""" bad = set(test_idx_list) - set(dataframe.index) if len(bad) > 0: raise IndexError(f"Invalid index values: {bad}.") diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py index 06e6c7b8..d31a2b47 100644 --- a/metatlas/io/write_utils.py +++ b/metatlas/io/write_utils.py @@ -1,7 +1,9 @@ """ Utility functions used in writing files""" +import filecmp import logging import os 
+import tempfile import pandas as pd @@ -46,15 +48,19 @@ def raise_on_diff(dataframe, file_path, description, **kwargs): dataframe: pandas DataFrame to save file_path: string with path of file to compare against description: free string for logging - remaining arguments are passed through to read_csv() + kwargs: passed through to to_csv() - If file_path exists and does not match dataframe then raise ValueError + If file_path exists and does not match file that would be generated by + saving dataframe to a csv, then raise ValueError """ if not os.path.exists(file_path): return - existing_df = pd.read_csv(file_path, **kwargs) - if dataframe.equals(existing_df): - logging.info("Data in %s is the same as %s.", description, file_path) + temp_path= tempfile.NamedTemporaryFile(delete=False) + dataframe.to_csv(temp_path, **kwargs) + same = filecmp.cmp(file_path, temp_path.name) + os.remove(temp_path.name) + if same: + logger.info("Data in %s is the same as %s.", description, file_path) else: try: raise ValueError("Data in %s is not the same as %s." % (description, file_path)) @@ -69,7 +75,7 @@ def export_dataframe_die_on_diff(dataframe, file_path, description, **kwargs): dataframe: pandas DataFrame to save file_path: string with path of file to create description: free string for logging - remaining arguments are passed through to to_csv() + kwargs: passed through to to_csv() If file_path does not exist then save the dataframe there If file_path exists and matches data in dataframe then do nothing From 1ba88f9d09b1feb0a03fc50d46ababded260493a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 4 Jun 2021 15:33:12 -0700 Subject: [PATCH 014/177] WIP - do not default to renaming atlas on filter --- metatlas/datastructures/metatlas_dataset.py | 11 ++++++----- tests/unit/test_metatlas_dataset.py | 9 +++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 082c106e..23574bc3 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -254,7 +254,7 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None, name=None): keep_idxs: the indexes of compounds to keep remove_idxs: the indexes of compounds to remove Exactly one of keep_idxs or remove_idxs must be None - name: the name for the new atlas, defaults to current name + '_compound_filtered' + name: the name for the new atlas, defaults to current name output: If keep_idxs is not None then update self.atlas to contain only the compound_identifications at keep_idxs. 
If remove_idxs is not None then update self.atlas to contain only the compound @@ -273,7 +273,7 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None, name=None): keep_idxs = self.atlas_df.index.difference(remove_idxs) self._atlas_df = self.atlas_df.iloc[keep_idxs].copy().reset_index(drop=True) self._atlas_df_valid = True - name = f"{self.atlas.name}_compound_filtered" if name is None else name + name = self.atlas.name if name is None else name mz_tolerance = self.atlas.compound_identifications[0].mz_references[0].mz_tolerance if self._data_valid: self._data = [ @@ -318,7 +318,7 @@ def filter_hits_by_atlas(self): def filter_compounds_ms1_notes_remove(self, name=None): """ inputs: - name: the name for the new atlas, defaults to current name + '_kept' + name: the name for the new atlas, defaults to current name output: updates self.atlas to contain only the compound_identifications that do not have ms1_notes starting with 'remove' (case insensitive) @@ -326,7 +326,7 @@ def filter_compounds_ms1_notes_remove(self, name=None): get their value from self.atlas.compound_identifications[0].mz_references[0].mz_tolerance """ logger.debug("Filtering atlas to exclude ms1_notes=='remove'.") - name = f"{self.atlas.name}_kept" if name is None else name + name = self.atlas.name if name is None else name self.filter_compounds(remove_idxs=self.compound_indices_marked_remove(), name=name) def filter_compounds_by_signal(self, num_points, peak_height, name=None): @@ -336,9 +336,10 @@ def filter_compounds_by_signal(self, num_points, peak_height, name=None): in order for the compound to remain in the atlas peak_height: max intensity in the EIC that must be exceeded in one or more samples in order for the compound to remain in the atlas + name: the name for the new atlas, defaults to current name """ logger.debug("Filtering atlas on num_points=%d, peak_height=%d.") - name = f"{self.atlas.name}_strong" if name is None else name + name = self.atlas.name if name is None else name keep_idxs = dp.strong_signal_compound_idxs(self, num_points, peak_height) self.filter_compounds(keep_idxs=keep_idxs, name=name) diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 643cadd9..0804ab79 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -116,6 +116,15 @@ def test_rts02(metatlas_dataset): assert len(metatlas_dataset.rts) == 1 +def test_rts03(metatlas_dataset, analysis_ids): + metatlas_dataset.set_rt(0, "rt_max", 9.99) + metob.store(metatlas_dataset.atlas) + atlas_from_db = metob.retrieve('Atlas', unique_id=metatlas_dataset.atlas.unique_id)[0] + second_metatlas_dataset = mads.MetatlasDataset(analysis_ids, atlas_from_db) + assert second_metatlas_dataset.rts[0].rt_max == 9.99 + assert len(second_metatlas_dataset.rts) == 1 + + def test_set_note01(metatlas_dataset): metatlas_dataset.set_note(0, "ms2_notes", "Foobar") assert metatlas_dataset[0][0]["identification"].ms2_notes == "Foobar" From c2008f012753fd3f4e9ab9a904ac53e5ffead5dd Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 7 Jun 2021 09:34:07 -0700 Subject: [PATCH 015/177] WIP - update dev docs to remove conda dependency --- DEVELOP.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/DEVELOP.rst b/DEVELOP.rst index 48d1a7b9..a2b8cab1 100644 --- a/DEVELOP.rst +++ b/DEVELOP.rst @@ -6,11 +6,10 @@ Setup 1. Install Python 3.8+ (`pyenv `_ and `pyenv intstaller `_ can help here) 2. Install `Pip `_ -3. Install a conda environment manager. 
You can get miniconda `here `_. -4. Install `Docker `_. -5. Install Nox with :code:`pip install --user --upgrade nox` -6. :code:`git clone https://github.com/biorack/metatlas.git` -7. Install git pre-commit hooks with :code:`cd metatlas && nox -s install_git_hooks` +3. Install `Docker `_. +4. Install Nox with :code:`pip install --user --upgrade nox` +5. :code:`git clone https://github.com/biorack/metatlas.git` +6. Install git pre-commit hooks with :code:`cd metatlas && nox -s install_git_hooks` Local Development ################# From 15f334b1e4bfe225e6ae536f68c2a627dd6c0bd1 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 8 Jun 2021 14:24:29 -0700 Subject: [PATCH 016/177] WIP - add source_atlas to MetatlasDataset --- metatlas/datastructures/metatlas_dataset.py | 113 +++++++++++------- metatlas/datastructures/object_helpers.py | 2 + metatlas/io/write_utils.py | 10 +- metatlas/tools/notebook.py | 10 +- notebooks/reference/Targeted.ipynb | 12 +- .../fixtures/analysis_identifiers_fixtures.py | 5 +- tests/fixtures/database.py | 11 +- tests/fixtures/metatlas_dataset_fixtures.py | 8 +- tests/system/test_targeted.py | 1 + tests/unit/test_metatlas_dataset.py | 66 +++++++--- tests/unit/test_write_utils.py | 13 +- 11 files changed, 172 insertions(+), 79 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 23574bc3..23d7c069 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -26,18 +26,26 @@ class AnalysisIdentifiers: # pylint: disable=too-many-arguments def __init__( - self, experiment, output_type, polarity, analysis_number, project_directory, atlas=None, username=None + self, + source_atlas, + experiment, + output_type, + polarity, + analysis_number, + project_directory, + username=None, ): + self._source_atlas = source_atlas self._experiment = experiment self._output_type = output_type self._polarity = polarity self._analysis_number = analysis_number - self._atlas = atlas self._username = getpass.getuser() if username is None else username self.project_directory = project_directory self.validate() logger.info( - "IDs: atlas=%s, short_experiment_analysis=%s, output_dir=%s", + "IDs: source_atlas=%s, atlas=%s, short_experiment_analysis=%s, output_dir=%s", + self.source_atlas, self.atlas, self.short_experiment_analysis, self.output_dir, @@ -45,6 +53,7 @@ def __init__( def validate(self): """Valid class inputs""" + get_atlas(self.source_atlas, self.username) # will raise error if not found or matches multiple if len(self.experiment.split("_")) != 9: raise ValueError('Parameter experiment does contain 9 fields when split on "_".') if self.output_type not in ["ISTDsEtc", "FinalEMA-HILIC"]: @@ -56,6 +65,11 @@ def validate(self): if self.analysis_number < 0: raise ValueError("Parameter analysis_number cannot be negative.") + @property + def source_atlas(self): + """Returns source atlas identifier""" + return self._source_atlas + @property def experiment(self): """Returns experiment identifier""" @@ -79,10 +93,8 @@ def analysis_number(self): @property def atlas(self): """Atlas identifier (name)""" - if self._atlas is None: - exp_tokens = self.experiment.split("_") - return f"{'_'.join(exp_tokens[3:6])}_{self.short_polarity}_{self.analysis_number}" - return self._atlas + exp_tokens = self.experiment.split("_") + return f"{'_'.join(exp_tokens[3:6])}_{self.short_polarity}_{self.analysis}" @property def username(self): @@ -134,7 +146,6 @@ class MetatlasDataset: def __init__( self, 
ids, - atlas=None, groups_controlled_vocab=None, exclude_files=None, extra_time=0.75, @@ -152,7 +163,6 @@ def __init__( exclude_files: array of strings that will exclude files if they are substrings of the filename """ self.ids = ids - self._atlas = self._get_atlas() if atlas is None else atlas self._atlas_df = None self._atlas_df_valid = False self._runs = None @@ -169,6 +179,7 @@ def __init__( self._frag_mz_tolerance = frag_mz_tolerance self._msms_refs_loc = msms_refs_loc self.max_cpus = max_cpus + self._get_atlas() self.write_data_source_files() self.write_lcmsruns_short_names() @@ -192,7 +203,7 @@ def write_data_source_files(self): def write_lcmsruns_short_names(self): """Write short names and raise error if exists and differs from current data""" short_names = self.lcmsruns_short_names - short_names['full_filename'] = short_names.index + short_names["full_filename"] = short_names.index write_utils.export_dataframe_die_on_diff( short_names, os.path.join(self.ids.output_dir, "short_names.csv"), @@ -201,24 +212,11 @@ def write_lcmsruns_short_names(self): ) def _get_atlas(self): - """Load atlas from database""" - name_query = f"%_{self.ids.short_polarity}_{self.ids.short_experiment_analysis}" - atlases = metob.retrieve("atlases", name=name_query, username=self.ids.username) - if len(atlases) == 0: - logger.error( - 'Database does not contain an atlas named "%s" and owned by %s.', - name_query, - self.ids.username, - ) - raise ValueError("Atlas not found in database") - if len(atlases) > 1: - logger.error( - 'Database contains more than one atlas named "%s" and owned by %s.', - name_query, - self.ids.username, - ) - raise ValueError("Too many matching atlases found in database") - return atlases[0] + """Copy source atlas from database into current analysis atlas""" + source = get_atlas(self.ids.source_atlas, self.ids.username) + self._atlas = source.clone() + self._atlas.name = self.ids.atlas + self._atlas_valid = True def _build(self): """Populate self._data from database and h5 files.""" @@ -355,8 +353,12 @@ def store_atlas(self, name=None, even_if_exists=False): """ name = self.atlas.name if name is None else name username = getpass.getuser() - if not even_if_exists and len(metob.retrieve("atlases", name=name, username=username)) > 0: - raise ValueError(f"An atlas with name {name} and owned by {username} already exists.") + try: + if not even_if_exists and len(metob.retrieve("Atlas", name=name, username=username)) > 0: + raise ValueError(f"An atlas with name {name} and owned by {username} already exists.") + except ValueError as err: + logger.exception(err) + raise err metob.store(self.atlas) def export_atlas_to_csv(self, filename=None): @@ -602,7 +604,7 @@ def get_lcmsruns_short_names(self, fields=None): "short_filename": [0, 2, 4, 5, 7, 9, 14], "short_samplename": [9, 12, 13, 14], } - out = pd.DataFrame(columns=fields.keys(), dtype="string") + out = pd.DataFrame(columns=fields.keys()) for i, lcms_file in enumerate(self.lcmsruns): tokens = lcms_file.name.split(".")[0].split("_") for name, idxs in fields.items(): @@ -680,12 +682,15 @@ def store_groups(self, exist_ok=False): db_names = {group.name for group in self.existing_groups} new_names = set(self.groups_dataframe["group"].to_list()) overlap = db_names.intersection(new_names) - if overlap: - logger.error( - "Not saving groups as you have already saved groups with these names: %s.", - ", ".join(overlap), - ) - raise ValueError("Existing group has same name.") + try: + if overlap: + raise ValueError( + "Not saving groups as you have 
already saved groups with these names: %s.",
+                    ", ".join(overlap),
+                )
+        except ValueError as err:
+            logger.exception(err)
+            raise err
         metob.store(self.groups)
 
     def compound_idxs_not_evaluated(self):
@@ -778,8 +783,12 @@ def _set_nested(data, ids, value):
     as: ('attribute_name',). If you want to make it more explicit to the reader, you can add a second
     member to the tuple, which will not be used, such as ('attribute_name', 'as attribute')
     """
-    if len(ids) == 0:
-        raise ValueError("ids cannot be empty")
+    try:
+        if len(ids) == 0:
+            raise ValueError("ids cannot be empty")
+    except ValueError as err:
+        logger.exception(err)
+        raise err
     if len(ids) == 1:
         if isinstance(ids[0], tuple):
             setattr(data, ids[0][0], value)
@@ -799,5 +808,27 @@ def _set_nested(data, ids, value):
 def _error_if_bad_idxs(dataframe, test_idx_list):
     """Raise IndexError if any members of test_idx_list are not in dataframe's index"""
     bad = set(test_idx_list) - set(dataframe.index)
-    if len(bad) > 0:
-        raise IndexError(f"Invalid index values: {bad}.")
+    try:
+        if len(bad) > 0:
+            raise IndexError(f"Invalid index values: {bad}.")
+    except IndexError as err:
+        logger.exception(err)
+        raise err
+
+
+def get_atlas(name, username):
+    """Load atlas from database"""
+    atlases = metob.retrieve("Atlas", name=name, username=username)
+    try:
+        if len(atlases) == 0:
+            raise ValueError(f'Database does not contain an atlas named "{name}" and owned by {username}.')
+    except ValueError as err:
+        logger.exception(err)
+        raise err
+    try:
+        if len(atlases) > 1:
+            raise ValueError(f'Database contains more than one atlas named "{name}" and owned by {username}.')
+    except ValueError as err:
+        logger.exception(err)
+        raise err
+    return atlases[0]
diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py
index 116de876..6df9752a 100644
--- a/metatlas/datastructures/object_helpers.py
+++ b/metatlas/datastructures/object_helpers.py
@@ -154,6 +154,8 @@ def __init__(self):
         else:
             login = f"{local_info['db_username']}@"
         self.path = f"mysql+pymysql://{login}{hostname}/{local_info['db_name']}"
+        elif 'METATLAS_SQLITE' in os.environ:
+            self.path = 'sqlite:///' + os.environ['METATLAS_SQLITE']
         else:
             self.path = 'sqlite:///' + getpass.getuser() + '_workspace.db'
 
diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py
index d31a2b47..5868a3c5 100644
--- a/metatlas/io/write_utils.py
+++ b/metatlas/io/write_utils.py
@@ -50,15 +50,15 @@ def raise_on_diff(dataframe, file_path, description, **kwargs):
     description: free string for logging
     kwargs: passed through to to_csv()
 
-    If file_path exists and does not match file that would be generated by
+    If file_path exists and does not match file that would be generated by 
     saving dataframe to a csv, then raise ValueError
     """
     if not os.path.exists(file_path):
         return
-    temp_path= tempfile.NamedTemporaryFile(delete=False)
-    dataframe.to_csv(temp_path, **kwargs)
-    same = filecmp.cmp(file_path, temp_path.name)
-    os.remove(temp_path.name)
+    with tempfile.NamedTemporaryFile(delete=False) as temp_path:
+        dataframe.to_csv(temp_path, **kwargs)
+    same = filecmp.cmp(file_path, temp_path.name)
+    os.remove(temp_path.name)
     if same:
         logger.info("Data in %s is the same as %s.", description, file_path)
     else:
diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py
index ef3b5ab5..e3c5f8b9 100644
--- a/metatlas/tools/notebook.py
+++ b/metatlas/tools/notebook.py
@@ -58,9 +58,13 @@ def install_kernel():
     dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" /
"metatlas-targeted" os.makedirs(dest_dir, exist_ok=True) shutil.copyfile(source, dest_dir / "kernel.json") - logger.info(('Reload the page and then change kernel to "Metatlas Targeted". ' - "On the menu bar at the top of this page select 'Kernel'>'Change Kernel..' " - "then find 'Metatlas Targeted' in the drop down list.")) + logger.info( + ( + 'Reload the page and then change kernel to "Metatlas Targeted". ' + "On the menu bar at the top of this page select 'Kernel'>'Change Kernel..' " + "then find 'Metatlas Targeted' in the drop down list." + ) + ) def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100): diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index f61666c4..a56a0b69 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -21,6 +21,16 @@ "source": [ "# pylint: disable=invalid-name,missing-module-docstring\n", "\n", + "# source atlas name\n", + "source_atlas = 'REPLACE ME'\n", + "\n", + "# this atlas will be copied to an atlas named projectId_experimentName_sampleSet_polarity_analysisId\n", + "# where projectId is JGI Proposal ID Number\n", + "# experiment name is short text description from field 4 (0-indexed) of LCMS filename\n", + "# sampleSet is commonly Pilot, Final - from field 5 (0-indexed) of LCMS filename\n", + "# polarity is 'POS' or 'NEG'\n", + "# analysisId is usernameX where X is the analysis number\n", + "\n", "# one of 'positive' or 'negative'\n", "polarity = \"positive\"\n", "\n", @@ -138,7 +148,7 @@ "metadata": {}, "outputs": [], "source": [ - "ids = mads.AnalysisIdentifiers(experiment, output_type, polarity, analysis_number, project_directory)" + "ids = mads.AnalysisIdentifiers(source_atlas, experiment, output_type, polarity, analysis_number, project_directory)" ] }, { diff --git a/tests/fixtures/analysis_identifiers_fixtures.py b/tests/fixtures/analysis_identifiers_fixtures.py index 90d305f2..5ac3ef25 100644 --- a/tests/fixtures/analysis_identifiers_fixtures.py +++ b/tests/fixtures/analysis_identifiers_fixtures.py @@ -6,13 +6,14 @@ @pytest.fixture(name="analysis_ids") -def fixture_analysis_ids(tmp_path): +def fixture_analysis_ids(tmp_path, mocker): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True]) return mads.AnalysisIdentifiers( + "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", 0, str(tmp_path), - atlas="HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", username="root", ) diff --git a/tests/fixtures/database.py b/tests/fixtures/database.py index 0d7f1d20..7064831d 100644 --- a/tests/fixtures/database.py +++ b/tests/fixtures/database.py @@ -8,13 +8,12 @@ @pytest.fixture(name="sqlite") -def fixture_sqlite(tmp_path): +def fixture_sqlite(tmp_path, monkeypatch): # make sure we don't accidently pollute the production MySQL DB - assert os.environ.get("METATLAS_LOCAL") == "TRUE" - os.chdir(tmp_path) # don't reuse the sqlite DB - username = getpass.getuser() - sqlite3.connect(f"{username}_workspace.db").close() + monkeypatch.setenv("METATLAS_LOCAL", "TRUE") + db_path = tmp_path / "workspace.db" + monkeypatch.setenv("METATLAS_SQLITE", str(db_path)) + sqlite3.connect(db_path).close() dummy = metob.Atlas() dummy.name = "this is a dummy atlas to initialize sqlite db" metob.store(dummy) - # do I need to store each type of object? 
diff --git a/tests/fixtures/metatlas_dataset_fixtures.py b/tests/fixtures/metatlas_dataset_fixtures.py
index f97bd5ff..070eacfc 100644
--- a/tests/fixtures/metatlas_dataset_fixtures.py
+++ b/tests/fixtures/metatlas_dataset_fixtures.py
@@ -40,12 +40,13 @@ def fixture_groups_controlled_vocab():
 
 
 @pytest.fixture(name="metatlas_dataset")
-def fixture_metatlas_dataset(mocker, df_container, analysis_ids, groups_controlled_vocab, atlas, lcmsrun):
+def fixture_metatlas_dataset(mocker, df_container, analysis_ids, groups_controlled_vocab, lcmsrun, atlas):
     mocker.patch(
         "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container
     )
     mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun])
-    return mads.MetatlasDataset(analysis_ids, atlas, groups_controlled_vocab)
+    mocker.patch("metatlas.datastructures.metatlas_dataset.get_atlas", return_value=atlas)
+    return mads.MetatlasDataset(analysis_ids, groups_controlled_vocab)
 
 
 @pytest.fixture(name="metatlas_dataset_with_2_cids")
@@ -56,7 +57,8 @@ def fixture_metatlas_dataset_with_2_cids(
         "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container
     )
     mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun])
-    return mads.MetatlasDataset(analysis_ids, atlas_with_2_cids, groups_controlled_vocab)
+    mocker.patch("metatlas.datastructures.metatlas_dataset.get_atlas", return_value=atlas_with_2_cids)
+    return mads.MetatlasDataset(analysis_ids, groups_controlled_vocab)
 
 
 @pytest.fixture(name="eic")
diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py
index 328decf4..c72e8c57 100644
--- a/tests/system/test_targeted.py
+++ b/tests/system/test_targeted.py
@@ -51,6 +51,7 @@ def test_targeted_by_line01_with_remove(tmp_path):
         "agui.data.set_rt(3, \\"rt_max\\", 3.3081)\\n" \
         ]' /src/notebooks/reference/Targeted.ipynb > /out/Remove.ipynb && \
         papermill \
+        -p source_atlas HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0 \
         -p experiment 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 \
         -p metatlas_repo_path /src \
         -p project_directory /out \
diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py
index 0804ab79..766934f2 100644
--- a/tests/unit/test_metatlas_dataset.py
+++ b/tests/unit/test_metatlas_dataset.py
@@ -1,9 +1,10 @@
 """ tests for MetatlasDataset """
-# pylint: disable=missing-function-docstring, protected-access
+# pylint: disable=missing-function-docstring,protected-access,unused-argument
 
 import datetime
 import glob
 import logging
+import os
 
 import pandas as pd
 import pytest
 from metatlas.datastructures import metatlas_dataset as mads
@@ -11,6 +12,20 @@
 from metatlas.io import metatlas_get_data_helper_fun as ma_data
 
 
+@pytest.fixture(scope="function", autouse=True)
+def change_test_dir(request):
+    os.chdir(request.fspath.dirname)
+    yield
+    os.chdir(request.config.invocation_dir)
+
+
+@pytest.fixture(scope="function", autouse=True)
+def set_env_vars(tmp_path, monkeypatch):
+    monkeypatch.setenv("METATLAS_LOCAL", "TRUE")
+    db_path = tmp_path / "workspace.db"
+    monkeypatch.setenv("METATLAS_SQLITE", str(db_path))
+
+
 def test_metatlas_dataset_build01(metatlas_dataset):
     assert len(metatlas_dataset) == 1
     assert len(metatlas_dataset[0]) == 1
@@ -116,16 +131,16 @@ def test_rts02(metatlas_dataset):
     assert len(metatlas_dataset.rts) == 1
 
 
-def test_rts03(metatlas_dataset, analysis_ids):
+def test_rts03(metatlas_dataset, analysis_ids, sqlite):
metatlas_dataset.set_rt(0, "rt_max", 9.99) metob.store(metatlas_dataset.atlas) - atlas_from_db = metob.retrieve('Atlas', unique_id=metatlas_dataset.atlas.unique_id)[0] + atlas_from_db = metob.retrieve("Atlas", unique_id=metatlas_dataset.atlas.unique_id)[0] second_metatlas_dataset = mads.MetatlasDataset(analysis_ids, atlas_from_db) assert second_metatlas_dataset.rts[0].rt_max == 9.99 assert len(second_metatlas_dataset.rts) == 1 -def test_set_note01(metatlas_dataset): +def test_set_note01(metatlas_dataset, sqlite): metatlas_dataset.set_note(0, "ms2_notes", "Foobar") assert metatlas_dataset[0][0]["identification"].ms2_notes == "Foobar" @@ -136,7 +151,7 @@ def test_set_note02(metatlas_dataset): assert metatlas_dataset[0][0]["identification"].ms1_notes == "keeper" -def test_compound_indices_marked_remove01(metatlas_dataset): +def test_compound_indices_marked_remove01(sqlite, metatlas_dataset): assert len(metatlas_dataset.compound_indices_marked_remove()) == 0 metatlas_dataset.set_note(0, "ms1_notes", "REMOVE") assert len(metatlas_dataset.compound_indices_marked_remove()) == 1 @@ -320,10 +335,10 @@ def test_set_data01(metatlas_dataset): assert metatlas_dataset[0][0]["identification"].ms2_notes == "extact match" -def test_store_atlas01(metatlas_dataset, sqlite): # pylint: disable=unused-argument +def test_store_atlas01(metatlas_dataset, sqlite): metatlas_dataset.atlas.name = "test_store_atlas01" metatlas_dataset.store_atlas() - atlas_list = metob.retrieve("atlases", name=metatlas_dataset.atlas.name, username="*") + atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username="*") assert len(atlas_list) == 1 metatlas_dataset.store_atlas(even_if_exists=True) with pytest.raises(ValueError): @@ -333,6 +348,7 @@ def test_store_atlas01(metatlas_dataset, sqlite): # pylint: disable=unused-argu def test_analysis_identifiers01(): with pytest.raises(ValueError): mads.AnalysisIdentifiers( + "source_atlas_name_not_valid", "experiment_not_valid", "output_type_not_valid", "polarity_not_valid", @@ -344,6 +360,7 @@ def test_analysis_identifiers01(): def test_analysis_identifiers02(): with pytest.raises(ValueError): mads.AnalysisIdentifiers( + "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "output_type_not_valid", "polarity_not_valid", @@ -355,6 +372,7 @@ def test_analysis_identifiers02(): def test_analysis_identifiers03(): with pytest.raises(ValueError): mads.AnalysisIdentifiers( + "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "polarity_not_valid", @@ -363,9 +381,11 @@ def test_analysis_identifiers03(): ) -def test_analysis_identifiers04(): +def test_analysis_identifiers04(mocker): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True]) with pytest.raises(TypeError): mads.AnalysisIdentifiers( + "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", @@ -374,9 +394,11 @@ def test_analysis_identifiers04(): ) -def test_analysis_identifiers05(): +def test_analysis_identifiers05(mocker): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True]) with pytest.raises(TypeError): mads.AnalysisIdentifiers( + "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", 
"FinalEMA-HILIC", "positive", @@ -385,9 +407,11 @@ def test_analysis_identifiers05(): ) -def test_analysis_identifiers06(): +def test_analysis_identifiers06(mocker): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True]) with pytest.raises(ValueError): mads.AnalysisIdentifiers( + "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", @@ -396,14 +420,26 @@ def test_analysis_identifiers06(): ) +def test_analysis_identifiers07(): + with pytest.raises(ValueError): + mads.AnalysisIdentifiers( + "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + "experiemnt_name_not_valid", + "output_type_not_valid", + "polarity_not_valid", + "analysis_number_not_valid", + "/foo/bar", + ) + + def test_analysis_identifiers_atlas01(analysis_ids): - assert analysis_ids.atlas == "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0" + assert analysis_ids.atlas == "505892_OakGall_final_POS_root0" def test_analysis_identifiers_atlas02(analysis_ids): # call .atlas twice to get cached value analysis_ids.atlas # pylint: disable=pointless-statement - assert analysis_ids.atlas == "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0" + assert analysis_ids.atlas == "505892_OakGall_final_POS_root0" def test_write_data_source_files01(metatlas_dataset, mocker, caplog): @@ -422,15 +458,15 @@ def test_write_data_source_files02(metatlas_dataset, mocker, caplog): assert ma_data.make_data_sources_tables.called # pylint: disable=no-member -def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun): +def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun, atlas): mocker.patch( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[0]) + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[atlas]) mocker.patch("glob.glob", return_value=range(10)) metatlas_dataset = mads.MetatlasDataset(analysis_ids) - assert metatlas_dataset.atlas == 0 + assert metatlas_dataset.atlas.name == "505892_OakGall_final_POS_root0" def test_get_atlas02(mocker, analysis_ids, caplog): diff --git a/tests/unit/test_write_utils.py b/tests/unit/test_write_utils.py index a7521241..082cf5cc 100644 --- a/tests/unit/test_write_utils.py +++ b/tests/unit/test_write_utils.py @@ -7,6 +7,13 @@ from metatlas.io import write_utils +@pytest.fixture(scope="function", autouse=True) +def change_test_dir(request): + os.chdir(request.fspath.dirname) + yield + os.chdir(request.config.invocation_dir) + + def test_make_dir_for01(mocker): mocker.patch("os.makedirs") write_utils.make_dir_for("foo/bar") @@ -33,6 +40,7 @@ def test_check_existing_file02(mocker): def test_export_dataframe01(mocker): mocker.patch("pandas.DataFrame.to_csv") + mocker.patch("os.path.exists", return_value=False) dataframe = pandas.DataFrame({1: [10], 2: [20]}) write_utils.export_dataframe(dataframe, "foo/bar", "test") assert pandas.DataFrame.to_csv.called # pylint: disable=no-member @@ -48,15 +56,14 @@ def test_raise_on_diff01(mocker): def test_raise_on_diff02(mocker): mocker.patch("os.path.exists", return_value=True) dataframe = pandas.DataFrame({1: [10], 2: [20]}) - mocker.patch("pandas.read_csv", return_value=dataframe) + mocker.patch("filecmp.cmp", return_value=True) 
write_utils.raise_on_diff(dataframe, "foo/bar", "test") # Should not raise an error. No assert needed. def test_raise_on_diff03(mocker): mocker.patch("os.path.exists", return_value=True) - existing = pandas.DataFrame({1: [10], 2: [20]}) - mocker.patch("pandas.read_csv", return_value=existing) + mocker.patch("filecmp.cmp", return_value=False) to_write = pandas.DataFrame({1: [10], 2: [99]}) with pytest.raises(ValueError): write_utils.raise_on_diff(to_write, "foo/bar", "test") From bcd072e3625ddaf9d4104c9cc2597a5bf2ab30bc Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 10 Jun 2021 08:54:08 -0700 Subject: [PATCH 017/177] WIP - revamp test fixtures --- .gitignore | 1 + metatlas/datastructures/metatlas_dataset.py | 12 +- metatlas/datastructures/object_helpers.py | 340 ++-- metatlas/io/write_utils.py | 2 - notebooks/reference/Targeted.ipynb | 6 +- tests/fixtures/__init__.py | 0 .../fixtures/analysis_identifiers_fixtures.py | 19 - tests/fixtures/database.py | 19 - tests/fixtures/hdf5_fixtures.py | 410 ----- tests/fixtures/metatlas_dataset_fixtures.py | 336 ---- tests/fixtures/metatlas_object_fixtures.py | 646 -------- tests/unit/conftest.py | 1454 ++++++++++++++++- tests/unit/test_metatlas_dataset.py | 140 +- tests/unit/test_write_utils.py | 20 +- 14 files changed, 1704 insertions(+), 1701 deletions(-) delete mode 100644 tests/fixtures/__init__.py delete mode 100644 tests/fixtures/analysis_identifiers_fixtures.py delete mode 100644 tests/fixtures/database.py delete mode 100644 tests/fixtures/hdf5_fixtures.py delete mode 100644 tests/fixtures/metatlas_dataset_fixtures.py delete mode 100644 tests/fixtures/metatlas_object_fixtures.py diff --git a/.gitignore b/.gitignore index cf68e21b..e1365ba5 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,4 @@ scratch/ # editor swap files .*.swp +.vscode/ diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 23d7c069..5ce1f57c 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -163,6 +163,8 @@ def __init__( exclude_files: array of strings that will exclude files if they are substrings of the filename """ self.ids = ids + self._atlas = None + self._atlas_valid = False self._atlas_df = None self._atlas_df_valid = False self._runs = None @@ -424,7 +426,7 @@ def atlas(self): def atlas(self, atlas): """atlas setter, invalidate atlas_df and data""" if not isinstance(atlas, metob.Atlas): - raise TypeError("Cannot set atlas to container a non-Atlas object") + raise TypeError("Cannot set atlas to contain a non-Atlas object") self._set_and_invalidate_properties("atlas", atlas, ["atlas_df", "data"]) @property @@ -685,8 +687,8 @@ def store_groups(self, exist_ok=False): try: if overlap: raise ValueError( - "Not saving groups as you have already saved groups with these names: %s.", - ", ".join(overlap), + "Not saving groups as you have already saved groups with these names: %s." 
+ % ", ".join(overlap), ) except ValueError as err: logger.exception(err) @@ -821,13 +823,13 @@ def get_atlas(name, username): atlases = metob.retrieve("Atlas", name=name, username=username) try: if len(atlases) == 0: - raise ValueError(f'Database does not contain an atlas named "{name}" and owned by {username}.') + raise ValueError(f'Database does not contain an atlas {name} owned by {username}.') except ValueError as err: logger.exception(err) raise err try: if len(atlases) > 1: - raise ValueError(f'Database contains more than one atlas named "{name}" and owned by {username}.') + raise ValueError(f'Database contains more than one atlas {name} owned by {username}.') except ValueError as err: logger.exception(err) raise err diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py index 6df9752a..c9f2956c 100644 --- a/metatlas/datastructures/object_helpers.py +++ b/metatlas/datastructures/object_helpers.py @@ -133,7 +133,6 @@ def __init__(self): host_name = socket.gethostname() #print("asdf you're running on %s at %s " % (host_name, socket.gethostbyname(socket.gethostname()))) - if ON_NERSC: with open(os.path.join(metatlas_dir, 'nersc_config', 'nersc.yml')) as fid: nersc_info = yaml.safe_load(fid) @@ -154,10 +153,11 @@ def __init__(self): else: login = f"{local_info['db_username']}@" self.path = f"mysql+pymysql://{login}{hostname}/{local_info['db_name']}" - elif 'METATLAS_SQLITE' in os.environ: - self.path = 'sqlite:///' + os.environ['METATLAS_SQLITE'] else: - self.path = 'sqlite:///' + getpass.getuser() + '_workspace.db' + filename = f"{getpass.getuser()}_workspace.db" + self.path = f"sqlite:///{filename}" + if os.path.exists(filename): + os.chmod(filename, 0o775) self.tablename_lut = dict() self.subclass_lut = dict() @@ -189,16 +189,14 @@ def get_connection(self): self.db.query('SELECT name FROM sqlite_master WHERE type = "table"') except Exception: self.db = dataset.connect(self.path) - if 'sqlite' in self.path: - os.chmod(self.path[10:], 0o775) def convert_to_double(self, table, entry): """Convert a table column to double type.""" - self.get_connection() - try: - self.db.query('alter table `%s` modify `%s` double' % (table, entry)) - except Exception as e: - print(e) + with dataset.connect(self.path) as trans: + try: + trans.query('alter table `%s` modify `%s` double' % (table, entry)) + except Exception as e: + print(e) def save_objects(self, objects, _override=False): """Save objects to the database""" @@ -210,28 +208,30 @@ def save_objects(self, objects, _override=False): self._inserts = defaultdict(list) for obj in objects: self._get_save_data(obj, _override) - self.get_connection() - for (table_name, updates) in self._link_updates.items(): - if table_name not in self.db: - continue - with self.db: + with dataset.connect(self.path) as trans: + for (table_name, updates) in self._link_updates.items(): + if table_name not in trans: + continue for (uid, prev_uid) in updates: - self.db.query('update `%s` set source_id = "%s" where source_id = "%s"' % (table_name, prev_uid, uid)) - for (table_name, updates) in self._updates.items(): - if '_' not in table_name and table_name not in self.db: - self.db.create_table(table_name, primary_id='unique_id', - primary_type=self.db.types.string(32)) - self.fix_table(table_name) - with self.db: + trans.query('update `%s` set source_id = "%s" where source_id = "%s"' % (table_name, prev_uid, uid)) + for (table_name, updates) in self._updates.items(): + if '_' not in table_name and table_name not in trans: + 
trans.create_table(table_name, primary_id='unique_id', + primary_type=trans.types.string(32)) + if 'sqlite' not in self.path: + self.fix_table(table_name) for (uid, prev_uid) in updates: - self.db.query('update `%s` set unique_id = "%s" where unique_id = "%s"' % (table_name, prev_uid, uid)) - for (table_name, inserts) in self._inserts.items(): - if '_' not in table_name and table_name not in self.db: - self.db.create_table(table_name, primary_id='unique_id', - primary_type=self.db.types.string(32)) - self.fix_table(table_name) - self.db[table_name].insert_many(inserts) - # print(table_name,inserts) + trans.query('update `%s` set unique_id = "%s" where unique_id = "%s"' % (table_name, prev_uid, uid)) + for (table_name, inserts) in self._inserts.items(): + if '_' not in table_name and table_name not in trans: + trans.create_table(table_name, primary_id='unique_id', + primary_type=trans.types.string(32)) + if 'sqlite' not in self.path: + self.fix_table(table_name) + trans[table_name].insert_many(inserts) + # print(table_name,inserts) + with dataset.connect(self.path) as trans: + pass self.db = None def create_link_tables(self, klass): @@ -239,17 +239,17 @@ def create_link_tables(self, klass): Create a link table in the database of the given trait klass """ name = self.table_name[klass] - self.get_connection() - for (tname, trait) in klass.class_traits().items(): - if isinstance(trait, MetList): - table_name = '_'.join([name, tname]) - if table_name not in self.db: - self.db.create_table(table_name) - link = dict(source_id=uuid.uuid4().hex, - head_id=uuid.uuid4().hex, - target_id=uuid.uuid4().hex, - target_table=uuid.uuid4().hex) - self.db[table_name].insert(link) + with dataset.connect(self.path) as trans: + for (tname, trait) in klass.class_traits().items(): + if isinstance(trait, MetList): + table_name = '_'.join([name, tname]) + if table_name not in trans: + trans.create_table(table_name) + link = dict(source_id=uuid.uuid4().hex, + head_id=uuid.uuid4().hex, + target_id=uuid.uuid4().hex, + target_table=uuid.uuid4().hex) + trans[table_name].insert(link) self.db = None def _get_save_data(self, obj, override=False): @@ -322,90 +322,88 @@ def retrieve(self, object_type, **kwargs): """Retrieve an object from the database.""" object_type = object_type.lower() klass = self.subclass_lut.get(object_type, None) - # with dataset.connect(self.path) as db: - # self.db = - self.get_connection() - if object_type not in self.db: - if not klass: - raise ValueError('Unknown object type: %s' % object_type) - object_type = self.tablename_lut[klass] - if '_' not in object_type: - if kwargs.get('username', '') in ['*', 'all']: - kwargs.pop('username') - else: - kwargs.setdefault('username', getpass.getuser()) - # Example query if group id is given - # SELECT * - # FROM tablename - # WHERE (city = 'New York' AND name like 'IBM%') - - # Example query where unique id and group id are not given - # (to avoid getting all versions of the same object) - # http://stackoverflow.com/a/12102288 - # SELECT * - # from (SELECT * from `groups` - # WHERE (name='spam') ORDER BY last_modified) - # x GROUP BY head_id - query = 'select * from `%s` where (' % object_type - clauses = [] - for (key, value) in kwargs.items(): - if type(value) is list and len(value)>0: - clauses.append('%s in ("%s")' % (key, '", "'.join(value))) - elif not isinstance(value, six.string_types): - clauses.append("%s = %s" % (key, value)) - elif '%%' in value: - clauses.append('%s = "%s"' % (key, value.replace('%%', '%'))) - elif '%' in value: - 
clauses.append('%s like "%s"' % (key, value.replace('*', '%'))) - else: - clauses.append('%s = "%s"' % (key, value)) - if 'unique_id' not in kwargs and klass: - clauses.append('unique_id = head_id') - query += ' and '.join(clauses) + ')' - if not clauses: - query = query.replace(' where ()', '') - try: - items = [i for i in self.db.query(query)] - except Exception as e: - if 'Unknown column' in str(e): - keys = [k for k in klass.class_traits().keys() - if not k.startswith('_')] - raise ValueError('Invalid column name, valid columns: %s' % keys) - else: - raise(e) - #print(query+'\n') - # print('tables:') - # print([t for t in self.db.query('show tables')]) - items = [klass(**i) for i in items] - uids = [i.unique_id for i in items] - if not items: - return [] - # get stubs for each of the list items - for (tname, trait) in items[0].traits().items(): - if isinstance(trait, List): - table_name = '_'.join([object_type, tname]) - if table_name not in self.db: + with dataset.connect(self.path) as trans: + if object_type not in trans: + if not klass: + raise ValueError('Unknown object type: %s' % object_type) + object_type = self.tablename_lut[klass] + if '_' not in object_type: + if kwargs.get('username', '') in ['*', 'all']: + kwargs.pop('username') + else: + kwargs.setdefault('username', getpass.getuser()) + # Example query if group id is given + # SELECT * + # FROM tablename + # WHERE (city = 'New York' AND name like 'IBM%') + + # Example query where unique id and group id are not given + # (to avoid getting all versions of the same object) + # http://stackoverflow.com/a/12102288 + # SELECT * + # from (SELECT * from `groups` + # WHERE (name='spam') ORDER BY last_modified) + # x GROUP BY head_id + query = 'select * from `%s` where (' % object_type + clauses = [] + for (key, value) in kwargs.items(): + if type(value) is list and len(value)>0: + clauses.append('%s in ("%s")' % (key, '", "'.join(value))) + elif not isinstance(value, six.string_types): + clauses.append("%s = %s" % (key, value)) + elif '%%' in value: + clauses.append('%s = "%s"' % (key, value.replace('%%', '%'))) + elif '%' in value: + clauses.append('%s like "%s"' % (key, value.replace('*', '%'))) + else: + clauses.append('%s = "%s"' % (key, value)) + if 'unique_id' not in kwargs and klass: + clauses.append('unique_id = head_id') + query += ' and '.join(clauses) + ')' + if not clauses: + query = query.replace(' where ()', '') + try: + items = [i for i in trans.query(query)] + except Exception as e: + if 'Unknown column' in str(e): + keys = [k for k in klass.class_traits().keys() + if not k.startswith('_')] + raise ValueError('Invalid column name, valid columns: %s' % keys) + else: + raise(e) + #print(query+'\n') + # print('tables:') + # print([t for t in self.db.query('show tables')]) + items = [klass(**i) for i in items] + uids = [i.unique_id for i in items] + if not items: + return [] + # get stubs for each of the list items + for (tname, trait) in items[0].traits().items(): + if isinstance(trait, List): + table_name = '_'.join([object_type, tname]) + if table_name not in trans: + for i in items: + setattr(i, tname, []) + continue + querystr = 'select * from `%s` where source_id in ("' % table_name + querystr += '" , "'.join(uids) + #print(querystr+'\n') + result = trans.query(querystr + '")') + sublist = defaultdict(list) + for r in result: + stub = Stub(unique_id=r['target_id'], + object_type=r['target_table']) + sublist[r['source_id']].append(stub) for i in items: - setattr(i, tname, []) - continue - querystr = 'select * from 
`%s` where source_id in ("' % table_name - querystr += '" , "'.join(uids) - #print(querystr+'\n') - result = self.db.query(querystr + '")') - sublist = defaultdict(list) - for r in result: - stub = Stub(unique_id=r['target_id'], - object_type=r['target_table']) - sublist[r['source_id']].append(stub) - for i in items: - setattr(i, tname, sublist[i.unique_id]) - elif isinstance(trait, MetInstance): - pass - for i in items: - if not i.prev_uid: - i.prev_uid = 'origin' - i._changed = False - items.sort(key=lambda x: x.last_modified) + setattr(i, tname, sublist[i.unique_id]) + elif isinstance(trait, MetInstance): + pass + for i in items: + if not i.prev_uid: + i.prev_uid = 'origin' + i._changed = False + items.sort(key=lambda x: x.last_modified) return items @@ -445,34 +443,34 @@ def remove(self, object_type, **kwargs): clauses.append('%s = "%s"' % (key, value)) query += ' and '.join(clauses) query += ')' - self.get_connection() if not clauses: query = query.replace(' where ()', '') - # check for lists items that need removal - if any([isinstance(i, MetList) for i in klass.class_traits().values()]): - uid_query = query.replace('delete ', 'select unique_id ') - uids = [i['unique_id'] for i in self.db.query(uid_query)] - sub_query = 'delete from `%s` where source_id in ("%s")' - for (tname, trait) in klass.class_traits().items(): - table_name = '%s_%s' % (object_type, tname) - if not uids or table_name not in self.db: - continue - if isinstance(trait, MetList): - table_query = sub_query % (table_name, '", "'.join(uids)) - try: - self.db.query(table_query) - except Exception as e: - print(e) - try: - self.db.query(query) - except Exception as e: - if 'Unknown column' in str(e): - keys = [k for k in klass.class_traits().keys() - if not k.startswith('_')] - raise ValueError('Invalid column name, valid columns: %s' % keys) - else: - raise(e) - print('Removed') + with dataset.connect(self.path) as trans: + # check for lists items that need removal + if any([isinstance(i, MetList) for i in klass.class_traits().values()]): + uid_query = query.replace('delete ', 'select unique_id ') + uids = [i['unique_id'] for i in trans.query(uid_query)] + sub_query = 'delete from `%s` where source_id in ("%s")' + for (tname, trait) in klass.class_traits().items(): + table_name = '%s_%s' % (object_type, tname) + if not uids or table_name not in trans: + continue + if isinstance(trait, MetList): + table_query = sub_query % (table_name, '", "'.join(uids)) + try: + trans.query(table_query) + except Exception as e: + print(e) + try: + trans.query(query) + except Exception as e: + if 'Unknown column' in str(e): + keys = [k for k in klass.class_traits().keys() + if not k.startswith('_')] + raise ValueError('Invalid column name, valid columns: %s' % keys) + else: + raise(e) + print('Removed') self.db = None def remove_objects(self, objects, all_versions=True, **kwargs): @@ -496,26 +494,26 @@ def remove_objects(self, objects, all_versions=True, **kwargs): ids = defaultdict(list) username = getpass.getuser() attr = 'head_id' if all_versions else 'unique_id' - self.get_connection() - for obj in objects: - if not override and obj.username != username: - continue - name = self.tablename_lut[obj.__class__] - ids[name].append(getattr(obj, attr)) - # remove list items as well - for (tname, trait) in obj.traits().items(): - if isinstance(trait, MetList): - subname = '%s_%s' % (name, tname) - ids[subname].append(getattr(obj, attr)) - for (table_name, uids) in ids.items(): - if table_name not in self.db: - continue - query = 'delete 
from `%s` where %s in ("'
-            query = query % (table_name, attr)
-            query += '" , "'.join(uids)
-            query += '")'
-            self.db.query(query)
-        print(('Removed %s object(s)' % len(objects)))
+        with dataset.connect(self.path) as trans:
+            for obj in objects:
+                if not override and obj.username != username:
+                    continue
+                name = self.tablename_lut[obj.__class__]
+                ids[name].append(getattr(obj, attr))
+                # remove list items as well
+                for (tname, trait) in obj.traits().items():
+                    if isinstance(trait, MetList):
+                        subname = '%s_%s' % (name, tname)
+                        ids[subname].append(getattr(obj, attr))
+            for (table_name, uids) in ids.items():
+                if table_name not in trans:
+                    continue
+                query = 'delete from `%s` where %s in ("'
+                query = query % (table_name, attr)
+                query += '" , "'.join(uids)
+                query += '")'
+                trans.query(query)
+            print(('Removed %s object(s)' % len(objects)))
         self.db = None
 
diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py
index 5868a3c5..ae8aecaa 100644
--- a/metatlas/io/write_utils.py
+++ b/metatlas/io/write_utils.py
@@ -5,8 +5,6 @@
 import os
 import tempfile
 
-import pandas as pd
-
 logger = logging.getLogger(__name__)
 
diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb
index a56a0b69..32dd87f4 100644
--- a/notebooks/reference/Targeted.ipynb
+++ b/notebooks/reference/Targeted.ipynb
@@ -22,7 +22,7 @@
     "# pylint: disable=invalid-name,missing-module-docstring\n",
     "\n",
     "# source atlas name\n",
-    "source_atlas = 'REPLACE ME'\n",
+    "source_atlas = \"REPLACE ME\"\n",
     "\n",
     "# this atlas will be copied to an atlas named projectId_experimentName_sampleSet_polarity_analysisId\n",
     "# where projectId is JGI Proposal ID Number\n",
@@ -148,7 +148,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ids = mads.AnalysisIdentifiers(source_atlas, experiment, output_type, polarity, analysis_number, project_directory)"
+    "ids = mads.AnalysisIdentifiers(\n",
+    "    source_atlas, experiment, output_type, polarity, analysis_number, project_directory\n",
+    ")"
    ]
   },
  {
diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/fixtures/analysis_identifiers_fixtures.py b/tests/fixtures/analysis_identifiers_fixtures.py
deleted file mode 100644
index 5ac3ef25..00000000
--- a/tests/fixtures/analysis_identifiers_fixtures.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# pylint: disable=missing-function-docstring, missing-module-docstring
-
-import pytest
-
-from metatlas.datastructures import metatlas_dataset as mads
-
-
-@pytest.fixture(name="analysis_ids")
-def fixture_analysis_ids(tmp_path, mocker):
-    mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True])
-    return mads.AnalysisIdentifiers(
-        "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0",
-        "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583",
-        "FinalEMA-HILIC",
-        "positive",
-        0,
-        str(tmp_path),
-        username="root",
-    )
diff --git a/tests/fixtures/database.py b/tests/fixtures/database.py
deleted file mode 100644
index 7064831d..00000000
--- a/tests/fixtures/database.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# /pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring
-
-import getpass
-import os
-import sqlite3
-import pytest
-from metatlas.datastructures import metatlas_objects as metob
-
-
-@pytest.fixture(name="sqlite")
-def fixture_sqlite(tmp_path, monkeypatch):
-    # make sure we don't accidentally pollute the production MySQL DB
-    monkeypatch.setenv("METATLAS_LOCAL", "TRUE")
-    db_path = 
tmp_path / "workspace.db" - monkeypatch.setenv("METATLAS_SQLITE", str(db_path)) - sqlite3.connect(db_path).close() - dummy = metob.Atlas() - dummy.name = "this is a dummy atlas to initialize sqlite db" - metob.store(dummy) diff --git a/tests/fixtures/hdf5_fixtures.py b/tests/fixtures/hdf5_fixtures.py deleted file mode 100644 index cfbb6d36..00000000 --- a/tests/fixtures/hdf5_fixtures.py +++ /dev/null @@ -1,410 +0,0 @@ -# pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring - -import pytest -import pandas as pd - - -@pytest.fixture(name="ms1_pos") -def fixture_ms1_pos(): - return pd.DataFrame( - data={ - "mz": { - "0": 252.1089324951, - "1": 252.1090087891, - "2": 252.1088104248, - "3": 252.1090087891, - "4": 252.10887146, - "5": 252.1089324951, - "6": 252.1089324951, - "7": 252.1088256836, - "8": 252.1088867188, - "9": 252.1090393066, - "10": 252.1089782715, - "11": 252.1089630127, - "12": 252.1089630127, - "13": 252.1089782715, - "14": 252.1090240479, - "15": 252.1089782715, - "16": 252.1090240479, - "17": 252.1089324951, - "18": 252.1090393066, - "19": 252.1088867188, - "20": 252.10887146, - "21": 252.1089324951, - "22": 252.1089630127, - "23": 252.1089935303, - "24": 252.1089172363, - "25": 252.1089477539, - "26": 252.1090545654, - "27": 252.1089630127, - "28": 252.1090240479, - "29": 252.1090087891, - "30": 252.1090393066, - "31": 252.1090240479, - "32": 252.1089935303, - "33": 252.1090240479, - "34": 252.1089630127, - "35": 252.1090087891, - "36": 252.1090240479, - "37": 252.1089172363, - "38": 252.1089019775, - "39": 252.1089477539, - "40": 252.1089324951, - "41": 252.1089477539, - "42": 252.1089477539, - "43": 252.1089477539, - "44": 252.1089782715, - "45": 252.1088867188, - "46": 252.1089172363, - "47": 252.1089324951, - "48": 252.1089782715, - "49": 252.1089477539, - "50": 252.1089172363, - "51": 252.1089324951, - "52": 252.1089630127, - "53": 252.1088867188, - "54": 252.1089630127, - "55": 252.1085205078, - "56": 252.1090545654, - "57": 252.1089935303, - "58": 252.1088104248, - "59": 252.1086578369, - "60": 252.1089935303, - "61": 252.1085510254, - "62": 252.1082763672, - "63": 252.1082458496, - "64": 252.1084136963, - "65": 252.1092224121, - "66": 252.1091766357, - "67": 252.1092834473, - "68": 252.1087493896, - "69": 252.1112518311, - "70": 252.1088409424, - "71": 252.1086425781, - "72": 252.1091766357, - "73": 252.1094055176, - }, - "i": { - "0": 312203.5, - "1": 387914.59375, - "2": 308308.5, - "3": 334653.59375, - "4": 339521.625, - "5": 345527.21875, - "6": 292437.34375, - "7": 413614.53125, - "8": 300285.28125, - "9": 383848.71875, - "10": 404313.21875, - "11": 377231.34375, - "12": 453965.5625, - "13": 431327.0, - "14": 523180.0625, - "15": 510239.8125, - "16": 631459.1875, - "17": 807419.5, - "18": 842647.5625, - "19": 1053031.625, - "20": 1082361.625, - "21": 1198966.625, - "22": 1109162.375, - "23": 1126347.125, - "24": 1373071.5, - "25": 1589018.375, - "26": 1281309.875, - "27": 1660166.75, - "28": 1492912.25, - "29": 2029801.5, - "30": 2029874.125, - "31": 2035966.625, - "32": 2010867.875, - "33": 2036981.375, - "34": 2148879.25, - "35": 2359861.25, - "36": 2054066.125, - "37": 1691976.0, - "38": 1778159.125, - "39": 1776166.125, - "40": 1752154.125, - "41": 1575676.875, - "42": 1199910.625, - "43": 1259708.25, - "44": 1087384.375, - "45": 826077.125, - "46": 802296.875, - "47": 547785.125, - "48": 545340.0625, - "49": 584624.4375, - "50": 468524.8125, - "51": 305931.1875, - "52": 330310.34375, - "53": 309740.625, - "54": 289212.71875, 
- "55": 230440.9375, - "56": 210549.390625, - "57": 169972.390625, - "58": 140521.234375, - "59": 116637.953125, - "60": 117197.625, - "61": 84652.1171875, - "62": 117615.578125, - "63": 103500.921875, - "64": 89320.9453125, - "65": 76313.9296875, - "66": 55575.00390625, - "67": 76784.6796875, - "68": 28829.162109375, - "69": 26051.6171875, - "70": 42957.18359375, - "71": 50342.6953125, - "72": 37611.33984375, - "73": 38202.83203125, - }, - "rt": { - "0": 2.1030805111, - "1": 2.1084616184, - "2": 2.1139531136, - "3": 2.1193552017, - "4": 2.1248509884, - "5": 2.1302509308, - "6": 2.135682106, - "7": 2.1411821842, - "8": 2.1459801197, - "9": 2.1513926983, - "10": 2.1568279266, - "11": 2.1622362137, - "12": 2.1676549911, - "13": 2.1730883121, - "14": 2.179015398, - "15": 2.1845297813, - "16": 2.1900422573, - "17": 2.1949694157, - "18": 2.20002985, - "19": 2.2055358887, - "20": 2.2110378742, - "21": 2.2165191174, - "22": 2.2219588757, - "23": 2.2273921967, - "24": 2.2328462601, - "25": 2.2382712364, - "26": 2.2437169552, - "27": 2.2492566109, - "28": 2.2547125816, - "29": 2.2601687908, - "30": 2.2656960487, - "31": 2.2704958916, - "32": 2.2758042812, - "33": 2.2813498974, - "34": 2.2868082523, - "35": 2.2922415733, - "36": 2.2976748943, - "37": 2.3031060696, - "38": 2.308131218, - "39": 2.313628912, - "40": 2.3185498714, - "41": 2.3239560127, - "42": 2.3293914795, - "43": 2.3349123001, - "44": 2.3403663635, - "45": 2.346799612, - "46": 2.3522267342, - "47": 2.3576600552, - "48": 2.3631224632, - "49": 2.3685662746, - "50": 2.3740911484, - "51": 2.3794057369, - "52": 2.3848536015, - "53": 2.3903660774, - "54": 2.3953785896, - "55": 2.4006638527, - "56": 2.4062638283, - "57": 2.411709547, - "58": 2.4171659946, - "59": 2.4226117134, - "60": 2.4302260876, - "61": 2.4357616901, - "62": 2.4407405853, - "63": 2.4461927414, - "64": 2.451615572, - "65": 2.4571509361, - "66": 2.4627010822, - "67": 2.4681572914, - "68": 2.4735822678, - "69": 2.4735822678, - "70": 2.4787945747, - "71": 2.4842174053, - "72": 2.4896612167, - "73": 2.495146513, - }, - "polarity": { - "0": 1, - "1": 1, - "2": 1, - "3": 1, - "4": 1, - "5": 1, - "6": 1, - "7": 1, - "8": 1, - "9": 1, - "10": 1, - "11": 1, - "12": 1, - "13": 1, - "14": 1, - "15": 1, - "16": 1, - "17": 1, - "18": 1, - "19": 1, - "20": 1, - "21": 1, - "22": 1, - "23": 1, - "24": 1, - "25": 1, - "26": 1, - "27": 1, - "28": 1, - "29": 1, - "30": 1, - "31": 1, - "32": 1, - "33": 1, - "34": 1, - "35": 1, - "36": 1, - "37": 1, - "38": 1, - "39": 1, - "40": 1, - "41": 1, - "42": 1, - "43": 1, - "44": 1, - "45": 1, - "46": 1, - "47": 1, - "48": 1, - "49": 1, - "50": 1, - "51": 1, - "52": 1, - "53": 1, - "54": 1, - "55": 1, - "56": 1, - "57": 1, - "58": 1, - "59": 1, - "60": 1, - "61": 1, - "62": 1, - "63": 1, - "64": 1, - "65": 1, - "66": 1, - "67": 1, - "68": 1, - "69": 1, - "70": 1, - "71": 1, - "72": 1, - "73": 1, - }, - } - ) - - -@pytest.fixture(name="ms2_pos") -def fixture_ms2_pos(): - return pd.DataFrame( - data={ - "mz": { - "0": 252.1081695557, - "1": 252.1564941406, - "2": 252.1087036133, - "3": 252.1572875977, - "4": 252.1089019775, - "5": 252.1550292969, - "6": 252.1090698242, - "7": 252.1557617188, - }, - "i": { - "0": 32103.3515625, - "1": 6470.0009765625, - "2": 93112.0859375, - "3": 7624.11328125, - "4": 131062.0, - "5": 6535.4560546875, - "6": 76976.7265625, - "7": 6090.6440429688, - }, - "rt": { - "0": 2.0097544193, - "1": 2.0097544193, - "2": 2.2203779221, - "3": 2.2203779221, - "4": 2.327804327, - "5": 2.327804327, - "6": 2.3452186584, - "7": 
2.3452186584, - }, - "polarity": {"0": 1, "1": 1, "2": 1, "3": 1, "4": 1, "5": 1, "6": 1, "7": 1}, - "precursor_MZ": { - "0": 252.0195159912, - "1": 252.0195159912, - "2": 252.10887146, - "3": 252.10887146, - "4": 252.0194854736, - "5": 252.0194854736, - "6": 252.1089477539, - "7": 252.1089477539, - }, - "precursor_intensity": { - "0": 2748235.5, - "1": 2748235.5, - "2": 2872807.5, - "3": 2872807.5, - "4": 3536752.25, - "5": 3536752.25, - "6": 3046732.75, - "7": 3046732.75, - }, - "collision_energy": { - "0": 23.3333339691, - "1": 23.3333339691, - "2": 23.3333339691, - "3": 23.3333339691, - "4": 23.3333339691, - "5": 23.3333339691, - "6": 23.3333339691, - "7": 23.3333339691, - }, - } - ) - - -@pytest.fixture(name="ms1_neg_empty") -def fixture_ms1_neg_empty(): - return pd.DataFrame(data={"mz": {}, "i": {}, "rt": {}, "polarity": {}}) - - -@pytest.fixture(name="ms2_neg_empty") -def fixture_ms2_neg_empty(): - return pd.DataFrame( - data={ - "mz": {}, - "i": {}, - "rt": {}, - "polarity": {}, - "precursor_MZ": {}, - "precursor_intensity": {}, - "collision_energy": {}, - } - ) - - -@pytest.fixture -def df_container(ms1_pos, ms2_pos, ms1_neg_empty, ms2_neg_empty): - return {"ms1_neg": ms1_neg_empty, "ms1_pos": ms1_pos, "ms2_neg": ms2_neg_empty, "ms2_pos": ms2_pos} diff --git a/tests/fixtures/metatlas_dataset_fixtures.py b/tests/fixtures/metatlas_dataset_fixtures.py deleted file mode 100644 index 070eacfc..00000000 --- a/tests/fixtures/metatlas_dataset_fixtures.py +++ /dev/null @@ -1,336 +0,0 @@ -# pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring, too-many-arguments - -import pytest -import numpy as np - -from metatlas.datastructures import metatlas_dataset as mads - - -@pytest.fixture(name="ms1_summary") -def fixture_ms1_summary(): - return { - "num_ms1_datapoints": 85.0, - "mz_peak": 252.1092987060547, - "rt_peak": 2.2775044441223145, - "mz_centroid": 252.10915042669814, - "rt_centroid": 2.218492414487913, - "peak_height": 304761.90625, - "peak_area": 7696977.46875, - } - - -@pytest.fixture(name="msms") -def fixture_msms(): - return { - "data": { - "mz": np.array([], dtype=np.float64), - "i": np.array([], dtype=np.float64), - "rt": np.array([], dtype=np.float64), - "polarity": np.array([], dtype=np.float64), - "precursor_MZ": np.array([], dtype=np.float64), - "precursor_intensity": np.array([], dtype=np.float64), - "collision_energy": np.array([], dtype=np.float64), - } - } - - -@pytest.fixture(name="groups_controlled_vocab") -def fixture_groups_controlled_vocab(): - return ["QC", "InjBl", "ISTD"] - - -@pytest.fixture(name="metatlas_dataset") -def fixture_metatlas_dataset(mocker, df_container, analysis_ids, groups_controlled_vocab, lcmsrun, atlas): - mocker.patch( - "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container - ) - mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - mocker.patch("metatlas.datastructures.metatlas_dataset.get_atlas", return_value=atlas) - return mads.MetatlasDataset(analysis_ids, groups_controlled_vocab) - - -@pytest.fixture(name="metatlas_dataset_with_2_cids") -def fixture_metatlas_dataset_with_2_cids( - mocker, df_container, analysis_ids, groups_controlled_vocab, atlas_with_2_cids, lcmsrun -): - mocker.patch( - "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container - ) - mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - 
mocker.patch("metatlas.datastructures.metatlas_dataset.get_atlas", return_value=atlas_with_2_cids) - return mads.MetatlasDataset(analysis_ids, groups_controlled_vocab) - - -@pytest.fixture(name="eic") -def fixture_eic(): - return { - "mz": [ - 252.1089324951172, - 252.10943603515625, - 252.10926818847656, - 252.109375, - 252.10923767089844, - 252.10910034179688, - 252.10914611816406, - 252.1089630126953, - 252.10971069335938, - 252.1093292236328, - 252.10934448242188, - 252.109130859375, - 252.10935974121094, - 252.10939025878906, - 252.1090545654297, - 252.10916137695312, - 252.10946655273438, - 252.10923767089844, - 252.1093292236328, - 252.10919189453125, - 252.10914611816406, - 252.10897827148438, - 252.10934448242188, - 252.10928344726562, - 252.10888671875, - 252.10926818847656, - 252.109130859375, - 252.1090087890625, - 252.10934448242188, - 252.10939025878906, - 252.1093292236328, - 252.1091766357422, - 252.109130859375, - 252.1095428466797, - 252.10890197753906, - 252.1095428466797, - 252.109130859375, - 252.10911560058594, - 252.1091766357422, - 252.1088409423828, - 252.10916137695312, - 252.10935974121094, - 252.10928344726562, - 252.10922241210938, - 252.10914611816406, - 252.10922241210938, - 252.10894775390625, - 252.10906982421875, - 252.10914611816406, - 252.10916137695312, - 252.10910034179688, - 252.10916137695312, - 252.10934448242188, - 252.10899353027344, - 252.10928344726562, - 252.10897827148438, - 252.10916137695312, - 252.10928344726562, - 252.1092987060547, - 252.1089324951172, - 252.10914611816406, - 252.1090545654297, - 252.10914611816406, - 252.1090850830078, - 252.10894775390625, - 252.10914611816406, - 252.10911560058594, - 252.1090850830078, - 252.109130859375, - 252.10903930664062, - 252.10890197753906, - 252.109130859375, - 252.10885620117188, - 252.10914611816406, - 252.10926818847656, - 252.10888671875, - 252.109619140625, - 252.10922241210938, - 252.1092529296875, - 252.1099853515625, - 252.10972595214844, - 252.10910034179688, - 252.10935974121094, - 252.1088409423828, - 252.10838317871094, - 252.11212158203125, - ], - "rt": [ - 1.7180122137069702, - 1.8222843408584595, - 1.838305115699768, - 1.8444031476974487, - 1.8705799579620361, - 1.875998616218567, - 1.8913277387619019, - 1.9020838737487793, - 1.9127358198165894, - 1.9397128820419312, - 1.9451169967651367, - 1.9505127668380737, - 1.955920934677124, - 1.966427206993103, - 1.9718105792999268, - 1.9769750833511353, - 1.9823375940322876, - 1.987752079963684, - 1.9932082891464233, - 1.9986457824707031, - 2.0094456672668457, - 2.019866466522217, - 2.030582904815674, - 2.036003589630127, - 2.0568389892578125, - 2.062201499938965, - 2.0675911903381348, - 2.0834577083587646, - 2.088857650756836, - 2.0939910411834717, - 2.099109649658203, - 2.104536771774292, - 2.1208388805389404, - 2.1262447834014893, - 2.1420176029205322, - 2.152921676635742, - 2.15836763381958, - 2.163788318634033, - 2.169198751449585, - 2.1755259037017822, - 2.180954933166504, - 2.18635892868042, - 2.191038131713867, - 2.1964569091796875, - 2.2018840312957764, - 2.2069132328033447, - 2.21236515045166, - 2.2177650928497314, - 2.2228589057922363, - 2.2283151149749756, - 2.2338151931762695, - 2.239321231842041, - 2.244842052459717, - 2.250317096710205, - 2.255610704421997, - 2.261033535003662, - 2.2665293216705322, - 2.2720251083374023, - 2.2775044441223145, - 2.28295636177063, - 2.288454294204712, - 2.29386043548584, - 2.299298048019409, - 2.304720878601074, - 2.310127019882202, - 2.3155603408813477, - 2.320981025695801, - 
2.326420545578003, - 2.33160400390625, - 2.3370935916900635, - 2.3428516387939453, - 2.3483099937438965, - 2.3535475730895996, - 2.3589975833892822, - 2.364443302154541, - 2.3699119091033936, - 2.375347375869751, - 2.3808369636535645, - 2.3862972259521484, - 2.3917577266693115, - 2.397282600402832, - 2.402780294418335, - 2.4081971645355225, - 2.419055461883545, - 2.457223892211914, - 3.3080079555511475, - ], - "intensity": [ - 34249.71484375, - 28511.658203125, - 41718.13671875, - 33448.546875, - 40190.94140625, - 32525.16015625, - 37058.60546875, - 51132.91015625, - 36473.0546875, - 42659.0859375, - 45187.6171875, - 51186.30078125, - 58456.5859375, - 43299.24609375, - 52062.02734375, - 42501.8671875, - 39734.91015625, - 41848.02734375, - 48979.640625, - 42957.48046875, - 54214.27734375, - 63583.64453125, - 38661.046875, - 47146.54296875, - 36974.3046875, - 37674.35546875, - 37412.4609375, - 47036.44921875, - 32295.888671875, - 39751.12109375, - 47359.0, - 57496.41796875, - 33690.4765625, - 36853.53515625, - 33045.0703125, - 33235.64453125, - 52481.1015625, - 48210.37109375, - 62178.734375, - 73049.2109375, - 52741.03125, - 88225.1953125, - 101593.296875, - 127965.625, - 124079.859375, - 134410.46875, - 148749.0, - 134068.8125, - 141625.515625, - 202721.015625, - 204341.703125, - 172160.484375, - 185859.765625, - 195729.234375, - 216657.453125, - 239248.65625, - 172232.296875, - 195105.046875, - 304761.90625, - 181052.265625, - 222467.5625, - 251571.53125, - 205874.765625, - 224279.0625, - 173697.359375, - 236325.078125, - 153999.28125, - 156835.59375, - 118963.8046875, - 105766.234375, - 103081.484375, - 97180.5625, - 95681.4140625, - 74239.0703125, - 69208.8984375, - 60604.1484375, - 37020.84765625, - 32874.484375, - 24641.875, - 23305.75, - 23413.94140625, - 42582.77734375, - 35980.16796875, - 25743.97265625, - 21777.99609375, - 59454.40234375, - ], - } - - -@pytest.fixture(name="atlas_df") -def fixture_atlas_df(metatlas_dataset): - return metatlas_dataset.atlas_df diff --git a/tests/fixtures/metatlas_object_fixtures.py b/tests/fixtures/metatlas_object_fixtures.py deleted file mode 100644 index 81945355..00000000 --- a/tests/fixtures/metatlas_object_fixtures.py +++ /dev/null @@ -1,646 +0,0 @@ -# /pylint: disable=line-too-long, missing-function-docstring, missing-module-docstring - -import pandas as pd -import pytest -from metatlas.datastructures import metatlas_objects as metob - - -@pytest.fixture(name="compound") -def fixture_compound(): - compound = metob.Compound() - compound.unique_id = "60cd6743e56545c6a6cb066ec3553450" - compound.mono_isotopic_molecular_weight = 251.101839276 - compound.creation_time = 1466212395 - compound.synonyms = "2'-deoxyadenosine" # value was pruned down - compound.inchi_key = "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" - compound.chebi_url = "http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:17256" - compound.permanent_charge = 0 - compound.img_abc_id = "" - compound.neutralized_2d_inchi = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)" # noqa: E501 - compound.lipidmaps_url = "" - compound.source = "gnps///chebi///metacyc///hmdb" - compound.kegg_url = "http://www.genome.jp/dbget-bin/www_bget?C00559" - compound.hmdb_url = "http://www.hmdb.ca/metabolites/HMDB00101" - compound.wikipedia_url = "" - compound.head_id = "60cd6743e56545c6a6cb066ec3553450" - compound.formula = "C10H13N5O3" - compound.number_components = 1 - compound.iupac_name = "" - compound.username = "wjholtz" - compound.pubchem_compound_id = 
"13730" - compound.description = "A purine 2'-deoxyribonucleoside having adenine as the nucleobase." - compound.metacyc_id = "DEOXYADENOSINE" - compound.kegg_id = "C00559" - compound.hmdb_id = "HMDB00101" - compound.chebi_id = "CHEBI:17256" - compound.inchi = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1" # noqa: E501 - compound.neutralized_inchi_key = "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" - compound.prev_uid = "origin" - compound.neutralized_inchi = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1" # noqa: E501 - compound.name = "2'-deoxyadenosine" - compound.neutralized_2d_inchi_key = "OLXZPDWKRNYJJZ-UHFFFAOYSA-N" - compound.num_free_radicals = 0 - compound.lipidmaps_id = "" - compound.last_modified = 1612996604 - compound.pubchem_url = "http://pubchem.ncbi.nlm.nih.gov/compound/13730" - return compound - - -@pytest.fixture(name="rt_reference") -def fixture_rt_reference(): - rt_ref = metob.RtReference() - rt_ref.unique_id = "a845ddfdf8ef4713bcef3bdb84999030" - rt_ref.username = "wjholtz" - rt_ref.rt_units = "min" - rt_ref.description = "No description" - rt_ref.rt_peak = "2.1964640053707174" - rt_ref.enabled = True - rt_ref.creation_time = 1613002850 - rt_ref.lcms_run = None - rt_ref.rt_min = 1.6964640053707174 - rt_ref.last_modified = 1613002979 - rt_ref.ref_type = "" - rt_ref.prev_uid = "origin" - rt_ref.rt_max = 2.6964640053707174 - rt_ref.name = "Untitled" - rt_ref.head_id = "a845ddfdf8ef4713bcef3bdb84999030" - return rt_ref - - -@pytest.fixture(name="mz_reference") -def fixture_mz_reference(): - mz_ref = metob.MzReference() - mz_ref.unique_id = "eb6d03c9ef574051b92dad7b2fc259a2" - mz_ref.username = "wjholtz" - mz_ref.adduct = "[M+H]+" - mz_ref.description = "No description" - mz_ref.mz_tolerance_units = "ppm" - mz_ref.enabled = True - mz_ref.mz = 252.1091393 - mz_ref.creation_time = 1613002850 - mz_ref.lcms_run = None - mz_ref.mz_tolerance = 20.0 - mz_ref.last_modified = 1613002979 - mz_ref.detected_polarity = "positive" - mz_ref.modification = "" - mz_ref.ref_type = "" - mz_ref.observed_formula = "" - mz_ref.prev_uid = "origin" - mz_ref.name = "Untitled" - mz_ref.head_id = "eb6d03c9ef574051b92dad7b2fc259a2" - return mz_ref - - -@pytest.fixture(name="compound_identification") -def fixture_compound_identification(compound, rt_reference, mz_reference): - ident = metob.CompoundIdentification() - ident.unique_id = "18737c7141cc4efaa4545bead13ac751" - ident.username = "wjholtz" - ident.description = "No description" - ident.creation_time = 1613002849 - ident.last_modified = 1613002979 - ident.identification_grade = None - ident.prev_uid = "origin" - ident.name = "2'-deoxyadenosine" - ident.head_id = "18737c7141cc4efaa4545bead13ac751" - ident.internal_standard_to_use = "" - ident.internal_standard_id = "" - ident.do_normalization = False - ident.identification_notes = "my id note" - ident.ms2_notes = "bad match to ref" - ident.ms1_notes = "keep" - ident.frag_references = [] - ident.intensity_references = [] - ident.compound = [compound] - ident.mz_references = [mz_reference] - ident.rt_references = [rt_reference] - return ident - - -@pytest.fixture(name="atlas") -def fixture_atlas(compound_identification): - small_atlas = metob.Atlas() - small_atlas.compound_identifications = [compound_identification] - return small_atlas - - -@pytest.fixture(name="compound_2") -def fixture_compound_2(): - compound = metob.Compound() - compound.chebi_id = "CHEBI:16335" 
- compound.chebi_url = "http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:16335" - compound.creation_time = 1466212384 - compound.description = "A ribonucleoside composed of a molecule of adenine attached to a ribofuranose moiety via a beta1N9-glycosidic bond." - compound.formula = "C10H13N5O4" - compound.head_id = "1ad02275f47b4033a451e99874f4764f" - compound.hmdb_id = "HMDB00050" - compound.hmdb_url = "http://www.hmdb.ca/metabolites/HMDB00050" - compound.img_abc_id = "" - compound.inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1" - compound.inchi_key = "OIRDTQYFTABQOQ-KQYNXXCUSA-N" - compound.iupac_name = "" - compound.kegg_id = "C00212" - compound.kegg_url = "http://www.genome.jp/dbget-bin/www_bget?C00212" - compound.last_modified = 1612996604 - compound.lipidmaps_id = "" - compound.lipidmaps_url = "" - compound.metacyc_id = "ADENOSINE" - compound.mono_isotopic_molecular_weight = 267.096753896 - compound.name = "adenosine" - compound.neutralized_2d_inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)" - compound.neutralized_2d_inchi_key = "OIRDTQYFTABQOQ-UHFFFAOYSA-N" - compound.neutralized_inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1" - compound.neutralized_inchi_key = "OIRDTQYFTABQOQ-KQYNXXCUSA-N" - compound.num_free_radicals = 0 - compound.number_components = 1 - compound.permanent_charge = 0 - compound.prev_uid = "origin" - compound.pubchem_compound_id = "60961" - compound.pubchem_url = "http://pubchem.ncbi.nlm.nih.gov/compound/60961" - compound.source = "chebi///wikidata///metacyc///gnps///hmdb" - compound.synonyms = "adenosine///58-61-7///Adenocard///Adenoscan" # this value was pruned down - compound.unique_id = "1ad02275f47b4033a451e99874f4764f" - compound.username = "wjholtz" - compound.wikipedia_url = "" - return compound - - -@pytest.fixture(name="rt_reference_2") -def fixture_rt_reference_2(): - rt_ref = metob.RtReference() - rt_ref.creation_time = 1613002857 - rt_ref.description = "No description" - rt_ref.enabled = True - rt_ref.head_id = "f74622bcef924f5390ba6e127633e731" - rt_ref.last_modified = 1613002980 - rt_ref.lcms_run = None - rt_ref.name = "Untitled" - rt_ref.prev_uid = "origin" - rt_ref.ref_type = "" - rt_ref.rt_max = 3.5233184079926665 - rt_ref.rt_min = 2.5233184079926665 - rt_ref.rt_peak = 3.0233184079926665 - rt_ref.rt_units = "min" - rt_ref.unique_id = "f74622bcef924f5390ba6e127633e731" - rt_ref.username = "wjholtz" - return rt_ref - - -@pytest.fixture(name="mz_reference_2") -def fixture_mz_reference_2(): - mz_ref = metob.MzReference() - mz_ref.adduct = "[M+H]+" - mz_ref.creation_time = 1613002857 - mz_ref.description = "No description" - mz_ref.detected_polarity = "positive" - mz_ref.enabled = True - mz_ref.head_id = "b0e3cf0df44a4079be7908c6b525d3ac" - mz_ref.last_modified = 1613002980 - mz_ref.lcms_run = None - mz_ref.modification = "" - mz_ref.mz = 268.1040539 - mz_ref.mz_tolerance = 20.0 - mz_ref.mz_tolerance_units = "ppm" - mz_ref.name = "Untitled" - mz_ref.observed_formula = "" - mz_ref.prev_uid = "origin" - mz_ref.ref_type = "" - mz_ref.unique_id = "b0e3cf0df44a4079be7908c6b525d3ac" - mz_ref.username = "wjholtz" - return mz_ref - - -@pytest.fixture(name="compound_identification_2") -def fixture_compound_identification_2(compound_2, rt_reference_2, mz_reference_2): - ident = 
metob.CompoundIdentification() - ident.creation_time = 1613002856 - ident.description = "No description" - ident.do_normalization = False - ident.frag_references = [] - ident.head_id = "6cca7aa44c0e4a109f695ba980d69472" - ident.identification_grade = None - ident.identification_notes = "" - ident.intensity_references = [] - ident.internal_standard_id = "" - ident.internal_standard_to_use = "" - ident.last_modified = 1613002980 - ident.ms1_notes = "" - ident.ms2_notes = "" - ident.name = "adenosine" - ident.prev_uid = "origin" - ident.unique_id = "6cca7aa44c0e4a109f695ba980d69472" - ident.username = "wjholtz" - ident.frag_references = [] - ident.intensity_references = [] - ident.compound = [compound_2] - ident.mz_references = [mz_reference_2] - ident.rt_references = [rt_reference_2] - return ident - - -@pytest.fixture(name="atlas_with_2_cids") -def fixture_atlas_with_2_cids(compound_identification, compound_identification_2): - small_atlas = metob.Atlas() - small_atlas.compound_identifications = [ - compound_identification, - compound_identification_2, - ] - return small_atlas - - -@pytest.fixture(name="lcmsrun") -def fixture_lcmsrun(): - run = metob.LcmsRun() - run.unique_id = "7ce51039cfca4426b4e51999ac45d018" - run.username = "root" - run.hdf5_file = "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5" # noqa: E501 - run.description = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML" # noqa: E501 - run.creation_time = 1605311923 - run.sample = None - run.last_modified = 1620101765 - run.mzml_file = "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML" # noqa: E501 - run.prev_uid = "28323058b6e84a9db0f9e802544764e3" - run.method = None - run.name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML" # noqa: E501 - run.head_id = "7ce51039cfca4426b4e51999ac45d018" - run.experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" - run.injection_volume = 0.0 - run.injection_volume_units = "uL" - run.acquisition_time = 1604770080 - run.pass_qc = False - return run - - -@pytest.fixture(name="group") -def fixture_group(lcmsrun): - grp = metob.Group() - grp.items = [lcmsrun] - grp.unique_id = "61041d07b5a24ca5b88efbda8f319654" - grp.username = "root" - grp.description = "No description" - grp.creation_time = 1620146477 - grp.last_modified = 1620146477 - grp.prev_uid = "origin" - grp.name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S1" - grp.head_id = "61041d07b5a24ca5b88efbda8f319654" - grp.short_name = "POS_Cone-S1" - return grp - - -@pytest.fixture(name="group_with_2_lcmsruns") -def fixture_group_with_2_lcmsruns(lcmsrun): - grp = metob.Group() - grp.items = [lcmsrun, lcmsrun] - grp.unique_id = "61041d07b5a24ca5b88efbda8f319654" - grp.username = "root" - grp.description = "No description" - grp.creation_time = 1620146477 - grp.last_modified = 1620146477 - grp.prev_uid = "origin" - grp.name = 
"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S1" - grp.head_id = "61041d07b5a24ca5b88efbda8f319654" - grp.short_name = "POS_Cone-S1" - return grp - - -@pytest.fixture(name="hits") -def fixture_hits(): - hits_plus = pd.DataFrame( - data={ - "score": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 0.7253785748, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 0.8688691781, - }, - "num_matches": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 6, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 7, - }, - "msv_query_aligned": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ - [ - None, - None, - None, - None, - 56.7212257385, - 59.0436058044, - 71.0422821045, - 73.0214157104, - None, - 89.1910018921, - 99.0413742065, - 104.3592529297, - 104.3681869507, - 117.0548171997, - None, - 118.9432754517, - 136.0619506836, - None, - None, - None, - 145.9665527344, - 163.9772491455, - 169.9678497314, - 177.1133270264, - 187.9771575928, - 205.9878387451, - 210.9933166504, - 229.0038452148, - 252.0215606689, - 252.1087036133, - 252.1572875977, - 252.2064666748, - ], - [ - None, - None, - None, - None, - 3361.7712402344, - 6589.943359375, - 6501.9853515625, - 4987.177734375, - None, - 3257.0708007812, - 13393.138671875, - 3280.0544433594, - 4276.0112304688, - 57809.1875, - None, - 4965.7436523438, - 648640.5625, - None, - None, - None, - 11511.76171875, - 10362.68359375, - 5714.70703125, - 9354.2353515625, - 73409.0078125, - 257685.234375, - 53554.28125, - 193491.515625, - 5038.1469726562, - 93112.0859375, - 7624.11328125, - 4599.4125976562, - ], - ], - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ - [ - None, - 50.2002449036, - 55.0126533508, - 57.0280647278, - None, - None, - None, - None, - 68.2973327637, - 69.0266494751, - 73.0213851929, - None, - 74.6972732544, - 80.862159729, - 82.4692306519, - 85.0231246948, - 87.0394363403, - 92.4544296265, - 92.4610061646, - 104.3785171509, - 115.0390701294, - 126.1923675537, - 133.0496368408, - 136.0618743896, - None, - None, - None, - None, - 144.5760345459, - 181.1904449463, - 230.6756896973, - 268.1039733887, - ], - [ - None, - 87283.4296875, - 105163.625, - 246350.078125, - None, - None, - None, - None, - 81607.3046875, - 107886.640625, - 150512.90625, - None, - 99324.7109375, - 80050.4375, - 108701.53125, - 278198.71875, - 95401.265625, - 92632.890625, - 111341.5625, - 119245.7734375, - 170358.671875, - 103961.4296875, - 226297.9375, - 48576460.0, - None, - None, - None, - None, - 98098.609375, - 100016.9296875, - 119618.1015625, - 16002674.0, - 
], - ], - }, - "msv_ref_aligned": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ - [ - 57.0345, - 63.3177, - 63.3205, - 69.0344, - None, - None, - 71.0499, - 73.0292, - 84.9778, - None, - 99.0447, - None, - None, - 117.055, - 118.059, - None, - 136.062, - 137.066, - 236.709, - 253.112, - None, - None, - None, - None, - None, - None, - None, - None, - None, - 252.109, - None, - None, - ], - [ - 176328.0, - 328818.0, - 274432.0, - 197637.0, - None, - None, - 896360.0, - 1192020.0, - 378547.0, - None, - 3921880.0, - None, - None, - 15737700.0, - 266131.0, - None, - 144220000.0, - 3455270.0, - 185227.0, - 1284450.0, - None, - None, - None, - None, - None, - None, - None, - None, - None, - 20960800.0, - None, - None, - ], - ], - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ - [ - 56.7603, - None, - None, - 57.0346, - 61.0292, - 61.8182, - 64.9491, - 67.9255, - None, - None, - 73.0292, - 82.0663, - None, - None, - None, - 85.0293, - None, - None, - None, - None, - 115.04, - None, - 133.05, - 136.062, - 137.067, - 183.555, - 230.198, - 269.108, - None, - None, - None, - 268.105, - ], - [ - 35523.7, - None, - None, - 184839.0, - 43216.2, - 40066.3, - 40362.0, - 41550.6, - None, - None, - 93791.1, - 293258.0, - None, - None, - None, - 202756.0, - None, - None, - None, - None, - 184050.0, - None, - 364543.0, - 29646700.0, - 830130.0, - 51455.4, - 51206.7, - 970064.0, - None, - None, - None, - 12412800.0, - ], - ], - }, - "name": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "2'-deoxyadenosine", - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "adenosine", - }, - "adduct": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "[M+H]+", - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "[M+H]+", - }, - "inchi_key": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "OIRDTQYFTABQOQ-KQYNXXCUSA-N", - }, - "precursor_mz": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.1091393, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', 
'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.1040539, - }, - "measured_precursor_mz": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.10887146, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.103729248, - }, - "measured_precursor_intensity": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 2872807.5, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 75979424.0, - }, - "copy_index": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ - "metatlas", - "c7dddd297e104ca79caea72a90150532", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", - 2.2203779221, - ], - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ - "metatlas", - "cf5e8df145f64bf0856fbf852d1bdb64", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5", - 3.0264527798, - ], - }, - } - ) - hits_plus.index = pd.MultiIndex.from_tuples( - hits_plus["copy_index"], names=["database", "id", "file_name", "msms_scan"] - ) - hits_plus.drop(columns=["copy_index"], inplace=True) - return hits_plus diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 5c4757c3..122316b5 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,13 +1,1455 @@ """ per-directory pytest configuration -This makes the fixtures available to tests within this directory +fixtures used across multiple files should go in here """ -from glob import glob +# pylint: disable=missing-function-docstring,unused-argument +import getpass +import os +import sqlite3 -def refactor(string: str) -> str: - """python file path to module name converter""" - return string.replace("/", ".").replace("\\", ".").replace(".py", "") +import pytest +import numpy as np +import pandas as pd +from metatlas.datastructures import metatlas_dataset as mads +from metatlas.datastructures import metatlas_objects as metob -pytest_plugins = [refactor(fixture) for fixture in glob("tests/fixtures/*.py") if "__" not in fixture] + +@pytest.fixture(name="username", scope="session") +def fixture_username(): + return getpass.getuser() + + +@pytest.fixture(name="analysis_ids") +def fixture_analysis_ids(tmp_path, sqlite_with_atlas, username): + return mads.AnalysisIdentifiers( + f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + 
"FinalEMA-HILIC", + "positive", + 0, + str(tmp_path), + ) + + +@pytest.fixture(name="analysis_ids_with_2_cids") +def fixture_analysis_ids_with_2_cids(tmp_path, sqlite_with_atlas_with_2_cids, username): + return mads.AnalysisIdentifiers( + f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}1", + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "FinalEMA-HILIC", + "positive", + 0, + str(tmp_path), + ) + + +@pytest.fixture(name="sqlite") +def fixture_sqlite(username): + sqlite3.connect(f"{username}_workspace.db").close() + metob.store(metob.Atlas()) + metob.store(metob.CompoundIdentification()) + metob.store(metob.Compound()) + metob.store(metob.MzReference()) + metob.store(metob.RtReference()) + metob.store(metob.LcmsRun()) + + +@pytest.fixture(name="sqlite_with_atlas") +def fixture_sqlite_with_atlas(sqlite, atlas, username): + atlas.name = f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0" + metob.store(atlas) + + +@pytest.fixture(name="sqlite_with_atlas_with_2_cids") +def fixture_sqlite_with_atlas_with_2_cids(sqlite, atlas_with_2_cids, username): + atlas_with_2_cids.name = f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}1" + metob.store(atlas_with_2_cids) + + +@pytest.fixture(scope="function", autouse=True) +def change_test_dir(request, tmp_path): + os.chdir(tmp_path) + yield + os.chdir(request.config.invocation_dir) + + +@pytest.fixture(scope="function", autouse=True) +def set_env_vars(monkeypatch): + monkeypatch.setenv("METATLAS_LOCAL", "TRUE") + + +@pytest.fixture(name="ms1_pos") +def fixture_ms1_pos(): + return pd.DataFrame( + data={ + "mz": { + "0": 252.1089324951, + "1": 252.1090087891, + "2": 252.1088104248, + "3": 252.1090087891, + "4": 252.10887146, + "5": 252.1089324951, + "6": 252.1089324951, + "7": 252.1088256836, + "8": 252.1088867188, + "9": 252.1090393066, + "10": 252.1089782715, + "11": 252.1089630127, + "12": 252.1089630127, + "13": 252.1089782715, + "14": 252.1090240479, + "15": 252.1089782715, + "16": 252.1090240479, + "17": 252.1089324951, + "18": 252.1090393066, + "19": 252.1088867188, + "20": 252.10887146, + "21": 252.1089324951, + "22": 252.1089630127, + "23": 252.1089935303, + "24": 252.1089172363, + "25": 252.1089477539, + "26": 252.1090545654, + "27": 252.1089630127, + "28": 252.1090240479, + "29": 252.1090087891, + "30": 252.1090393066, + "31": 252.1090240479, + "32": 252.1089935303, + "33": 252.1090240479, + "34": 252.1089630127, + "35": 252.1090087891, + "36": 252.1090240479, + "37": 252.1089172363, + "38": 252.1089019775, + "39": 252.1089477539, + "40": 252.1089324951, + "41": 252.1089477539, + "42": 252.1089477539, + "43": 252.1089477539, + "44": 252.1089782715, + "45": 252.1088867188, + "46": 252.1089172363, + "47": 252.1089324951, + "48": 252.1089782715, + "49": 252.1089477539, + "50": 252.1089172363, + "51": 252.1089324951, + "52": 252.1089630127, + "53": 252.1088867188, + "54": 252.1089630127, + "55": 252.1085205078, + "56": 252.1090545654, + "57": 252.1089935303, + "58": 252.1088104248, + "59": 252.1086578369, + "60": 252.1089935303, + "61": 252.1085510254, + "62": 252.1082763672, + "63": 252.1082458496, + "64": 252.1084136963, + "65": 252.1092224121, + "66": 252.1091766357, + "67": 252.1092834473, + "68": 252.1087493896, + "69": 252.1112518311, + "70": 252.1088409424, + "71": 252.1086425781, + "72": 252.1091766357, + "73": 252.1094055176, + }, + "i": { + "0": 312203.5, + "1": 387914.59375, + "2": 308308.5, + "3": 334653.59375, + "4": 339521.625, + "5": 345527.21875, + "6": 
292437.34375, + "7": 413614.53125, + "8": 300285.28125, + "9": 383848.71875, + "10": 404313.21875, + "11": 377231.34375, + "12": 453965.5625, + "13": 431327.0, + "14": 523180.0625, + "15": 510239.8125, + "16": 631459.1875, + "17": 807419.5, + "18": 842647.5625, + "19": 1053031.625, + "20": 1082361.625, + "21": 1198966.625, + "22": 1109162.375, + "23": 1126347.125, + "24": 1373071.5, + "25": 1589018.375, + "26": 1281309.875, + "27": 1660166.75, + "28": 1492912.25, + "29": 2029801.5, + "30": 2029874.125, + "31": 2035966.625, + "32": 2010867.875, + "33": 2036981.375, + "34": 2148879.25, + "35": 2359861.25, + "36": 2054066.125, + "37": 1691976.0, + "38": 1778159.125, + "39": 1776166.125, + "40": 1752154.125, + "41": 1575676.875, + "42": 1199910.625, + "43": 1259708.25, + "44": 1087384.375, + "45": 826077.125, + "46": 802296.875, + "47": 547785.125, + "48": 545340.0625, + "49": 584624.4375, + "50": 468524.8125, + "51": 305931.1875, + "52": 330310.34375, + "53": 309740.625, + "54": 289212.71875, + "55": 230440.9375, + "56": 210549.390625, + "57": 169972.390625, + "58": 140521.234375, + "59": 116637.953125, + "60": 117197.625, + "61": 84652.1171875, + "62": 117615.578125, + "63": 103500.921875, + "64": 89320.9453125, + "65": 76313.9296875, + "66": 55575.00390625, + "67": 76784.6796875, + "68": 28829.162109375, + "69": 26051.6171875, + "70": 42957.18359375, + "71": 50342.6953125, + "72": 37611.33984375, + "73": 38202.83203125, + }, + "rt": { + "0": 2.1030805111, + "1": 2.1084616184, + "2": 2.1139531136, + "3": 2.1193552017, + "4": 2.1248509884, + "5": 2.1302509308, + "6": 2.135682106, + "7": 2.1411821842, + "8": 2.1459801197, + "9": 2.1513926983, + "10": 2.1568279266, + "11": 2.1622362137, + "12": 2.1676549911, + "13": 2.1730883121, + "14": 2.179015398, + "15": 2.1845297813, + "16": 2.1900422573, + "17": 2.1949694157, + "18": 2.20002985, + "19": 2.2055358887, + "20": 2.2110378742, + "21": 2.2165191174, + "22": 2.2219588757, + "23": 2.2273921967, + "24": 2.2328462601, + "25": 2.2382712364, + "26": 2.2437169552, + "27": 2.2492566109, + "28": 2.2547125816, + "29": 2.2601687908, + "30": 2.2656960487, + "31": 2.2704958916, + "32": 2.2758042812, + "33": 2.2813498974, + "34": 2.2868082523, + "35": 2.2922415733, + "36": 2.2976748943, + "37": 2.3031060696, + "38": 2.308131218, + "39": 2.313628912, + "40": 2.3185498714, + "41": 2.3239560127, + "42": 2.3293914795, + "43": 2.3349123001, + "44": 2.3403663635, + "45": 2.346799612, + "46": 2.3522267342, + "47": 2.3576600552, + "48": 2.3631224632, + "49": 2.3685662746, + "50": 2.3740911484, + "51": 2.3794057369, + "52": 2.3848536015, + "53": 2.3903660774, + "54": 2.3953785896, + "55": 2.4006638527, + "56": 2.4062638283, + "57": 2.411709547, + "58": 2.4171659946, + "59": 2.4226117134, + "60": 2.4302260876, + "61": 2.4357616901, + "62": 2.4407405853, + "63": 2.4461927414, + "64": 2.451615572, + "65": 2.4571509361, + "66": 2.4627010822, + "67": 2.4681572914, + "68": 2.4735822678, + "69": 2.4735822678, + "70": 2.4787945747, + "71": 2.4842174053, + "72": 2.4896612167, + "73": 2.495146513, + }, + "polarity": { + "0": 1, + "1": 1, + "2": 1, + "3": 1, + "4": 1, + "5": 1, + "6": 1, + "7": 1, + "8": 1, + "9": 1, + "10": 1, + "11": 1, + "12": 1, + "13": 1, + "14": 1, + "15": 1, + "16": 1, + "17": 1, + "18": 1, + "19": 1, + "20": 1, + "21": 1, + "22": 1, + "23": 1, + "24": 1, + "25": 1, + "26": 1, + "27": 1, + "28": 1, + "29": 1, + "30": 1, + "31": 1, + "32": 1, + "33": 1, + "34": 1, + "35": 1, + "36": 1, + "37": 1, + "38": 1, + "39": 1, + "40": 1, + "41": 1, + "42": 1, + 
"43": 1, + "44": 1, + "45": 1, + "46": 1, + "47": 1, + "48": 1, + "49": 1, + "50": 1, + "51": 1, + "52": 1, + "53": 1, + "54": 1, + "55": 1, + "56": 1, + "57": 1, + "58": 1, + "59": 1, + "60": 1, + "61": 1, + "62": 1, + "63": 1, + "64": 1, + "65": 1, + "66": 1, + "67": 1, + "68": 1, + "69": 1, + "70": 1, + "71": 1, + "72": 1, + "73": 1, + }, + } + ) + + +@pytest.fixture(name="ms2_pos") +def fixture_ms2_pos(): + return pd.DataFrame( + data={ + "mz": { + "0": 252.1081695557, + "1": 252.1564941406, + "2": 252.1087036133, + "3": 252.1572875977, + "4": 252.1089019775, + "5": 252.1550292969, + "6": 252.1090698242, + "7": 252.1557617188, + }, + "i": { + "0": 32103.3515625, + "1": 6470.0009765625, + "2": 93112.0859375, + "3": 7624.11328125, + "4": 131062.0, + "5": 6535.4560546875, + "6": 76976.7265625, + "7": 6090.6440429688, + }, + "rt": { + "0": 2.0097544193, + "1": 2.0097544193, + "2": 2.2203779221, + "3": 2.2203779221, + "4": 2.327804327, + "5": 2.327804327, + "6": 2.3452186584, + "7": 2.3452186584, + }, + "polarity": {"0": 1, "1": 1, "2": 1, "3": 1, "4": 1, "5": 1, "6": 1, "7": 1}, + "precursor_MZ": { + "0": 252.0195159912, + "1": 252.0195159912, + "2": 252.10887146, + "3": 252.10887146, + "4": 252.0194854736, + "5": 252.0194854736, + "6": 252.1089477539, + "7": 252.1089477539, + }, + "precursor_intensity": { + "0": 2748235.5, + "1": 2748235.5, + "2": 2872807.5, + "3": 2872807.5, + "4": 3536752.25, + "5": 3536752.25, + "6": 3046732.75, + "7": 3046732.75, + }, + "collision_energy": { + "0": 23.3333339691, + "1": 23.3333339691, + "2": 23.3333339691, + "3": 23.3333339691, + "4": 23.3333339691, + "5": 23.3333339691, + "6": 23.3333339691, + "7": 23.3333339691, + }, + } + ) + + +@pytest.fixture(name="ms1_neg_empty") +def fixture_ms1_neg_empty(): + return pd.DataFrame(data={"mz": {}, "i": {}, "rt": {}, "polarity": {}}) + + +@pytest.fixture(name="ms2_neg_empty") +def fixture_ms2_neg_empty(): + return pd.DataFrame( + data={ + "mz": {}, + "i": {}, + "rt": {}, + "polarity": {}, + "precursor_MZ": {}, + "precursor_intensity": {}, + "collision_energy": {}, + } + ) + + +@pytest.fixture(name="df_container") +def fixture_df_container(ms1_pos, ms2_pos, ms1_neg_empty, ms2_neg_empty): + return {"ms1_neg": ms1_neg_empty, "ms1_pos": ms1_pos, "ms2_neg": ms2_neg_empty, "ms2_pos": ms2_pos} + + +@pytest.fixture(name="ms1_summary") +def fixture_ms1_summary(): + return { + "num_ms1_datapoints": 85.0, + "mz_peak": 252.1092987060547, + "rt_peak": 2.2775044441223145, + "mz_centroid": 252.10915042669814, + "rt_centroid": 2.218492414487913, + "peak_height": 304761.90625, + "peak_area": 7696977.46875, + } + + +@pytest.fixture(name="msms") +def fixture_msms(): + return { + "data": { + "mz": np.array([], dtype=np.float64), + "i": np.array([], dtype=np.float64), + "rt": np.array([], dtype=np.float64), + "polarity": np.array([], dtype=np.float64), + "precursor_MZ": np.array([], dtype=np.float64), + "precursor_intensity": np.array([], dtype=np.float64), + "collision_energy": np.array([], dtype=np.float64), + } + } + + +@pytest.fixture(name="groups_controlled_vocab") +def fixture_groups_controlled_vocab(): + return ["QC", "InjBl", "ISTD"] + + +@pytest.fixture(name="metatlas_dataset") +def fixture_metatlas_dataset(mocker, df_container, analysis_ids, groups_controlled_vocab, lcmsrun, sqlite_with_atlas): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) + return 
mads.MetatlasDataset(analysis_ids, groups_controlled_vocab) + + +@pytest.fixture(name="metatlas_dataset_with_2_cids") +def fixture_metatlas_dataset_with_2_cids( + mocker, df_container, analysis_ids_with_2_cids, groups_controlled_vocab, lcmsrun, sqlite_with_atlas_with_2_cids +): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) + return mads.MetatlasDataset(analysis_ids_with_2_cids, groups_controlled_vocab) + + +@pytest.fixture(name="eic") +def fixture_eic(): + return { + "mz": [ + 252.1089324951172, + 252.10943603515625, + 252.10926818847656, + 252.109375, + 252.10923767089844, + 252.10910034179688, + 252.10914611816406, + 252.1089630126953, + 252.10971069335938, + 252.1093292236328, + 252.10934448242188, + 252.109130859375, + 252.10935974121094, + 252.10939025878906, + 252.1090545654297, + 252.10916137695312, + 252.10946655273438, + 252.10923767089844, + 252.1093292236328, + 252.10919189453125, + 252.10914611816406, + 252.10897827148438, + 252.10934448242188, + 252.10928344726562, + 252.10888671875, + 252.10926818847656, + 252.109130859375, + 252.1090087890625, + 252.10934448242188, + 252.10939025878906, + 252.1093292236328, + 252.1091766357422, + 252.109130859375, + 252.1095428466797, + 252.10890197753906, + 252.1095428466797, + 252.109130859375, + 252.10911560058594, + 252.1091766357422, + 252.1088409423828, + 252.10916137695312, + 252.10935974121094, + 252.10928344726562, + 252.10922241210938, + 252.10914611816406, + 252.10922241210938, + 252.10894775390625, + 252.10906982421875, + 252.10914611816406, + 252.10916137695312, + 252.10910034179688, + 252.10916137695312, + 252.10934448242188, + 252.10899353027344, + 252.10928344726562, + 252.10897827148438, + 252.10916137695312, + 252.10928344726562, + 252.1092987060547, + 252.1089324951172, + 252.10914611816406, + 252.1090545654297, + 252.10914611816406, + 252.1090850830078, + 252.10894775390625, + 252.10914611816406, + 252.10911560058594, + 252.1090850830078, + 252.109130859375, + 252.10903930664062, + 252.10890197753906, + 252.109130859375, + 252.10885620117188, + 252.10914611816406, + 252.10926818847656, + 252.10888671875, + 252.109619140625, + 252.10922241210938, + 252.1092529296875, + 252.1099853515625, + 252.10972595214844, + 252.10910034179688, + 252.10935974121094, + 252.1088409423828, + 252.10838317871094, + 252.11212158203125, + ], + "rt": [ + 1.7180122137069702, + 1.8222843408584595, + 1.838305115699768, + 1.8444031476974487, + 1.8705799579620361, + 1.875998616218567, + 1.8913277387619019, + 1.9020838737487793, + 1.9127358198165894, + 1.9397128820419312, + 1.9451169967651367, + 1.9505127668380737, + 1.955920934677124, + 1.966427206993103, + 1.9718105792999268, + 1.9769750833511353, + 1.9823375940322876, + 1.987752079963684, + 1.9932082891464233, + 1.9986457824707031, + 2.0094456672668457, + 2.019866466522217, + 2.030582904815674, + 2.036003589630127, + 2.0568389892578125, + 2.062201499938965, + 2.0675911903381348, + 2.0834577083587646, + 2.088857650756836, + 2.0939910411834717, + 2.099109649658203, + 2.104536771774292, + 2.1208388805389404, + 2.1262447834014893, + 2.1420176029205322, + 2.152921676635742, + 2.15836763381958, + 2.163788318634033, + 2.169198751449585, + 2.1755259037017822, + 2.180954933166504, + 2.18635892868042, + 2.191038131713867, + 2.1964569091796875, + 2.2018840312957764, + 2.2069132328033447, + 2.21236515045166, + 2.2177650928497314, + 
2.2228589057922363, + 2.2283151149749756, + 2.2338151931762695, + 2.239321231842041, + 2.244842052459717, + 2.250317096710205, + 2.255610704421997, + 2.261033535003662, + 2.2665293216705322, + 2.2720251083374023, + 2.2775044441223145, + 2.28295636177063, + 2.288454294204712, + 2.29386043548584, + 2.299298048019409, + 2.304720878601074, + 2.310127019882202, + 2.3155603408813477, + 2.320981025695801, + 2.326420545578003, + 2.33160400390625, + 2.3370935916900635, + 2.3428516387939453, + 2.3483099937438965, + 2.3535475730895996, + 2.3589975833892822, + 2.364443302154541, + 2.3699119091033936, + 2.375347375869751, + 2.3808369636535645, + 2.3862972259521484, + 2.3917577266693115, + 2.397282600402832, + 2.402780294418335, + 2.4081971645355225, + 2.419055461883545, + 2.457223892211914, + 3.3080079555511475, + ], + "intensity": [ + 34249.71484375, + 28511.658203125, + 41718.13671875, + 33448.546875, + 40190.94140625, + 32525.16015625, + 37058.60546875, + 51132.91015625, + 36473.0546875, + 42659.0859375, + 45187.6171875, + 51186.30078125, + 58456.5859375, + 43299.24609375, + 52062.02734375, + 42501.8671875, + 39734.91015625, + 41848.02734375, + 48979.640625, + 42957.48046875, + 54214.27734375, + 63583.64453125, + 38661.046875, + 47146.54296875, + 36974.3046875, + 37674.35546875, + 37412.4609375, + 47036.44921875, + 32295.888671875, + 39751.12109375, + 47359.0, + 57496.41796875, + 33690.4765625, + 36853.53515625, + 33045.0703125, + 33235.64453125, + 52481.1015625, + 48210.37109375, + 62178.734375, + 73049.2109375, + 52741.03125, + 88225.1953125, + 101593.296875, + 127965.625, + 124079.859375, + 134410.46875, + 148749.0, + 134068.8125, + 141625.515625, + 202721.015625, + 204341.703125, + 172160.484375, + 185859.765625, + 195729.234375, + 216657.453125, + 239248.65625, + 172232.296875, + 195105.046875, + 304761.90625, + 181052.265625, + 222467.5625, + 251571.53125, + 205874.765625, + 224279.0625, + 173697.359375, + 236325.078125, + 153999.28125, + 156835.59375, + 118963.8046875, + 105766.234375, + 103081.484375, + 97180.5625, + 95681.4140625, + 74239.0703125, + 69208.8984375, + 60604.1484375, + 37020.84765625, + 32874.484375, + 24641.875, + 23305.75, + 23413.94140625, + 42582.77734375, + 35980.16796875, + 25743.97265625, + 21777.99609375, + 59454.40234375, + ], + } + + +@pytest.fixture(name="atlas_df") +def fixture_atlas_df(metatlas_dataset): + return metatlas_dataset.atlas_df + + +@pytest.fixture(name="compound") +def fixture_compound(username): + compound = metob.Compound() + compound.unique_id = "60cd6743e56545c6a6cb066ec3553450" + compound.mono_isotopic_molecular_weight = 251.101839276 + compound.creation_time = 1466212395 + compound.synonyms = "2'-deoxyadenosine" # value was pruned down + compound.inchi_key = "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" + compound.chebi_url = "http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:17256" + compound.permanent_charge = 0 + compound.img_abc_id = "" + compound.neutralized_2d_inchi = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)" # noqa: E501 + compound.lipidmaps_url = "" + compound.source = "gnps///chebi///metacyc///hmdb" + compound.kegg_url = "http://www.genome.jp/dbget-bin/www_bget?C00559" + compound.hmdb_url = "http://www.hmdb.ca/metabolites/HMDB00101" + compound.wikipedia_url = "" + compound.head_id = "60cd6743e56545c6a6cb066ec3553450" + compound.formula = "C10H13N5O3" + compound.number_components = 1 + compound.iupac_name = "" + compound.username = username + compound.pubchem_compound_id = "13730" + 
compound.description = "A purine 2'-deoxyribonucleoside having adenine as the nucleobase." + compound.metacyc_id = "DEOXYADENOSINE" + compound.kegg_id = "C00559" + compound.hmdb_id = "HMDB00101" + compound.chebi_id = "CHEBI:17256" + compound.inchi = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1" # noqa: E501 + compound.neutralized_inchi_key = "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" + compound.prev_uid = "origin" + compound.neutralized_inchi = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1" # noqa: E501 + compound.name = "2'-deoxyadenosine" + compound.neutralized_2d_inchi_key = "OLXZPDWKRNYJJZ-UHFFFAOYSA-N" + compound.num_free_radicals = 0 + compound.lipidmaps_id = "" + compound.last_modified = 1612996604 + compound.pubchem_url = "http://pubchem.ncbi.nlm.nih.gov/compound/13730" + return compound + + +@pytest.fixture(name="rt_reference") +def fixture_rt_reference(username): + rt_ref = metob.RtReference() + rt_ref.unique_id = "a845ddfdf8ef4713bcef3bdb84999030" + rt_ref.username = username + rt_ref.rt_units = "min" + rt_ref.description = "No description" + rt_ref.rt_peak = "2.1964640053707174" + rt_ref.enabled = True + rt_ref.creation_time = 1613002850 + rt_ref.lcms_run = None + rt_ref.rt_min = 1.6964640053707174 + rt_ref.last_modified = 1613002979 + rt_ref.ref_type = "" + rt_ref.prev_uid = "origin" + rt_ref.rt_max = 2.6964640053707174 + rt_ref.name = "Untitled" + rt_ref.head_id = "a845ddfdf8ef4713bcef3bdb84999030" + return rt_ref + + +@pytest.fixture(name="mz_reference") +def fixture_mz_reference(username): + mz_ref = metob.MzReference() + mz_ref.unique_id = "eb6d03c9ef574051b92dad7b2fc259a2" + mz_ref.username = username + mz_ref.adduct = "[M+H]+" + mz_ref.description = "No description" + mz_ref.mz_tolerance_units = "ppm" + mz_ref.enabled = True + mz_ref.mz = 252.1091393 + mz_ref.creation_time = 1613002850 + mz_ref.lcms_run = None + mz_ref.mz_tolerance = 20.0 + mz_ref.last_modified = 1613002979 + mz_ref.detected_polarity = "positive" + mz_ref.modification = "" + mz_ref.ref_type = "" + mz_ref.observed_formula = "" + mz_ref.prev_uid = "origin" + mz_ref.name = "Untitled" + mz_ref.head_id = "eb6d03c9ef574051b92dad7b2fc259a2" + return mz_ref + + +@pytest.fixture(name="compound_identification") +def fixture_compound_identification(compound, rt_reference, mz_reference, username): + ident = metob.CompoundIdentification() + ident.unique_id = "18737c7141cc4efaa4545bead13ac751" + ident.username = username + ident.description = "No description" + ident.creation_time = 1613002849 + ident.last_modified = 1613002979 + ident.identification_grade = None + ident.prev_uid = "origin" + ident.name = "2'-deoxyadenosine" + ident.head_id = "18737c7141cc4efaa4545bead13ac751" + ident.internal_standard_to_use = "" + ident.internal_standard_id = "" + ident.do_normalization = False + ident.identification_notes = "my id note" + ident.ms2_notes = "bad match to ref" + ident.ms1_notes = "keep" + ident.frag_references = [] + ident.intensity_references = [] + ident.compound = [compound] + ident.mz_references = [mz_reference] + ident.rt_references = [rt_reference] + return ident + + +@pytest.fixture(name="atlas") +def fixture_atlas(compound_identification): + small_atlas = metob.Atlas() + small_atlas.compound_identifications = [compound_identification] + return small_atlas + + +@pytest.fixture(name="compound_2") +def fixture_compound_2(username): + compound = metob.Compound() + 
compound.chebi_id = "CHEBI:16335" + compound.chebi_url = "http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:16335" + compound.creation_time = 1466212384 + compound.description = "A ribonucleoside composed of a molecule of adenine attached to a ribofuranose moiety via a beta1N9-glycosidic bond." + compound.formula = "C10H13N5O4" + compound.head_id = "1ad02275f47b4033a451e99874f4764f" + compound.hmdb_id = "HMDB00050" + compound.hmdb_url = "http://www.hmdb.ca/metabolites/HMDB00050" + compound.img_abc_id = "" + compound.inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1" + compound.inchi_key = "OIRDTQYFTABQOQ-KQYNXXCUSA-N" + compound.iupac_name = "" + compound.kegg_id = "C00212" + compound.kegg_url = "http://www.genome.jp/dbget-bin/www_bget?C00212" + compound.last_modified = 1612996604 + compound.lipidmaps_id = "" + compound.lipidmaps_url = "" + compound.metacyc_id = "ADENOSINE" + compound.mono_isotopic_molecular_weight = 267.096753896 + compound.name = "adenosine" + compound.neutralized_2d_inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)" + compound.neutralized_2d_inchi_key = "OIRDTQYFTABQOQ-UHFFFAOYSA-N" + compound.neutralized_inchi = "InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1" + compound.neutralized_inchi_key = "OIRDTQYFTABQOQ-KQYNXXCUSA-N" + compound.num_free_radicals = 0 + compound.number_components = 1 + compound.permanent_charge = 0 + compound.prev_uid = "origin" + compound.pubchem_compound_id = "60961" + compound.pubchem_url = "http://pubchem.ncbi.nlm.nih.gov/compound/60961" + compound.source = "chebi///wikidata///metacyc///gnps///hmdb" + compound.synonyms = "adenosine///58-61-7///Adenocard///Adenoscan" # this value was pruned down + compound.unique_id = "1ad02275f47b4033a451e99874f4764f" + compound.username = username + compound.wikipedia_url = "" + return compound + + +@pytest.fixture(name="rt_reference_2") +def fixture_rt_reference_2(username): + rt_ref = metob.RtReference() + rt_ref.creation_time = 1613002857 + rt_ref.description = "No description" + rt_ref.enabled = True + rt_ref.head_id = "f74622bcef924f5390ba6e127633e731" + rt_ref.last_modified = 1613002980 + rt_ref.lcms_run = None + rt_ref.name = "Untitled" + rt_ref.prev_uid = "origin" + rt_ref.ref_type = "" + rt_ref.rt_max = 3.5233184079926665 + rt_ref.rt_min = 2.5233184079926665 + rt_ref.rt_peak = 3.0233184079926665 + rt_ref.rt_units = "min" + rt_ref.unique_id = "f74622bcef924f5390ba6e127633e731" + rt_ref.username = username + return rt_ref + + +@pytest.fixture(name="mz_reference_2") +def fixture_mz_reference_2(username): + mz_ref = metob.MzReference() + mz_ref.adduct = "[M+H]+" + mz_ref.creation_time = 1613002857 + mz_ref.description = "No description" + mz_ref.detected_polarity = "positive" + mz_ref.enabled = True + mz_ref.head_id = "b0e3cf0df44a4079be7908c6b525d3ac" + mz_ref.last_modified = 1613002980 + mz_ref.lcms_run = None + mz_ref.modification = "" + mz_ref.mz = 268.1040539 + mz_ref.mz_tolerance = 20.0 + mz_ref.mz_tolerance_units = "ppm" + mz_ref.name = "Untitled" + mz_ref.observed_formula = "" + mz_ref.prev_uid = "origin" + mz_ref.ref_type = "" + mz_ref.unique_id = "b0e3cf0df44a4079be7908c6b525d3ac" + mz_ref.username = username + return mz_ref + + +@pytest.fixture(name="compound_identification_2") +def fixture_compound_identification_2(compound_2, rt_reference_2, 
mz_reference_2, username): + ident = metob.CompoundIdentification() + ident.creation_time = 1613002856 + ident.description = "No description" + ident.do_normalization = False + ident.frag_references = [] + ident.head_id = "6cca7aa44c0e4a109f695ba980d69472" + ident.identification_grade = None + ident.identification_notes = "" + ident.intensity_references = [] + ident.internal_standard_id = "" + ident.internal_standard_to_use = "" + ident.last_modified = 1613002980 + ident.ms1_notes = "" + ident.ms2_notes = "" + ident.name = "adenosine" + ident.prev_uid = "origin" + ident.unique_id = "6cca7aa44c0e4a109f695ba980d69472" + ident.username = username + ident.frag_references = [] + ident.intensity_references = [] + ident.compound = [compound_2] + ident.mz_references = [mz_reference_2] + ident.rt_references = [rt_reference_2] + return ident + + +@pytest.fixture(name="atlas_with_2_cids") +def fixture_atlas_with_2_cids(compound_identification, compound_identification_2): + small_atlas = metob.Atlas() + small_atlas.compound_identifications = [ + compound_identification, + compound_identification_2, + ] + return small_atlas + + +@pytest.fixture(name="lcmsrun") +def fixture_lcmsrun(username): + run = metob.LcmsRun() + run.unique_id = "7ce51039cfca4426b4e51999ac45d018" + run.username = username + run.hdf5_file = "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5" # noqa: E501 + run.description = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML" # noqa: E501 + run.creation_time = 1605311923 + run.sample = None + run.last_modified = 1620101765 + run.mzml_file = "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML" # noqa: E501 + run.prev_uid = "28323058b6e84a9db0f9e802544764e3" + run.method = None + run.name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML" # noqa: E501 + run.head_id = "7ce51039cfca4426b4e51999ac45d018" + run.experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" + run.injection_volume = 0.0 + run.injection_volume_units = "uL" + run.acquisition_time = 1604770080 + run.pass_qc = False + return run + + +@pytest.fixture(name="group") +def fixture_group(lcmsrun, username): + grp = metob.Group() + grp.items = [lcmsrun] + grp.unique_id = "61041d07b5a24ca5b88efbda8f319654" + grp.username = username + grp.description = "No description" + grp.creation_time = 1620146477 + grp.last_modified = 1620146477 + grp.prev_uid = "origin" + grp.name = f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + grp.head_id = "61041d07b5a24ca5b88efbda8f319654" + grp.short_name = "POS_Cone-S1" + return grp + + +@pytest.fixture(name="group_with_2_lcmsruns") +def fixture_group_with_2_lcmsruns(lcmsrun, username): + grp = metob.Group() + grp.items = [lcmsrun, lcmsrun] + grp.unique_id = "61041d07b5a24ca5b88efbda8f319654" + grp.username = username + grp.description = "No description" + grp.creation_time = 1620146477 + 
grp.last_modified = 1620146477 + grp.prev_uid = "origin" + grp.name = f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + grp.head_id = "61041d07b5a24ca5b88efbda8f319654" + grp.short_name = "POS_Cone-S1" + return grp + + +@pytest.fixture(name="hits") +def fixture_hits(): + hits_plus = pd.DataFrame( + data={ + "score": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 0.7253785748, + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 0.8688691781, + }, + "num_matches": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 6, + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 7, + }, + "msv_query_aligned": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ + [ + None, + None, + None, + None, + 56.7212257385, + 59.0436058044, + 71.0422821045, + 73.0214157104, + None, + 89.1910018921, + 99.0413742065, + 104.3592529297, + 104.3681869507, + 117.0548171997, + None, + 118.9432754517, + 136.0619506836, + None, + None, + None, + 145.9665527344, + 163.9772491455, + 169.9678497314, + 177.1133270264, + 187.9771575928, + 205.9878387451, + 210.9933166504, + 229.0038452148, + 252.0215606689, + 252.1087036133, + 252.1572875977, + 252.2064666748, + ], + [ + None, + None, + None, + None, + 3361.7712402344, + 6589.943359375, + 6501.9853515625, + 4987.177734375, + None, + 3257.0708007812, + 13393.138671875, + 3280.0544433594, + 4276.0112304688, + 57809.1875, + None, + 4965.7436523438, + 648640.5625, + None, + None, + None, + 11511.76171875, + 10362.68359375, + 5714.70703125, + 9354.2353515625, + 73409.0078125, + 257685.234375, + 53554.28125, + 193491.515625, + 5038.1469726562, + 93112.0859375, + 7624.11328125, + 4599.4125976562, + ], + ], + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ + [ + None, + 50.2002449036, + 55.0126533508, + 57.0280647278, + None, + None, + None, + None, + 68.2973327637, + 69.0266494751, + 73.0213851929, + None, + 74.6972732544, + 80.862159729, + 82.4692306519, + 85.0231246948, + 87.0394363403, + 92.4544296265, + 92.4610061646, + 104.3785171509, + 115.0390701294, + 126.1923675537, + 133.0496368408, + 136.0618743896, + None, + None, + None, + None, + 144.5760345459, + 181.1904449463, + 230.6756896973, + 268.1039733887, + ], + [ + None, + 87283.4296875, + 105163.625, + 246350.078125, + None, + None, + None, + None, + 81607.3046875, + 107886.640625, + 150512.90625, + None, + 99324.7109375, + 80050.4375, + 108701.53125, + 278198.71875, + 95401.265625, + 92632.890625, + 111341.5625, + 119245.7734375, + 170358.671875, + 103961.4296875, + 226297.9375, + 48576460.0, + None, + None, + None, 
+ None, + 98098.609375, + 100016.9296875, + 119618.1015625, + 16002674.0, + ], + ], + }, + "msv_ref_aligned": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ + [ + 57.0345, + 63.3177, + 63.3205, + 69.0344, + None, + None, + 71.0499, + 73.0292, + 84.9778, + None, + 99.0447, + None, + None, + 117.055, + 118.059, + None, + 136.062, + 137.066, + 236.709, + 253.112, + None, + None, + None, + None, + None, + None, + None, + None, + None, + 252.109, + None, + None, + ], + [ + 176328.0, + 328818.0, + 274432.0, + 197637.0, + None, + None, + 896360.0, + 1192020.0, + 378547.0, + None, + 3921880.0, + None, + None, + 15737700.0, + 266131.0, + None, + 144220000.0, + 3455270.0, + 185227.0, + 1284450.0, + None, + None, + None, + None, + None, + None, + None, + None, + None, + 20960800.0, + None, + None, + ], + ], + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ + [ + 56.7603, + None, + None, + 57.0346, + 61.0292, + 61.8182, + 64.9491, + 67.9255, + None, + None, + 73.0292, + 82.0663, + None, + None, + None, + 85.0293, + None, + None, + None, + None, + 115.04, + None, + 133.05, + 136.062, + 137.067, + 183.555, + 230.198, + 269.108, + None, + None, + None, + 268.105, + ], + [ + 35523.7, + None, + None, + 184839.0, + 43216.2, + 40066.3, + 40362.0, + 41550.6, + None, + None, + 93791.1, + 293258.0, + None, + None, + None, + 202756.0, + None, + None, + None, + None, + 184050.0, + None, + 364543.0, + 29646700.0, + 830130.0, + 51455.4, + 51206.7, + 970064.0, + None, + None, + None, + 12412800.0, + ], + ], + }, + "name": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "2'-deoxyadenosine", + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "adenosine", + }, + "adduct": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "[M+H]+", + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "[M+H]+", + }, + "inchi_key": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "OIRDTQYFTABQOQ-KQYNXXCUSA-N", + }, + "precursor_mz": { + "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.1091393, + "('metatlas', 
'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.1040539,
+            },
+            "measured_precursor_mz": {
+                "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.10887146,
+                "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.103729248,
+            },
+            "measured_precursor_intensity": {
+                "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 2872807.5,
+                "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 75979424.0,
+            },
+            "copy_index": {
+                "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [
+                    "metatlas",
+                    "c7dddd297e104ca79caea72a90150532",
+                    "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5",
+                    2.2203779221,
+                ],
+                "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [
+                    "metatlas",
+                    "cf5e8df145f64bf0856fbf852d1bdb64",
+                    "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5",
+                    3.0264527798,
+                ],
+            },
+        }
+    )
+    hits_plus.index = pd.MultiIndex.from_tuples(
+        hits_plus["copy_index"], names=["database", "id", "file_name", "msms_scan"]
+    )
+    hits_plus.drop(columns=["copy_index"], inplace=True)
+    return hits_plus
diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py
index 766934f2..a82326a7 100644
--- a/tests/unit/test_metatlas_dataset.py
+++ b/tests/unit/test_metatlas_dataset.py
@@ -5,27 +5,15 @@
 import glob
 import logging
 import os
+
 import pandas as pd
 import pytest
+
 from metatlas.datastructures import metatlas_dataset as mads
 from metatlas.datastructures import metatlas_objects as metob
 from metatlas.io import metatlas_get_data_helper_fun as ma_data
 
 
-@pytest.fixture(scope="function", autouse=True)
-def change_test_dir(request):
-    os.chdir(request.fspath.dirname)
-    yield
-    os.chdir(request.config.invocation_dir)
-
-
-@pytest.fixture(scope="function", autouse=True)
-def set_env_vars(tmp_path, monkeypatch):
-    monkeypatch.setenv("METATLAS_LOCAL", "TRUE")
-    db_path = tmp_path / "workspace.db"
-    monkeypatch.setenv("METATLAS_SQLITE", str(db_path))
-
-
 def test_metatlas_dataset_build01(metatlas_dataset):
     assert len(metatlas_dataset) == 1
     assert len(metatlas_dataset[0]) == 1
@@ -58,8 +46,8 @@ def test_filter_compounds_ms1_notes_remove01(mocker, metatlas_dataset_with_2_cid
     assert len(metatlas_dataset[0]) == 1
 
 
-def test_filter_compounds01(mocker, metatlas_dataset_with_2_cids, compound):
mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) +def test_filter_compounds01(metatlas_dataset_with_2_cids): + # mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) metatlas_dataset = metatlas_dataset_with_2_cids metatlas_dataset.filter_compounds(remove_idxs=[]) assert len(metatlas_dataset[0]) == 2 @@ -335,17 +323,55 @@ def test_set_data01(metatlas_dataset): assert metatlas_dataset[0][0]["identification"].ms2_notes == "extact match" -def test_store_atlas01(metatlas_dataset, sqlite): +def test_store_atlas99(atlas, sqlite, username): + atlas.name = "test_store_atlas01" + atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(atlas_list) == 0 + metob.store(atlas) + second = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(second) == 1 + + +def test_store_atlas98(metatlas_dataset, atlas, sqlite, username): + atlas.name = "test_store_atlas01" + atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(atlas_list) == 0 + metatlas_dataset.store_atlas() + second = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(second) == 1 + + +def test_store_atlas97(metatlas_dataset, atlas, sqlite, username): + atlas.name = "test_store_atlas01" + atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(atlas_list) == 0 + metatlas_dataset.store_atlas(name='foobar', even_if_exists=True) + second = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(second) == 1 + + +def test_store_atlas96(metatlas_dataset, atlas, sqlite, username): + atlas.name = "test_store_atlas01" + atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(atlas_list) == 0 + metatlas_dataset.store_atlas(name='foobar', even_if_exists=True) + second = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(second) == 1 + + +def test_store_atlas01(metatlas_dataset, sqlite, username): metatlas_dataset.atlas.name = "test_store_atlas01" + atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) + assert len(atlas_list) == 0 metatlas_dataset.store_atlas() - atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username="*") - assert len(atlas_list) == 1 + second = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) + assert len(second) == 1 metatlas_dataset.store_atlas(even_if_exists=True) with pytest.raises(ValueError): metatlas_dataset.store_atlas() -def test_analysis_identifiers01(): +def test_analysis_identifiers01(sqlite): with pytest.raises(ValueError): mads.AnalysisIdentifiers( "source_atlas_name_not_valid", @@ -357,10 +383,10 @@ def test_analysis_identifiers01(): ) -def test_analysis_identifiers02(): +def test_analysis_identifiers02(mocker, sqlite_with_atlas, username): with pytest.raises(ValueError): mads.AnalysisIdentifiers( - "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "output_type_not_valid", "polarity_not_valid", @@ -369,10 +395,10 @@ def test_analysis_identifiers02(): ) -def test_analysis_identifiers03(): +def test_analysis_identifiers03(mocker, username): with pytest.raises(ValueError): mads.AnalysisIdentifiers( - "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + 
f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "polarity_not_valid", @@ -381,11 +407,10 @@ def test_analysis_identifiers03(): ) -def test_analysis_identifiers04(mocker): - mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True]) +def test_analysis_identifiers04(mocker, username): with pytest.raises(TypeError): mads.AnalysisIdentifiers( - "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", @@ -394,11 +419,10 @@ def test_analysis_identifiers04(mocker): ) -def test_analysis_identifiers05(mocker): - mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True]) +def test_analysis_identifiers05(mocker, username): with pytest.raises(TypeError): mads.AnalysisIdentifiers( - "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", @@ -407,11 +431,10 @@ def test_analysis_identifiers05(mocker): ) -def test_analysis_identifiers06(mocker): - mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[True]) +def test_analysis_identifiers06(mocker, username): with pytest.raises(ValueError): mads.AnalysisIdentifiers( - "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", @@ -420,10 +443,10 @@ def test_analysis_identifiers06(mocker): ) -def test_analysis_identifiers07(): +def test_analysis_identifiers07(mocker, username): with pytest.raises(ValueError): mads.AnalysisIdentifiers( - "HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0", + f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "experiemnt_name_not_valid", "output_type_not_valid", "polarity_not_valid", @@ -432,14 +455,14 @@ def test_analysis_identifiers07(): ) -def test_analysis_identifiers_atlas01(analysis_ids): - assert analysis_ids.atlas == "505892_OakGall_final_POS_root0" +def test_analysis_identifiers_atlas01(analysis_ids, username): + assert analysis_ids.atlas == f"505892_OakGall_final_POS_{username}0" -def test_analysis_identifiers_atlas02(analysis_ids): +def test_analysis_identifiers_atlas02(analysis_ids, username): # call .atlas twice to get cached value analysis_ids.atlas # pylint: disable=pointless-statement - assert analysis_ids.atlas == "505892_OakGall_final_POS_root0" + assert analysis_ids.atlas == f"505892_OakGall_final_POS_{username}0" def test_write_data_source_files01(metatlas_dataset, mocker, caplog): @@ -458,7 +481,7 @@ def test_write_data_source_files02(metatlas_dataset, mocker, caplog): assert ma_data.make_data_sources_tables.called # pylint: disable=no-member -def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun, atlas): +def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun, atlas, username): mocker.patch( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) @@ -466,7 +489,7 @@ def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun, atlas): 
mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[atlas]) mocker.patch("glob.glob", return_value=range(10)) metatlas_dataset = mads.MetatlasDataset(analysis_ids) - assert metatlas_dataset.atlas.name == "505892_OakGall_final_POS_root0" + assert metatlas_dataset.atlas.name == f"505892_OakGall_final_POS_{username}0" def test_get_atlas02(mocker, analysis_ids, caplog): @@ -501,24 +524,18 @@ def test_store_groups01(metatlas_dataset, mocker): assert metob.store.called # pylint: disable=no-member -def test_store_groups02(metatlas_dataset, mocker): +def test_store_groups02(metatlas_dataset, mocker, username): def group(): pass - group.name = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S1" + group.name = f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[group]) with pytest.raises(ValueError): metatlas_dataset.store_groups() -def test_annotation_gui01(metatlas_dataset, mocker): - mocker.patch( - "metatlas.plots.dill2plots.get_msms_hits", - return_value=pd.DataFrame( - {"score": [], "inchi_key": [], "measured_precursor_mz": []}, - index=pd.MultiIndex.from_tuples([], names=["msms_scan"]), - ), - ) +def test_annotation_gui01(metatlas_dataset, hits, mocker): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) agui = metatlas_dataset.annotation_gui() agui.compound_idx = 0 agui.set_msms_flag("1, co-isolated precursor but all reference ions are in sample spectrum") @@ -534,25 +551,8 @@ def test_annotation_gui01(metatlas_dataset, mocker): ) -def test_generate_all_outputs01(metatlas_dataset, mocker): - mocker.patch( - "metatlas.plots.dill2plots.get_msms_hits", - return_value=pd.DataFrame( - { - "score": [], - "inchi_key": [], - "measured_precursor_mz": [], - "precursor_mz": [], - "file_name": [], - "msv_query_aligned": [], - "msv_ref_aligned": [], - }, - index=pd.MultiIndex.from_tuples([], names=["msms_scan"]), - ), - ) +def test_generate_all_outputs01(metatlas_dataset, hits, mocker): + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", hits) metatlas_dataset.generate_all_outputs() assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*")) == 12 assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*/*")) == 23 - - -# 49, 51, 53, 55, 57, 79-80, 174, 198-214, 233-234, 576, 581, 672-682, 686-688, 691, 702-708 diff --git a/tests/unit/test_write_utils.py b/tests/unit/test_write_utils.py index 082cf5cc..5b500781 100644 --- a/tests/unit/test_write_utils.py +++ b/tests/unit/test_write_utils.py @@ -7,13 +7,6 @@ from metatlas.io import write_utils -@pytest.fixture(scope="function", autouse=True) -def change_test_dir(request): - os.chdir(request.fspath.dirname) - yield - os.chdir(request.config.invocation_dir) - - def test_make_dir_for01(mocker): mocker.patch("os.makedirs") write_utils.make_dir_for("foo/bar") @@ -69,25 +62,22 @@ def test_raise_on_diff03(mocker): write_utils.raise_on_diff(to_write, "foo/bar", "test") -def test_export_dataframe_die_on_diff01(mocker): - mocker.patch("os.path.exists", return_value=False) +def test_export_dataframe_die_on_diff01(): dataframe = pandas.DataFrame({1: [10], 2: [20]}) write_utils.export_dataframe_die_on_diff(dataframe, "foo/bar", "test") # Should not raise an error. No assert needed. 
-def test_export_dataframe_die_on_diff02(mocker):
-    mocker.patch("os.path.exists", return_value=True)
+def test_export_dataframe_die_on_diff02():
     dataframe = pandas.DataFrame({1: [10], 2: [20]})
-    mocker.patch("pandas.read_csv", return_value=dataframe)
+    write_utils.export_dataframe(dataframe, "foo/bar", "test")
     write_utils.export_dataframe_die_on_diff(dataframe, "foo/bar", "test")
     # Should not raise an error. No assert needed.
 
 
-def test_export_dataframe_die_on_diff03(mocker):
-    mocker.patch("os.path.exists", return_value=True)
+def test_export_dataframe_die_on_diff03():
     existing = pandas.DataFrame({1: [10], 2: [20]})
-    mocker.patch("pandas.read_csv", return_value=existing)
+    write_utils.export_dataframe(existing, "foo/bar", "test")
     to_write = pandas.DataFrame({1: [10], 2: [99]})
     with pytest.raises(ValueError):
         write_utils.export_dataframe_die_on_diff(to_write, "foo/bar", "test")

From 753483a95527f33374036f99a8c2b3621587d3b1 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 10 Jun 2021 08:55:38 -0700
Subject: [PATCH 018/177] WIP - rm METATLAS_LOCAL from noxfile.py

---
 noxfile.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index 080e95de..7b3fe817 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -145,9 +145,7 @@ def blacken_nb(session):
 @nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV)
 def unit_tests(session):
     session.install("-r", "docker/requirements.txt", *pytest_deps)
-    session.run(
-        "pytest", "-vv", *session.posargs, "--cov", "metatlas", "tests/unit/", env={"METATLAS_LOCAL": "TRUE"}
-    )
+    session.run("pytest", "-vv", *session.posargs, "--cov", "metatlas", "tests/unit/")
 
 
 @nox.session(python=py_versions[0], reuse_venv=REUSE_LARGE_VENV)
@@ -161,7 +159,6 @@ def cov_report(session):
         "--cov-report",
         "term-missing",
         "tests/unit/",
-        env={"METATLAS_LOCAL": "TRUE"}
     )
 

From 4ad1cc72e5adcaf663b89faf4f5862a4e06f9ac9 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 10 Jun 2021 08:58:14 -0700
Subject: [PATCH 019/177] WIP - add sql to make minimal test database

---
 docker/extract_test_case_from_db.sql | 154 +++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 docker/extract_test_case_from_db.sql

diff --git a/docker/extract_test_case_from_db.sql b/docker/extract_test_case_from_db.sql
new file mode 100644
index 00000000..223168e2
--- /dev/null
+++ b/docker/extract_test_case_from_db.sql
@@ -0,0 +1,154 @@
+-- ##### SECOND SETUP OF TEST DATA #####
+
+/*
+This is run against a copy of the meta_atlas mysql database.
+Usually this is done by spinning up a mysql container
+and loading an all-databases dump:
+
+mkdir -p dumps
+mysqldump -h nerscdb04.nersc.gov -u meta_atlas_admin -p --all-databases > dumps/meta_atlas.sql
+docker run -it --rm -e MYSQL_ROOT_PASSWORD=mypw -v $(pwd)/dumps:/docker-entrypoint-initdb.d -v $(pwd):/script mysql:5.7
+MYSQL_ID=$(docker ps | grep mysql | cut -f1 -d' ')
+docker exec -it $MYSQL_ID /bin/bash
+
+# then within the mysql container's shell:
+mysql --password=mypw meta_atlas < /script/extract_test_case_from_db.sql
+
+# still within the mysql container, convert the pruned database to sqlite3:
+apt-get update
+apt-get install -y python3 python3-pip
+pip3 install mysql-to-sqlite3
+mysql2sqlite -f /meta_atlas.sqlite3 -d meta_atlas -p -u root
+exit
+
+# back on the host computer you can now copy out the sqlite3 db file
+docker cp $MYSQL_ID:/meta_atlas.sqlite3 .
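+
+# optional sanity check from the host (illustrative, not part of the original
+# workflow; assumes the sqlite3 CLI is installed locally -- the expected counts
+# are the ones this script aims to leave behind, per the summary below):
+sqlite3 meta_atlas.sqlite3 'SELECT COUNT(*) FROM atlases;'    # expect 1
+sqlite3 meta_atlas.sqlite3 'SELECT COUNT(*) FROM lcmsruns;'   # expect 4
+sqlite3 meta_atlas.sqlite3 'SELECT COUNT(*) FROM compounds;'  # expect 6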
+*/
+
+-- results in a database with 1 atlas, 4 lcmsruns, and 6 compounds
+-- the following tables are emptied but otherwise not modified:
+-- mzintensitypairs
+-- fragmentationreferences_mz_intensities
+-- compoundidentifications_frag_references
+-- fragmentationreferences
+
+
+-- remove tables that are not used
+DROP TABLE IF EXISTS `Compounds`;
+DROP TABLE IF EXISTS `Group`;
+DROP TABLE IF EXISTS `group`;
+
+-- clean out tables we don't need pre-populated values in
+DELETE FROM groups;
+DELETE FROM groups_items;
+DELETE FROM methods;
+DELETE FROM samples;
+DELETE FROM mzintensitypairs;
+DELETE FROM identificationgrades;
+DELETE FROM functionalsets;
+DELETE FROM fragmentationreferences_mz_intensities;
+DELETE FROM compoundidentifications_frag_references;
+DELETE FROM fragmentationreferences;
+
+-- Modify atlas with name 'MSMLS_HILICz150mm_Annot20190824_Predicted_EMA_Unlab_POS_Shih_OakGall_505892_final_20210210.csv'
+DELETE FROM atlases
+WHERE unique_id!='4b05837a53494dd8b680e6b5059e1934';
+
+UPDATE atlases
+SET name='HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0'
+WHERE unique_id='4b05837a53494dd8b680e6b5059e1934';
+
+UPDATE atlases
+SET username='root'
+WHERE unique_id='4b05837a53494dd8b680e6b5059e1934';
+
+DELETE l
+FROM lcmsruns AS l
+LEFT JOIN (
+    SELECT unique_id
+    FROM lcmsruns AS l1
+    JOIN (
+        SELECT MAX(creation_time) AS ctime, hdf5_file
+        FROM lcmsruns
+        WHERE (name LIKE '20201106\_JGI-AK\_PS-KM\_505892\_OakGall\_final\_QE-HF\_HILICZ\_USHXG01583\_POS\_MSMS%') AND
+              (name LIKE '%Cone-S%\_1\_%')
+        GROUP BY hdf5_file
+    ) AS early
+    ON l1.creation_time=early.ctime AND l1.hdf5_file=early.hdf5_file
+    LIMIT 4
+) AS j
+ON l.unique_id=j.unique_id
+WHERE j.unique_id is NULL;
+
+DELETE FROM compounds
+WHERE chebi_id NOT IN ('CHEBI:17256', 'CHEBI:16708', 'CHEBI:16708', 'CHEBI:48517///CHEBI:17712', 'CHEBI:30959///CHEBI:17405', 'CHEBI:16335');
+
+-- work from compounds up to atlases_compound_identifications
+DELETE cic
+FROM compoundidentifications_compound AS cic
+LEFT JOIN compounds AS c
+ON cic.target_id=c.unique_id
+WHERE c.unique_id is null;
+
+DELETE ci
+FROM compoundidentifications AS ci
+LEFT JOIN compoundidentifications_compound AS cic
+ON ci.unique_id=cic.source_id
+WHERE cic.source_id is null;
+
+DELETE aci
+FROM atlases_compound_identifications AS aci
+LEFT JOIN compoundidentifications AS ci
+ON aci.target_id=ci.unique_id
+WHERE ci.unique_id is null;
+
+-- work from atlases_compound_identifications down to everything else
+DELETE atlases_compound_identifications
+FROM atlases_compound_identifications
+LEFT JOIN atlases
+ON atlases.unique_id=atlases_compound_identifications.source_id
+WHERE atlases.unique_id is null;
+
+DELETE compoundidentifications
+FROM compoundidentifications
+LEFT JOIN atlases_compound_identifications AS aci
+ON aci.target_id=compoundidentifications.unique_id
+WHERE aci.target_id is null;
+
+DELETE compoundidentifications_compound
+FROM compoundidentifications_compound
+LEFT JOIN compoundidentifications AS ci
+ON ci.unique_id=compoundidentifications_compound.head_id
+WHERE ci.unique_id is null;
+
+DELETE compoundidentifications_rt_references
+FROM compoundidentifications_rt_references
+LEFT JOIN compoundidentifications AS ci
+ON ci.unique_id=compoundidentifications_rt_references.head_id
+WHERE ci.unique_id is null;
+
+DELETE compoundidentifications_mz_references
+FROM compoundidentifications_mz_references
+LEFT JOIN compoundidentifications AS ci
+ON ci.unique_id=compoundidentifications_mz_references.head_id
+WHERE ci.unique_id is null;
+
+DELETE compounds
+FROM compounds
+LEFT JOIN compoundidentifications_compound AS cic
+ON compounds.head_id=cic.target_id
+WHERE cic.target_id is null;
+
+DELETE rtreferences
+FROM rtreferences
+LEFT JOIN compoundidentifications_rt_references AS cirr
+ON rtreferences.head_id=cirr.target_id
+WHERE cirr.target_id is null;
+
+DELETE mzreferences
+FROM mzreferences
+LEFT JOIN compoundidentifications_mz_references AS cimr
+ON mzreferences.head_id=cimr.target_id
+WHERE cimr.target_id is null;
+
+-- SELECT table_name, table_rows FROM information_schema.TABLES WHERE table_schema='meta_atlas' ORDER BY table_rows DESC;

From d42778f07b69632d4bd0316f1b99446cce22b505 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Sun, 13 Jun 2021 22:44:00 -0700
Subject: [PATCH 020/177] WIP - update RT saving, new hits fixture

---
 docker/local_jupyter.sh                     |   7 +-
 metatlas/datastructures/metatlas_dataset.py |  25 +-
 metatlas/datastructures/object_helpers.py   |  25 +-
 metatlas/plots/dill2plots.py                |   2 -
 noxfile.py                                  |   3 +-
 tests/unit/conftest.py                      | 935 +++++++++++++------
 tests/unit/test_dill2plot.py                |   8 +-
 tests/unit/test_metatlas_dataset.py         |  80 +-
 8 files changed, 694 insertions(+), 391 deletions(-)

diff --git a/docker/local_jupyter.sh b/docker/local_jupyter.sh
index 23a29730..4dc0dda6 100755
--- a/docker/local_jupyter.sh
+++ b/docker/local_jupyter.sh
@@ -1,8 +1,10 @@
 #!/bin/bash
 
+set -euf -o pipefail
+
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 REPO_DIR=$(dirname "$SCRIPT_DIR")
-OUT_DIR="$(pwd)/out"
+OUT_DIR="${SCRIPT_DIR}/out"
 IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.2.0'
 PORT=8888
 
@@ -23,6 +25,9 @@ while [[ "$#" -gt 0 ]]; do
     shift
 done
 
+rm -rf "$OUT_DIR"
+mkdir -p "$OUT_DIR"
+
 docker run \
     --rm \
     -p "${PORT}:${PORT}" \
diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 5ce1f57c..8d21539c 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -343,18 +343,17 @@ def filter_compounds_by_signal(self, num_points, peak_height, name=None):
         keep_idxs = dp.strong_signal_compound_idxs(self, num_points, peak_height)
         self.filter_compounds(keep_idxs=keep_idxs, name=name)
 
-    def store_atlas(self, name=None, even_if_exists=False):
+    def store_atlas(self, even_if_exists=False):
         """
         inputs:
-            name: name to save to database, if None then use self.atlas.name
             even_if_exists: if True, will save the atlas even if the atlas name already is in the database
                             with your username
         side effects:
             Saves the atlas to the database.
            Raises ValueError if even_if_exists==False and name is already in the database with your username
         """
-        name = self.atlas.name if name is None else name
-        username = getpass.getuser()
+        name = self.atlas.name
+        username = self.ids.username
         try:
             if not even_if_exists and len(metob.retrieve("Atlas", name=name, username=username)) > 0:
                 raise ValueError(f"An atlas with name {name} and owned by {username} already exists.")
@@ -539,14 +538,18 @@ def set_rt(self, compound_idx, which, time):
             compound_idx: index of compound to update
             which: 'rt_min', 'rt_max', or 'rt_peak'
             time: a floating point value for the number of minutes
-        updates the RT value in 3 places so that no datastructures need to be invalidated
+        updates the RT value in database, self.atlas, self.atlas_df, self.data
+        so that no datastructures need to be invalidated
         """
         assert which in ["rt_min", "rt_peak", "rt_max"]
         atlas_rt_ref = self.atlas.compound_identifications[compound_idx].rt_references[0]
         setattr(atlas_rt_ref, which, time)
-        data_rt_ref = self.data[0][compound_idx]["identification"].rt_references[0]
-        setattr(data_rt_ref, which, time)
         self.atlas_df.loc[compound_idx, which] = time
+        _ = [
+            setattr(sample[compound_idx]["identification"].rt_references[0], which, time)
+            for sample in self.data
+        ]
+        metob.store(atlas_rt_ref)
 
     def set_note(self, compound_idx, which, value):
         """
@@ -554,7 +557,8 @@ def set_note(self, compound_idx, which, value):
         inputs:
             compound_idx: index of compound to update
             which: 'ms1_notes', 'ms2_notes' or 'identification_notes'
             value: a string with the note content
-        updates the RT value in 3 places so that no datastructures need to be invalidated
+        updates the notes value in database, self.atlas, self.atlas_df, self.data
+        so that no datastructures need to be invalidated
         """
         assert which in ["ms1_notes", "ms2_notes", "identification_notes"]
         atlas_cid = self.atlas.compound_identifications[compound_idx]
@@ -562,6 +566,7 @@ def set_note(self, compound_idx, which, value):
         data_cid = self.data[0][compound_idx]["identification"]
         setattr(data_cid, which, value)
         self.atlas_df.loc[compound_idx, which] = value
+        metob.store(atlas_cid)
 
     def compound_indices_marked_remove(self):
         """
@@ -823,13 +828,13 @@ def get_atlas(name, username):
     atlases = metob.retrieve("Atlas", name=name, username=username)
     try:
         if len(atlases) == 0:
-            raise ValueError(f'Database does not contain an atlas {name} owned by {username}.')
+            raise ValueError(f"Database does not contain an atlas {name} owned by {username}.")
     except ValueError as err:
         logger.exception(err)
         raise err
     try:
         if len(atlases) > 1:
-            raise ValueError(f'Database contains more than one atlas {name} owned by {username}.')
+            raise ValueError(f"Database contains more than one atlas {name} owned by {username}.")
     except ValueError as err:
         logger.exception(err)
         raise err
diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py
index c9f2956c..7234f1ca 100644
--- a/metatlas/datastructures/object_helpers.py
+++ b/metatlas/datastructures/object_helpers.py
@@ -175,21 +175,6 @@ def __init__(self):
         self.seen = dict()
         Workspace.instance = self
 
-    def get_connection(self):
-        """
-        Get a re-useable connection to the database.
-
-        Each activity that queries the database needs to have this function preceeding it.
- - """ - try: - if self.db.engine.name == 'mysql': - r = self.db.query('show tables') - else: - self.db.query('SELECT name FROM sqlite_master WHERE type = "table"') - except Exception: - self.db = dataset.connect(self.path) - def convert_to_double(self, table, entry): """Convert a table column to double type.""" with dataset.connect(self.path) as trans: @@ -230,9 +215,6 @@ def save_objects(self, objects, _override=False): self.fix_table(table_name) trans[table_name].insert_many(inserts) # print(table_name,inserts) - with dataset.connect(self.path) as trans: - pass - self.db = None def create_link_tables(self, klass): """ @@ -250,7 +232,6 @@ def create_link_tables(self, klass): target_id=uuid.uuid4().hex, target_table=uuid.uuid4().hex) trans[table_name].insert(link) - self.db = None def _get_save_data(self, obj, override=False): """Get the data that will be used to save an object to the database""" @@ -357,8 +338,6 @@ def retrieve(self, object_type, **kwargs): clauses.append('%s like "%s"' % (key, value.replace('*', '%'))) else: clauses.append('%s = "%s"' % (key, value)) - if 'unique_id' not in kwargs and klass: - clauses.append('unique_id = head_id') query += ' and '.join(clauses) + ')' if not clauses: query = query.replace(' where ()', '') @@ -373,7 +352,7 @@ def retrieve(self, object_type, **kwargs): raise(e) #print(query+'\n') # print('tables:') - # print([t for t in self.db.query('show tables')]) + # print([t for t in trans.query('show tables')]) items = [klass(**i) for i in items] uids = [i.unique_id for i in items] if not items: @@ -471,7 +450,6 @@ def remove(self, object_type, **kwargs): else: raise(e) print('Removed') - self.db = None def remove_objects(self, objects, all_versions=True, **kwargs): """Remove a list of objects from the database.""" @@ -514,7 +492,6 @@ def remove_objects(self, objects, all_versions=True, **kwargs): query += '")' trans.query(query) print(('Removed %s object(s)' % len(objects))) - self.db = None def format_timestamp(tstamp): diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 970d443b..24c3120f 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -575,7 +575,6 @@ def set_lin_log(self, label): def set_flag(self, name, value): logger.debug('Setting flag "%s" to "%s".', name, value) self.data.set_note(self.compound_idx, name, value) - metob.store(self.data[0][self.compound_idx]['identification']) def set_peak_flag(self, label): self.set_flag('ms1_notes', label) @@ -695,7 +694,6 @@ def update_rt(self, which, val): line = {'rt_min': self.min_line, 'rt_peak': self.peak_line, 'rt_max': self.max_line} self.data.set_rt(self.compound_idx, which, val) slider[which].valinit = val - metob.store(self.data.rts[self.compound_idx]) line[which].set_xdata((val, val)) if which != 'rt_peak': self.msms_zoom_factor = 1 diff --git a/noxfile.py b/noxfile.py index 7b3fe817..15b46ec0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -145,7 +145,7 @@ def blacken_nb(session): @nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV) def unit_tests(session): session.install("-r", "docker/requirements.txt", *pytest_deps) - session.run("pytest", "-vv", *session.posargs, "--cov", "metatlas", "tests/unit/") + session.run("pytest", "-vv", *session.posargs, "--cov", "metatlas", "tests/unit/", env={"METATLAS_LOCAL": "TRUE"}) @nox.session(python=py_versions[0], reuse_venv=REUSE_LARGE_VENV) @@ -159,6 +159,7 @@ def cov_report(session): "--cov-report", "term-missing", "tests/unit/", + env={"METATLAS_LOCAL": "TRUE"}, ) diff --git 
a/tests/unit/conftest.py b/tests/unit/conftest.py index 122316b5..91caf508 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -2,7 +2,7 @@ per-directory pytest configuration fixtures used across multiple files should go in here """ -# pylint: disable=missing-function-docstring,unused-argument +# pylint: disable=missing-function-docstring,unused-argument,line-too-long,too-many-lines,too-many-arguments import getpass import os @@ -12,6 +12,8 @@ import numpy as np import pandas as pd +from sqlalchemy.orm import close_all_sessions + from metatlas.datastructures import metatlas_dataset as mads from metatlas.datastructures import metatlas_objects as metob @@ -54,6 +56,8 @@ def fixture_sqlite(username): metob.store(metob.MzReference()) metob.store(metob.RtReference()) metob.store(metob.LcmsRun()) + yield + close_all_sessions() @pytest.fixture(name="sqlite_with_atlas") @@ -68,18 +72,13 @@ def fixture_sqlite_with_atlas_with_2_cids(sqlite, atlas_with_2_cids, username): metob.store(atlas_with_2_cids) -@pytest.fixture(scope="function", autouse=True) -def change_test_dir(request, tmp_path): +@pytest.fixture(name="change_test_dir", scope="function", autouse=True) +def fixture_change_test_dir(request, tmp_path): os.chdir(tmp_path) yield os.chdir(request.config.invocation_dir) -@pytest.fixture(scope="function", autouse=True) -def set_env_vars(monkeypatch): - monkeypatch.setenv("METATLAS_LOCAL", "TRUE") - - @pytest.fixture(name="ms1_pos") def fixture_ms1_pos(): return pd.DataFrame( @@ -520,7 +519,9 @@ def fixture_groups_controlled_vocab(): @pytest.fixture(name="metatlas_dataset") -def fixture_metatlas_dataset(mocker, df_container, analysis_ids, groups_controlled_vocab, lcmsrun, sqlite_with_atlas): +def fixture_metatlas_dataset( + mocker, df_container, analysis_ids, groups_controlled_vocab, lcmsrun, sqlite_with_atlas +): mocker.patch( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) @@ -530,7 +531,12 @@ def fixture_metatlas_dataset(mocker, df_container, analysis_ids, groups_controll @pytest.fixture(name="metatlas_dataset_with_2_cids") def fixture_metatlas_dataset_with_2_cids( - mocker, df_container, analysis_ids_with_2_cids, groups_controlled_vocab, lcmsrun, sqlite_with_atlas_with_2_cids + mocker, + df_container, + analysis_ids_with_2_cids, + groups_controlled_vocab, + lcmsrun, + sqlite_with_atlas_with_2_cids, ): mocker.patch( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container @@ -1090,7 +1096,9 @@ def fixture_group(lcmsrun, username): grp.creation_time = 1620146477 grp.last_modified = 1620146477 grp.prev_uid = "origin" - grp.name = f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + grp.name = ( + f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + ) grp.head_id = "61041d07b5a24ca5b88efbda8f319654" grp.short_name = "POS_Cone-S1" return grp @@ -1106,7 +1114,9 @@ def fixture_group_with_2_lcmsruns(lcmsrun, username): grp.creation_time = 1620146477 grp.last_modified = 1620146477 grp.prev_uid = "origin" - grp.name = f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + grp.name = ( + f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + ) grp.head_id = "61041d07b5a24ca5b88efbda8f319654" grp.short_name = "POS_Cone-S1" return grp @@ -1114,337 +1124,648 @@ def fixture_group_with_2_lcmsruns(lcmsrun, 
username):
 
 
 @pytest.fixture(name="hits")
 def fixture_hits():
+    """
+    the 'data' parameter to pd.DataFrame is generated by:
+    1. Run the docker testing image via docker/local_jupyter.sh
+    2. Open /src/notebooks/reference/Targeted.ipynb
+    3. Put the following in the second code block:
+        source_atlas = 'HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0'
+        metatlas_repo_path = '/src'
+        project_directory = '/out'
+        max_cpus = 2
+        experiment = '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583'
+    4. After metatlas_dataset has been created, add a code block:
+        import json
+        import pandas as pd
+        temp_df = metatlas_dataset.hits
+        temp_df['copy_index'] = temp_df.index
+        slice_df = temp_df.loc[:,:,"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5"]
+        slice_df.index = pd.MultiIndex.from_tuples(
+            slice_df["copy_index"], names=["database", "id", "file_name", "msms_scan"]
+        )
+        parsed = json.loads(slice_df.iloc[:4].to_json())
+        print(json.dumps(parsed, indent=4, sort_keys=True).replace('null', 'np.nan'))
+    5. Copy the output of that code block into the 'data' parameter of the DataFrame definition below
+    """
     hits_plus = pd.DataFrame(
         data={
-            "score": {
-                "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 0.7253785748,
-                "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 0.8688691781,
+            "adduct": {
+                "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "[M+H]+",
+                "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "[M+H]+",
+                "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "[M+H]+",
+                "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "[M+H]+",
             },
-            "num_matches": {
-                "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 6,
-                "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 7,
+            "copy_index": {
+                "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [
+                    "metatlas",
+                    "29247268c3cf4acfb649ebce7b0c9e0c",
+                    "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5",
+                    2.6239302158,
+                ],
+                "('metatlas', 
'50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ + "metatlas", + "50334867a31f4cab973459a59d5731c4", + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5", + 2.6239302158, + ], + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ + "metatlas", + "8ba70c0f245247eeb6ba90011026763a", + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5", + 2.6239302158, + ], + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ + "metatlas", + "9d53a44c42004e16a468e92e2b0a7009", + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5", + 2.6239302158, + ], + }, + "inchi_key": { + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "GFFGJBXGBJISGV-UHFFFAOYSA-N", + "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "GFFGJBXGBJISGV-UHFFFAOYSA-N", + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "GFFGJBXGBJISGV-UHFFFAOYSA-N", + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "GFFGJBXGBJISGV-UHFFFAOYSA-N", + }, + "measured_precursor_intensity": { + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 1779719.0, + "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 1779719.0, + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 1779719.0, + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 1779719.0, + }, + "measured_precursor_mz": { + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.06199646, + "('metatlas', '50334867a31f4cab973459a59d5731c4', 
'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.06199646, + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.06199646, + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.06199646, }, "msv_query_aligned": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ + [ + np.nan, + 53.2601699829, + 59.4822044373, + 65.2955932617, + 66.7956771851, + 75.0065155029, + 75.0689544678, + 75.4281921387, + 84.2779464722, + 91.0504608154, + 94.0367355347, + 102.1198806763, + 108.4924850464, + 119.0352630615, + 121.0889511108, + 123.1165771484, + 135.7551269531, + 136.0224761963, + 136.0620117188, + 136.1121368408, + 136.3276824951, + 137.046295166, + ], + [ + np.nan, + 2901.2893066406, + 3058.2041015625, + 2817.9626464844, + 3278.6765136719, + 3068.3347167969, + 8541.603515625, + 2778.4802246094, + 2839.1333007812, + 4060.1638183594, + 5292.673828125, + 3443.1560058594, + 3947.8520507812, + 8919.974609375, + 5798.638671875, + 3330.2827148438, + 2859.4689941406, + 18918.111328125, + 625742.3125, + 91467.8984375, + 4438.6645507812, + 11957.54296875, + ], + ], + "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ + [ + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 53.2601699829, + 59.4822044373, + 65.2955932617, + 66.7956771851, + 75.0065155029, + 75.0689544678, + 75.4281921387, + 84.2779464722, + 91.0504608154, + 94.0367355347, + 102.1198806763, + 108.4924850464, + 119.0352630615, + 121.0889511108, + 123.1165771484, + 135.7551269531, + 136.0224761963, + 136.0620117188, + 136.1121368408, + 136.3276824951, + 137.046295166, + ], + [ + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2901.2893066406, + 3058.2041015625, + 2817.9626464844, + 3278.6765136719, + 3068.3347167969, + 8541.603515625, + 2778.4802246094, + 2839.1333007812, + 4060.1638183594, + 5292.673828125, + 3443.1560058594, + 3947.8520507812, + 8919.974609375, + 5798.638671875, + 3330.2827148438, + 2859.4689941406, + 18918.111328125, + 625742.3125, + 91467.8984375, + 4438.6645507812, + 11957.54296875, + ], + ], + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ [ - None, - None, - None, - None, - 56.7212257385, - 59.0436058044, - 71.0422821045, - 73.0214157104, - None, - 89.1910018921, - 99.0413742065, - 104.3592529297, - 104.3681869507, - 
117.0548171997, - None, - 118.9432754517, - 136.0619506836, - None, - None, - None, - 145.9665527344, - 163.9772491455, - 169.9678497314, - 177.1133270264, - 187.9771575928, - 205.9878387451, - 210.9933166504, - 229.0038452148, - 252.0215606689, - 252.1087036133, - 252.1572875977, - 252.2064666748, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 53.2601699829, + 59.4822044373, + 65.2955932617, + 66.7956771851, + 75.0065155029, + 75.0689544678, + 75.4281921387, + 84.2779464722, + 91.0504608154, + 94.0367355347, + 102.1198806763, + 108.4924850464, + 119.0352630615, + 121.0889511108, + 123.1165771484, + 135.7551269531, + 136.0224761963, + 136.0620117188, + 136.1121368408, + 136.3276824951, + 137.046295166, ], [ - None, - None, - None, - None, - 3361.7712402344, - 6589.943359375, - 6501.9853515625, - 4987.177734375, - None, - 3257.0708007812, - 13393.138671875, - 3280.0544433594, - 4276.0112304688, - 57809.1875, - None, - 4965.7436523438, - 648640.5625, - None, - None, - None, - 11511.76171875, - 10362.68359375, - 5714.70703125, - 9354.2353515625, - 73409.0078125, - 257685.234375, - 53554.28125, - 193491.515625, - 5038.1469726562, - 93112.0859375, - 7624.11328125, - 4599.4125976562, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2901.2893066406, + 3058.2041015625, + 2817.9626464844, + 3278.6765136719, + 3068.3347167969, + 8541.603515625, + 2778.4802246094, + 2839.1333007812, + 4060.1638183594, + 5292.673828125, + 3443.1560058594, + 3947.8520507812, + 8919.974609375, + 5798.638671875, + 3330.2827148438, + 2859.4689941406, + 18918.111328125, + 625742.3125, + 91467.8984375, + 4438.6645507812, + 11957.54296875, ], ], - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ [ - None, - 50.2002449036, - 55.0126533508, - 57.0280647278, - None, - None, - None, - None, - 68.2973327637, - 69.0266494751, - 73.0213851929, - None, - 74.6972732544, - 80.862159729, - 82.4692306519, - 85.0231246948, - 87.0394363403, - 92.4544296265, - 92.4610061646, - 104.3785171509, - 115.0390701294, - 126.1923675537, - 133.0496368408, - 136.0618743896, - None, - None, - None, - None, - 144.5760345459, - 181.1904449463, - 230.6756896973, - 268.1039733887, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 53.2601699829, + 59.4822044373, + 65.2955932617, + 66.7956771851, + 75.0065155029, + 75.0689544678, + 75.4281921387, + 84.2779464722, + 91.0504608154, + 94.0367355347, + 102.1198806763, + 108.4924850464, + 119.0352630615, + 121.0889511108, + 123.1165771484, + 135.7551269531, + 136.0224761963, + 136.0620117188, + 136.1121368408, + 136.3276824951, + 137.046295166, ], [ - None, - 87283.4296875, - 105163.625, - 246350.078125, - None, - None, - None, - None, - 81607.3046875, - 107886.640625, - 150512.90625, - None, - 99324.7109375, - 80050.4375, - 108701.53125, - 278198.71875, - 95401.265625, - 92632.890625, - 111341.5625, - 119245.7734375, - 170358.671875, - 103961.4296875, - 226297.9375, - 48576460.0, - None, - None, - None, - None, - 98098.609375, - 100016.9296875, - 
119618.1015625, - 16002674.0, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2901.2893066406, + 3058.2041015625, + 2817.9626464844, + 3278.6765136719, + 3068.3347167969, + 8541.603515625, + 2778.4802246094, + 2839.1333007812, + 4060.1638183594, + 5292.673828125, + 3443.1560058594, + 3947.8520507812, + 8919.974609375, + 5798.638671875, + 3330.2827148438, + 2859.4689941406, + 18918.111328125, + 625742.3125, + 91467.8984375, + 4438.6645507812, + 11957.54296875, ], ], }, "msv_ref_aligned": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ [ - 57.0345, - 63.3177, - 63.3205, - 69.0344, - None, - None, - 71.0499, - 73.0292, - 84.9778, - None, - 99.0447, - None, - None, - 117.055, - 118.059, - None, + 51.3947, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 91.0548, + 94.0404, + np.nan, + np.nan, + 119.035, + np.nan, + np.nan, + np.nan, + 136.022, 136.062, - 137.066, - 236.709, - 253.112, - None, - None, - None, - None, - None, - None, - None, - None, - None, - 252.109, - None, - None, + 136.112, + np.nan, + 137.046, ], [ - 176328.0, - 328818.0, - 274432.0, - 197637.0, - None, - None, - 896360.0, - 1192020.0, - 378547.0, - None, - 3921880.0, - None, - None, - 15737700.0, - 266131.0, - None, - 144220000.0, - 3455270.0, - 185227.0, - 1284450.0, - None, - None, - None, - None, - None, - None, - None, - None, - None, - 20960800.0, - None, - None, + 1870.1, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 3051.11, + 13543.2, + np.nan, + np.nan, + 28284.0, + np.nan, + np.nan, + np.nan, + 55585.3, + 1607820.0, + 17469.6, + np.nan, + 43758.8, ], ], - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ + "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ [ - 56.7603, - None, - None, - 57.0346, - 61.0292, - 61.8182, - 64.9491, - 67.9255, - None, - None, - 73.0292, - 82.0663, - None, - None, - None, - 85.0293, - None, - None, - None, - None, - 115.04, - None, - 133.05, + 52.1001, + 53.5537, + 54.6096, + 57.8238, + 63.3067, + 64.108, + 82.7587, + 93.0862, + 94.6115, + 111.471, + 113.584, + 115.21, + 137.067, + 137.476, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 94.0407, + np.nan, + np.nan, + 119.036, + np.nan, + np.nan, + np.nan, + np.nan, 136.062, + np.nan, + np.nan, + 137.046, + ], + [ + 491091.0, + 614205.0, + 486992.0, + 569335.0, + 2513570.0, + 554436.0, + 577010.0, + 580100.0, + 930338.0, + 567270.0, + 515519.0, + 616418.0, + 17234000.0, + 693366.0, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2437690.0, + np.nan, + np.nan, + 7680000.0, + np.nan, + np.nan, + np.nan, + np.nan, + 514804000.0, + np.nan, + np.nan, + 4940020.0, + ], + ], + "('metatlas', 
'8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ + [ + 59.3596, + 62.4513, + 63.2027, + 76.4601, + 86.8208, + 115.912, + 115.975, + 123.375, 137.067, - 183.555, - 230.198, - 269.108, - None, - None, - None, - 268.105, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 94.0407, + np.nan, + np.nan, + 119.036, + np.nan, + np.nan, + np.nan, + np.nan, + 136.062, + np.nan, + np.nan, + 137.046, ], [ - 35523.7, - None, - None, - 184839.0, - 43216.2, - 40066.3, - 40362.0, - 41550.6, - None, - None, - 93791.1, - 293258.0, - None, - None, - None, - 202756.0, - None, - None, - None, - None, - 184050.0, - None, - 364543.0, - 29646700.0, - 830130.0, - 51455.4, - 51206.7, - 970064.0, - None, - None, - None, - 12412800.0, + 55769.1, + 43616.3, + 118692.0, + 54358.0, + 48393.1, + 45996.2, + 55157.9, + 61623.1, + 1357390.0, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 121260.0, + np.nan, + np.nan, + 306316.0, + np.nan, + np.nan, + np.nan, + np.nan, + 41864400.0, + np.nan, + np.nan, + 370525.0, + ], + ], + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": [ + [ + 55.0301, + 56.3854, + 66.7513, + 67.0298, + 81.1529, + 82.4076, + 92.0251, + 92.3892, + 104.302, + 109.051, + 112.051, + 135.054, + 135.653, + 136.227, + 136.474, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 94.0405, + np.nan, + np.nan, + 119.035, + np.nan, + np.nan, + np.nan, + np.nan, + 136.062, + np.nan, + np.nan, + 137.046, + ], + [ + 246689.0, + 186484.0, + 198526.0, + 974057.0, + 232546.0, + 306008.0, + 388476.0, + 265393.0, + 246201.0, + 1625240.0, + 1318880.0, + 345780.0, + 925801.0, + 254046.0, + 715569.0, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 7436560.0, + np.nan, + np.nan, + 23732500.0, + np.nan, + np.nan, + np.nan, + np.nan, + 884493000.0, + np.nan, + np.nan, + 23845700.0, ], ], }, "name": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "2'-deoxyadenosine", - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "adenosine", - }, - "adduct": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "[M+H]+", - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "[M+H]+", + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "adenine", + "('metatlas', '50334867a31f4cab973459a59d5731c4', 
'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "adenine", + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "adenine", + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": "adenine", }, - "inchi_key": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": "OIRDTQYFTABQOQ-KQYNXXCUSA-N", + "num_matches": { + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 7, + "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 4, + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 4, + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 4, }, "precursor_mz": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.1091393, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.1040539, - }, - "measured_precursor_mz": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 252.10887146, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 268.103729248, + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.0617952, + "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.0617952, + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', 
'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.0617952, + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 136.0617952, }, - "measured_precursor_intensity": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": 2872807.5, - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": 75979424.0, - }, - "copy_index": { - "('metatlas', 'c7dddd297e104ca79caea72a90150532', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5', 2.2203779220581055)": [ - "metatlas", - "c7dddd297e104ca79caea72a90150532", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", - 2.2203779221, - ], - "('metatlas', 'cf5e8df145f64bf0856fbf852d1bdb64', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5', 3.0264527797698975)": [ - "metatlas", - "cf5e8df145f64bf0856fbf852d1bdb64", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5", - 3.0264527798, - ], + "score": { + "('metatlas', '29247268c3cf4acfb649ebce7b0c9e0c', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 0.7861480398, + "('metatlas', '50334867a31f4cab973459a59d5731c4', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 0.8248297009, + "('metatlas', '8ba70c0f245247eeb6ba90011026763a', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 0.8078499983, + "('metatlas', '9d53a44c42004e16a468e92e2b0a7009', '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5', 2.6239302158355713)": 0.8274397807, }, } ) diff --git a/tests/unit/test_dill2plot.py b/tests/unit/test_dill2plot.py index e6e0f535..6aee1a09 100644 --- a/tests/unit/test_dill2plot.py +++ b/tests/unit/test_dill2plot.py @@ -76,9 +76,13 @@ def test_remove_metatlas_objects_by_list_remove_all(): assert [] == dill2plots.remove_metatlas_objects_by_list([i, j], "myattr", [0, 2, 5]) -def test_export_atlas_to_spreadsheet(atlas): +def test_export_atlas_to_spreadsheet(atlas, username): # pylint: disable=line-too-long - expected = """{"chebi_id":{"0":"CHEBI:17256"},"chebi_url":{"0":"http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:17256"},"creation_time":{"0":1466212395.0},"description":{"0":"A purine 2'-deoxyribonucleoside having adenine as the 
nucleobase."},"formula":{"0":"C10H13N5O3"},"head_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"hmdb_id":{"0":"HMDB00101"},"hmdb_url":{"0":"http://www.hmdb.ca/metabolites/HMDB00101"},"img_abc_id":{"0":""},"inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"iupac_name":{"0":""},"kegg_id":{"0":"C00559"},"kegg_url":{"0":"http://www.genome.jp/dbget-bin/www_bget?C00559"},"last_modified":{"0":1612996604.0},"lipidmaps_id":{"0":""},"lipidmaps_url":{"0":""},"metacyc_id":{"0":"DEOXYADENOSINE"},"mono_isotopic_molecular_weight":{"0":251.101839276},"name":{"0":"2'-deoxyadenosine"},"neutralized_2d_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)"},"neutralized_2d_inchi_key":{"0":"OLXZPDWKRNYJJZ-UHFFFAOYSA-N"},"neutralized_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"neutralized_inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"num_free_radicals":{"0":0.0},"number_components":{"0":1.0},"permanent_charge":{"0":0.0},"prev_uid":{"0":"origin"},"pubchem_compound_id":{"0":"13730"},"pubchem_url":{"0":"http://pubchem.ncbi.nlm.nih.gov/compound/13730"},"source":{"0":"gnps///chebi///metacyc///hmdb"},"synonyms":{"0":"2'-deoxyadenosine"},"unique_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"username":{"0":"wjholtz"},"wikipedia_url":{"0":""},"label":{"0":"2'-deoxyadenosine"},"id_notes":{"0":"No description"},"ms1_notes":{"0":"keep"},"ms2_notes":{"0":"bad match to ref"},"identification_notes":{"0":"my id note"},"rt_min":{"0":1.6964640054},"rt_max":{"0":2.6964640054},"rt_peak":{"0":2.1964640054},"mz":{"0":252.1091393},"mz_tolerance":{"0":20.0},"adduct":{"0":"[M+H]+"},"polarity":{"0":"positive"}}""" # noqa: E501 + expected = ( + """{"chebi_id":{"0":"CHEBI:17256"},"chebi_url":{"0":"http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:17256"},"creation_time":{"0":1466212395.0},"description":{"0":"A purine 2'-deoxyribonucleoside having adenine as the 
nucleobase."},"formula":{"0":"C10H13N5O3"},"head_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"hmdb_id":{"0":"HMDB00101"},"hmdb_url":{"0":"http://www.hmdb.ca/metabolites/HMDB00101"},"img_abc_id":{"0":""},"inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"iupac_name":{"0":""},"kegg_id":{"0":"C00559"},"kegg_url":{"0":"http://www.genome.jp/dbget-bin/www_bget?C00559"},"last_modified":{"0":1612996604.0},"lipidmaps_id":{"0":""},"lipidmaps_url":{"0":""},"metacyc_id":{"0":"DEOXYADENOSINE"},"mono_isotopic_molecular_weight":{"0":251.101839276},"name":{"0":"2'-deoxyadenosine"},"neutralized_2d_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)"},"neutralized_2d_inchi_key":{"0":"OLXZPDWKRNYJJZ-UHFFFAOYSA-N"},"neutralized_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"neutralized_inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"num_free_radicals":{"0":0.0},"number_components":{"0":1.0},"permanent_charge":{"0":0.0},"prev_uid":{"0":"origin"},"pubchem_compound_id":{"0":"13730"},"pubchem_url":{"0":"http://pubchem.ncbi.nlm.nih.gov/compound/13730"},"source":{"0":"gnps///chebi///metacyc///hmdb"},"synonyms":{"0":"2'-deoxyadenosine"},"unique_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"username":{"0":""" # noqa: E501 + f'"{username}"' + """},"wikipedia_url":{"0":""},"label":{"0":"2'-deoxyadenosine"},"id_notes":{"0":"No description"},"ms1_notes":{"0":"keep"},"ms2_notes":{"0":"bad match to ref"},"identification_notes":{"0":"my id note"},"rt_min":{"0":1.6964640054},"rt_max":{"0":2.6964640054},"rt_peak":{"0":2.1964640054},"mz":{"0":252.1091393},"mz_tolerance":{"0":20.0},"adduct":{"0":"[M+H]+"},"polarity":{"0":"positive"}}""" + ) # noqa: E501 assert expected == dill2plots.export_atlas_to_spreadsheet(atlas).to_json().replace(r"\/", "/") diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index a82326a7..72796f43 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -1,10 +1,9 @@ """ tests for MetatasDataset """ -# pylint: disable=missing-function-docstring,protected-access,unused-argument +# pylint: disable=missing-function-docstring,protected-access,unused-argument,too-many-arguments import datetime import glob import logging -import os import pandas as pd import pytest @@ -119,11 +118,10 @@ def test_rts02(metatlas_dataset): assert len(metatlas_dataset.rts) == 1 -def test_rts03(metatlas_dataset, analysis_ids, sqlite): +def test_rts03(metatlas_dataset, analysis_ids): + assert metatlas_dataset.rts[0].rt_max != 9.99 metatlas_dataset.set_rt(0, "rt_max", 9.99) - metob.store(metatlas_dataset.atlas) - atlas_from_db = metob.retrieve("Atlas", unique_id=metatlas_dataset.atlas.unique_id)[0] - second_metatlas_dataset = mads.MetatlasDataset(analysis_ids, atlas_from_db) + second_metatlas_dataset = mads.MetatlasDataset(analysis_ids) assert second_metatlas_dataset.rts[0].rt_max == 9.99 assert len(second_metatlas_dataset.rts) == 1 @@ -323,7 +321,7 @@ def test_set_data01(metatlas_dataset): assert metatlas_dataset[0][0]["identification"].ms2_notes == "extact match" -def test_store_atlas99(atlas, sqlite, username): +def test_store_atlas01(atlas, sqlite, username): atlas.name = "test_store_atlas01" atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) assert 
len(atlas_list) == 0 @@ -332,34 +330,24 @@ def test_store_atlas99(atlas, sqlite, username): assert len(second) == 1 -def test_store_atlas98(metatlas_dataset, atlas, sqlite, username): - atlas.name = "test_store_atlas01" - atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) +def test_store_atlas02(metatlas_dataset, username): + atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) assert len(atlas_list) == 0 metatlas_dataset.store_atlas() - second = metob.retrieve("Atlas", name=atlas.name, username=username) - assert len(second) == 1 - - -def test_store_atlas97(metatlas_dataset, atlas, sqlite, username): - atlas.name = "test_store_atlas01" - atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) - assert len(atlas_list) == 0 - metatlas_dataset.store_atlas(name='foobar', even_if_exists=True) - second = metob.retrieve("Atlas", name=atlas.name, username=username) + second = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) assert len(second) == 1 -def test_store_atlas96(metatlas_dataset, atlas, sqlite, username): - atlas.name = "test_store_atlas01" - atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username) +def test_store_atlas03(metatlas_dataset, atlas, sqlite, username): + metatlas_dataset.atlas.name = "test_store_atlas01" + atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) assert len(atlas_list) == 0 - metatlas_dataset.store_atlas(name='foobar', even_if_exists=True) - second = metob.retrieve("Atlas", name=atlas.name, username=username) + metatlas_dataset.store_atlas() + second = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) assert len(second) == 1 -def test_store_atlas01(metatlas_dataset, sqlite, username): +def test_store_atlas04(metatlas_dataset, sqlite, username): metatlas_dataset.atlas.name = "test_store_atlas01" atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) assert len(atlas_list) == 0 @@ -372,7 +360,7 @@ def test_store_atlas01(metatlas_dataset, sqlite, username): def test_analysis_identifiers01(sqlite): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"Database does not contain an atlas.*"): mads.AnalysisIdentifiers( "source_atlas_name_not_valid", "experiment_not_valid", @@ -383,8 +371,10 @@ def test_analysis_identifiers01(sqlite): ) -def test_analysis_identifiers02(mocker, sqlite_with_atlas, username): - with pytest.raises(ValueError): +def test_analysis_identifiers02(sqlite_with_atlas, username): + with pytest.raises( + ValueError, match='Parameter output_type is not one of "ISTDsEtc" or "FinalEMA-HILIC".' 
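# An aside on the match= arguments being added throughout these tests:
# pytest applies the string with re.search, so it is a regular
# expression, not a literal. A minimal, self-contained sketch of the
# idiom; check_polarity here is hypothetical, not a metatlas function.
import re

import pytest


def check_polarity(polarity):
    if polarity not in ("positive", "negative"):
        raise ValueError('Parameter polarity is not one of "positive" or "negative".')


def test_check_polarity_message():
    # re.escape guards the quotes in the expected message fragment
    with pytest.raises(ValueError, match=re.escape('not one of "positive"')):
        check_polarity("zero")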
+ ): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -395,8 +385,8 @@ def test_analysis_identifiers02(mocker, sqlite_with_atlas, username): ) -def test_analysis_identifiers03(mocker, username): - with pytest.raises(ValueError): +def test_analysis_identifiers03(username, sqlite_with_atlas): + with pytest.raises(ValueError, match='Parameter polarity is not one of "positive" or "negative".'): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -407,8 +397,8 @@ def test_analysis_identifiers03(mocker, username): ) -def test_analysis_identifiers04(mocker, username): - with pytest.raises(TypeError): +def test_analysis_identifiers04(username, sqlite_with_atlas): + with pytest.raises(TypeError, match="Parameter analysis_number is not an integer."): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -419,8 +409,8 @@ def test_analysis_identifiers04(mocker, username): ) -def test_analysis_identifiers05(mocker, username): - with pytest.raises(TypeError): +def test_analysis_identifiers05(username, sqlite_with_atlas): + with pytest.raises(TypeError, match="Parameter analysis_number is not an integer."): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -431,8 +421,8 @@ def test_analysis_identifiers05(mocker, username): ) -def test_analysis_identifiers06(mocker, username): - with pytest.raises(ValueError): +def test_analysis_identifiers06(username, sqlite_with_atlas): + with pytest.raises(ValueError, match="Parameter analysis_number cannot be negative."): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -443,11 +433,11 @@ def test_analysis_identifiers06(mocker, username): ) -def test_analysis_identifiers07(mocker, username): - with pytest.raises(ValueError): +def test_analysis_identifiers07(username, sqlite_with_atlas): + with pytest.raises(ValueError, match='Parameter experiment does contain 9 fields when split on "_".'): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "experiemnt_name_not_valid", + "experiment_name_not_valid", "output_type_not_valid", "polarity_not_valid", "analysis_number_not_valid", @@ -497,14 +487,14 @@ def test_get_atlas02(mocker, analysis_ids, caplog): caplog.set_level(logging.INFO) with pytest.raises(ValueError): mads.MetatlasDataset(analysis_ids) - assert "Database does not contain an atlas named" in caplog.text + assert "Database does not contain an atlas" in caplog.text def test_get_atlas03(mocker, analysis_ids, caplog): mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[0, 0]) with pytest.raises(ValueError): mads.MetatlasDataset(analysis_ids) - assert "Database contains more than one atlas named" in caplog.text + assert "Database contains more than one atlas" in caplog.text def test_existing_groups(mocker, metatlas_dataset): @@ -528,7 +518,9 @@ def test_store_groups02(metatlas_dataset, mocker, username): def group(): pass - group.name = 
f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + group.name = ( + f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1" + ) mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[group]) with pytest.raises(ValueError): metatlas_dataset.store_groups() @@ -552,7 +544,7 @@ def test_annotation_gui01(metatlas_dataset, hits, mocker): def test_generate_all_outputs01(metatlas_dataset, hits, mocker): - mocker.patch("metatlas.plots.dill2plots.get_msms_hits", hits) + mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) metatlas_dataset.generate_all_outputs() assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*")) == 12 assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*/*")) == 23 From 822a9fdafdeb94ae94a7191023fcc3df3d624a80 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 15 Jun 2021 10:44:40 -0700 Subject: [PATCH 021/177] WIP - improved handling of source atlas SQL logging remove some python2 compatibility code --- metatlas/datastructures/metatlas_dataset.py | 44 ++++++++-- metatlas/datastructures/object_helpers.py | 32 +------ metatlas/tools/logging.py | 97 +++++++++++++++------ metatlas/tools/notebook.py | 15 ++++ tests/unit/test_metatlas_dataset.py | 31 +++++-- 5 files changed, 150 insertions(+), 69 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 8d21539c..8ba2e3b6 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -215,10 +215,35 @@ def write_lcmsruns_short_names(self): def _get_atlas(self): """Copy source atlas from database into current analysis atlas""" - source = get_atlas(self.ids.source_atlas, self.ids.username) - self._atlas = source.clone() - self._atlas.name = self.ids.atlas - self._atlas_valid = True + atlases = metob.retrieve("Atlas", name=self.ids.atlas, username=self.ids.username) + if len(atlases) == 1: + logger.warning( + ( + "Destination atlas, %s, already exists, so not copying source atlas, " + "%s, to destination. Not overwriting." + ), + self.ids.atlas, + self.ids.source_atlas, + ) + self._atlas = atlases[0] + self._atlas_valid = True + elif len(atlases) > 1: + try: + raise ValueError( + ( + f"{len(atlases)} atlases with name {self.ids.atlas} " + f"and owned by {self.ids.username} already exist." 
+ ) + ) + except ValueError as err: + logger.exception(err) + raise err + else: + source = get_atlas(self.ids.source_atlas, self.ids.username) + self._atlas = source.clone() + self._atlas.name = self.ids.atlas + self._atlas_valid = True + self.store_atlas() def _build(self): """Populate self._data from database and h5 files.""" @@ -361,6 +386,7 @@ def store_atlas(self, even_if_exists=False): logger.exception(err) raise err metob.store(self.atlas) + logger.info("Atlas %s stored in database with owner %s.", self.ids.atlas, self.ids.username) def export_atlas_to_csv(self, filename=None): """ @@ -544,11 +570,9 @@ def set_rt(self, compound_idx, which, time): assert which in ["rt_min", "rt_peak", "rt_max"] atlas_rt_ref = self.atlas.compound_identifications[compound_idx].rt_references[0] setattr(atlas_rt_ref, which, time) - self.atlas_df.loc[compound_idx, which] = time - _ = [ + for sample in self.data: setattr(sample[compound_idx]["identification"].rt_references[0], which, time) - for sample in self.data - ] + self.atlas_df.loc[compound_idx, which] = time metob.store(atlas_rt_ref) def set_note(self, compound_idx, which, value): @@ -617,6 +641,8 @@ def get_lcmsruns_short_names(self, fields=None): for name, idxs in fields.items(): out.loc[i, name] = "_".join([tokens[n] for n in idxs]) out.loc[i, "last_modified"] = pd.to_datetime(lcms_file.last_modified, unit="s") + if out.empty: + return out out.sort_values(by="last_modified", inplace=True) out.drop(columns=["last_modified"], inplace=True) out.drop_duplicates(subset=["full_filename"], keep="last", inplace=True) @@ -655,6 +681,8 @@ def _files_dict(self): def groups_dataframe(self): """Returns pandas Dataframe with one group per row""" out = pd.DataFrame(self._files_dict).T + if out.empty: + return out out.drop(columns=["object"], inplace=True) out.index.name = "filename" return out.reset_index() diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py index 7234f1ca..ae0c816a 100644 --- a/metatlas/datastructures/object_helpers.py +++ b/metatlas/datastructures/object_helpers.py @@ -57,14 +57,6 @@ class NotifyList(list): __iadd__ = callback_method(list.__iadd__) __imul__ = callback_method(list.__imul__) - # Take care to return a new NotifyList if we slice it. 
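# Background on the Python 2 shims removed in this hunk, as a runnable
# Python 3 sketch (illustrative, not metatlas code). Two points:
#   1. Python 3 routes slicing through __getitem__ with a slice object,
#      so the __getslice__/__setslice__ overrides are no longer needed.
#   2. Python 2's input() evaluated what was typed, which is why 2to3
#      emits eval(input()); in Python 3 a plain input() is the safe
#      equivalent for y/n prompts.
class NotifyingList(list):
    def __getitem__(self, item):
        if isinstance(item, slice):  # slices arrive here in Python 3
            return self.__class__(list.__getitem__(self, item))
        return list.__getitem__(self, item)


def confirm(msg):
    return input(msg).lower().startswith("y")  # no eval() required


assert isinstance(NotifyingList([1, 2, 3, 4])[1:3], NotifyingList)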
- if sys.version_info[0] < 3: - __setslice__ = callback_method(list.__setslice__) - __delslice__ = callback_method(list.__delslice__) - - def __getslice__(self, *args): - return self.__class__(list.__getslice__(self, *args)) - def __getitem__(self, item): if isinstance(item, slice): return self.__class__(list.__getitem__(self, item)) @@ -129,10 +121,6 @@ def __init__(self): # from other locations # this directory contains the config files metatlas_dir = os.path.dirname(sys.modules[self.__class__.__module__].__file__) - #print("Metatlas live in ", metatlas_dir) - - host_name = socket.gethostname() - #print("asdf you're running on %s at %s " % (host_name, socket.gethostbyname(socket.gethostname()))) if ON_NERSC: with open(os.path.join(metatlas_dir, 'nersc_config', 'nersc.yml')) as fid: nersc_info = yaml.safe_load(fid) @@ -214,7 +202,6 @@ def save_objects(self, objects, _override=False): if 'sqlite' not in self.path: self.fix_table(table_name) trans[table_name].insert_many(inserts) - # print(table_name,inserts) def create_link_tables(self, klass): """ @@ -350,9 +337,6 @@ def retrieve(self, object_type, **kwargs): raise ValueError('Invalid column name, valid columns: %s' % keys) else: raise(e) - #print(query+'\n') - # print('tables:') - # print([t for t in trans.query('show tables')]) items = [klass(**i) for i in items] uids = [i.unique_id for i in items] if not items: @@ -367,7 +351,6 @@ def retrieve(self, object_type, **kwargs): continue querystr = 'select * from `%s` where source_id in ("' % table_name querystr += '" , "'.join(uids) - #print(querystr+'\n') result = trans.query(querystr + '")') sublist = defaultdict(list) for r in result: @@ -391,10 +374,7 @@ def remove(self, object_type, **kwargs): override = kwargs.pop('_override', False) if not override: msg = 'Are you sure you want to delete the entries? (Y/N)' - if sys.version.startswith('2'): - ans = input(msg) - else: - ans = eval(input(msg)) + ans = eval(input(msg)) if not ans[0].lower().startswith('y'): print('Aborting') return @@ -462,10 +442,7 @@ def remove_objects(self, objects, all_versions=True, **kwargs): if not override: msg = ('Are you sure you want to delete the %s object(s)? 
(Y/N)' % len(objects)) - if sys.version.startswith('2'): - ans = input(msg) - else: - ans = eval(input(msg)) + ans = eval(input(msg)) if not ans[0].lower().startswith('y'): print('Aborting') return @@ -590,10 +567,7 @@ def get_from_nersc(user, relative_path): print(cmd) proc = pexpect.spawn(cmd) proc.expect("assword:*") - if sys.version.startswith('3'): - passwd = eval(input()) - else: - passwd = input() + passwd = eval(input()) clear_output() proc.send(passwd) proc.send('\r') diff --git a/metatlas/tools/logging.py b/metatlas/tools/logging.py index 11cc098b..6f10a5ce 100644 --- a/metatlas/tools/logging.py +++ b/metatlas/tools/logging.py @@ -47,9 +47,12 @@ def format(self, record) -> str: return super().format(record) -def activate_logging(console_level="INFO", console_format=None, file_level="DEBUG", filename=None): +def activate_module_logging( + module, console_level="INFO", console_format=None, file_level="DEBUG", filename=None +): """ inputs: + module: name of logger to capture messages from, often a module name console_level: string with desired logging level for messages on stdout (notebook) file_level: string with desired logging level for message to log file filename: file to send logs to @@ -58,10 +61,59 @@ def activate_logging(console_level="INFO", console_format=None, file_level="DEBU Call this function to activate logging to console and file valid logging levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' """ - if console_format is None: - console_format = "{asctime} {color}{levelname:8}{reset} {message}" + console_handler = get_console_handler(console_level, console_format) + file_handler = get_file_handler(file_level, filename) + + logger = logging.getLogger(module) + logger.handlers[:] = [] + logger.addHandler(console_handler) + logger.addHandler(file_handler) + logger.setLevel( + levels[file_level] if levels[file_level] < levels[console_level] else levels[console_level] + ) + return logger + + +def disable_jupyter_default_logging(): + """ + stop jupyter from making its own root-level logger + note that jupyter delays creating the root-level logger until a log message is generated + """ + jupyter_logger = logging.getLogger() + jupyter_logger.handlers[:] = [] + jupyter_logger.addHandler(logging.NullHandler()) + + +def get_file_handler(level, filename=None): + """ + inputs: + level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' + filename: logging destination + Returns a logging.FileHandler object + """ + if filename is None: + if "METATLAS_LOCAL" in os.environ: + filename = "metatlas.log" + else: + filename = f"/global/cfs/projectdirs/m2650/jupyter_logs/{getpass.getuser()}.log" + file_formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(name)s;%(message)s") + file_handler = logging.FileHandler(filename) + file_handler.setFormatter(file_formatter) + file_handler.setLevel(levels[level]) + return file_handler + + +def get_console_handler(level, format_str=None): + """ + inputs: + level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' + format_str: input to logging.setFormatter + Returns a logging.StreamHandler object + """ + if format_str is None: + format_str = "{asctime} {color}{levelname:8}{reset} {message}" console_formatter = ColoredFormatter( - console_format, + format_str, style="{", datefmt="%Y-%m-%d %H:%M:%S", colors={ @@ -74,29 +126,20 @@ def activate_logging(console_level="INFO", console_format=None, file_level="DEBU ) console_handler = logging.StreamHandler(sys.stdout) console_handler.setFormatter(console_formatter) - 
console_handler.setLevel(levels[console_level]) + console_handler.setLevel(levels[level]) + return console_handler - if filename is None: - if "METATLAS_LOCAL" in os.environ: - filename = "metatlas.log" - else: - filename = f"/global/cfs/projectdirs/m2650/jupyter_logs/{getpass.getuser()}.log" - file_formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(name)s;%(message)s") - file_handler = logging.FileHandler(filename) - file_handler.setFormatter(file_formatter) - file_handler.setLevel(levels[file_level]) - # stop jupyter from making its own root-level logger - # note that jupyter delays creating the root-level logger until a log message is generated - jupyter_logger = logging.getLogger() - jupyter_logger.handlers[:] = [] - jupyter_logger.addHandler(logging.NullHandler()) +def activate_logging(console_level="INFO", console_format=None, file_level="DEBUG", filename=None): + """ + inputs: + console_level: string with desired logging level for messages on stdout (notebook) + file_level: string with desired logging level for message to log file + filename: file to send logs to + returns logger - logger = logging.getLogger("metatlas") - logger.handlers[:] = [] - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel( - levels[file_level] if levels[file_level] < levels[console_level] else levels[console_level] - ) - return logger + Call this function to activate logging to console and file + valid logging levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' + """ + disable_jupyter_default_logging() + activate_module_logging("metatlas", console_level, console_format, file_level, filename) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index e3c5f8b9..d9a8c21d 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -9,6 +9,7 @@ import pandas as pd from IPython.core.display import display, HTML from metatlas.tools.logging import activate_logging +from metatlas.tools.logging import activate_module_logging logger = logging.getLogger(__name__) @@ -86,3 +87,17 @@ def setup(log_level): validate_kernel() configure_notebook_display() configure_pandas_display() + + +def activate_sql_logging(console_level="INFO", console_format=None, file_level="DEBUG", filename=None): + """ + Turns on logging from sqlalchemy. + Level 'INFO' gets SQL statements and 'DEBUG' gets SQL statements and results. 
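# Usage sketch of the named-logger approach that activate_module_logging
# and activate_sql_logging rely on: Python loggers form a dot-separated
# hierarchy, so handlers attached to "sqlalchemy.engine" capture SQL
# statements (INFO) or statements plus result rows (DEBUG) without
# touching the root logger. Standard library only; levels as documented
# by SQLAlchemy.
import logging
import sys


def enable_sql_echo(level=logging.INFO):
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(asctime)s;%(levelname)s;%(name)s;%(message)s"))
    sql_logger = logging.getLogger("sqlalchemy.engine")
    sql_logger.addHandler(handler)
    sql_logger.setLevel(level)
    return sql_logger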
+ inputs: + console_level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' + console_format: input to logging.setFormatter + file_level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' + filename: logging destination + + """ + activate_module_logging("sqlalchemy.engine", console_level, console_format, file_level, filename) diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 72796f43..1973b1cf 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -126,6 +126,20 @@ def test_rts03(metatlas_dataset, analysis_ids): assert len(second_metatlas_dataset.rts) == 1 +def test_rts04(analysis_ids, sqlite_with_atlas, mocker, lcmsrun, df_container): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) + first = mads.MetatlasDataset(analysis_ids) + first.set_rt(0, "rt_max", 1.11) + second = mads.MetatlasDataset(analysis_ids) + assert second.rts[0].rt_max == 1.11 + second.set_rt(0, "rt_max", 2.22) + third = mads.MetatlasDataset(analysis_ids) + assert third.rts[0].rt_max == 2.22 + + def test_set_note01(metatlas_dataset, sqlite): metatlas_dataset.set_note(0, "ms2_notes", "Foobar") assert metatlas_dataset[0][0]["identification"].ms2_notes == "Foobar" @@ -332,8 +346,8 @@ def test_store_atlas01(atlas, sqlite, username): def test_store_atlas02(metatlas_dataset, username): atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) - assert len(atlas_list) == 0 - metatlas_dataset.store_atlas() + assert len(atlas_list) == 1 + metatlas_dataset.store_atlas(even_if_exists=True) second = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username) assert len(second) == 1 @@ -476,7 +490,6 @@ def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun, atlas, usernam "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[atlas]) mocker.patch("glob.glob", return_value=range(10)) metatlas_dataset = mads.MetatlasDataset(analysis_ids) assert metatlas_dataset.atlas.name == f"505892_OakGall_final_POS_{username}0" @@ -490,11 +503,19 @@ def test_get_atlas02(mocker, analysis_ids, caplog): assert "Database does not contain an atlas" in caplog.text -def test_get_atlas03(mocker, analysis_ids, caplog): +def test_get_atlas03(mocker, analysis_ids, caplog, username): mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[0, 0]) with pytest.raises(ValueError): mads.MetatlasDataset(analysis_ids) - assert "Database contains more than one atlas" in caplog.text + assert ( + f"2 atlases with name 505892_OakGall_final_POS_{username}0 and owned by {username} already exist." 
+ in caplog.text + ) + + +def test_get_atlas04(metatlas_dataset, username): + atlases = metob.retrieve("Atlas", name="This_atlas_does_not_exists", username=username) + assert len(atlases) == 0 def test_existing_groups(mocker, metatlas_dataset): From 50d8348c120d78f923bf05428477213dcf0f2018 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 16 Jun 2021 07:54:46 -0700 Subject: [PATCH 022/177] WIP - cache groups --- metatlas/datastructures/metatlas_dataset.py | 41 ++++++++++++++++----- tests/unit/test_metatlas_dataset.py | 4 ++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 8ba2e3b6..e9544fb9 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -18,6 +18,9 @@ from metatlas.plots import dill2plots as dp MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" +POLARITIES = ['positive', 'negative', 'alternating'] +OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC"] + logger = logging.getLogger(__name__) @@ -56,10 +59,10 @@ def validate(self): get_atlas(self.source_atlas, self.username) # will raise error if not found or matches multiple if len(self.experiment.split("_")) != 9: raise ValueError('Parameter experiment does contain 9 fields when split on "_".') - if self.output_type not in ["ISTDsEtc", "FinalEMA-HILIC"]: - raise ValueError('Parameter output_type is not one of "ISTDsEtc" or "FinalEMA-HILIC".') - if self.polarity not in ["positive", "negative"]: - raise ValueError('Parameter polarity is not one of "positive" or "negative".') + if self.output_type not in OUTPUT_TYPES: + raise ValueError(f"Parameter output_type is not one of: {quoted_string_list(OUTPUT_TYPES)}.") + if self.polarity not in POLARITIES: + raise ValueError(f"Parameter polarity is not one of: {quoted_string_list(POLARITIES)}.") if not isinstance(self.analysis_number, numbers.Integral): raise TypeError("Parameter analysis_number is not an integer.") if self.analysis_number < 0: @@ -117,6 +120,11 @@ def short_polarity(self): """Short polarity identifier: first 3 letters, upper case""" return self.polarity[:3].upper() + @property + def short_polarity_inverse(self): + """Returns the short_polarity values not used in this analysis""" + return list({[p[:3].upper() for p in POLARITIES]}-set([self.short_polarity])) + @property def output_dir(self): """Creates the output directory and returns the path as a string""" @@ -128,7 +136,7 @@ def output_dir(self): class MetatlasDataset: """ Like the non-object oriented metatlas_dataset, you can index into this class by file_idx and compound_idx: - metatlas_dataset = MetatlasDataset(atlas, groups) + metatlas_dataset = MetatlasDataset(analysis_ids) metatlas_dataset[0][0]['identification'].compound[0].inchi_key But MetatlasDataset adds additional functionality, such as: @@ -154,6 +162,7 @@ def __init__( frag_mz_tolerance=0.01, msms_refs_loc=MSMS_REFS_PATH, max_cpus=1, + save_metadata=True, ): """ inputs: @@ -161,6 +170,7 @@ def __init__( groups_controlled_vocab: array of strings that will group together when creating groups application of groups_controlled_vocab is case insensitive exclude_files: array of strings that will exclude files if they are substrings of the filename + save_metadata: if True, write metadata files containing data sources and LCMS runs short name """ self.ids = ids self._atlas = None @@ -173,6 +183,8 @@ def __init__( self._data_valid = False self._hits = None 
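# The _groups/_groups_valid pair added here follows the module's lazy
# cache-with-invalidation convention; reduced to a minimal illustrative
# class (not the metatlas implementation):
class GroupCache:
    def __init__(self, compute):
        self._compute = compute  # expensive callable, e.g. a database query
        self._groups = None
        self._groups_valid = False

    @property
    def groups(self):
        if not self._groups_valid:  # lazily (re)build on first access
            self._groups = self._compute()
            self._groups_valid = True
        return self._groups

    def invalidate(self):  # call when an upstream setting changes
        self._groups_valid = False


cache = GroupCache(lambda: ["Cone-S1", "Cone-S2"])
assert cache.groups == ["Cone-S1", "Cone-S2"]  # computed once, then cached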
self._hits_valid = False + self._groups = None + self._groups_valid = False self._groups_controlled_vocab = [] if groups_controlled_vocab is None else groups_controlled_vocab self._exclude_files = [] if exclude_files is None else exclude_files self._extra_time = extra_time @@ -182,8 +194,9 @@ def __init__( self._msms_refs_loc = msms_refs_loc self.max_cpus = max_cpus self._get_atlas() - self.write_data_source_files() - self.write_lcmsruns_short_names() + if save_metadata: + self.write_data_source_files() + self.write_lcmsruns_short_names() def write_data_source_files(self): """Write the data source files if they don't already exist""" @@ -690,10 +703,12 @@ def groups_dataframe(self): @property def groups(self): """Returns a list of Group objects""" + if self._groups_valid: + return self._groups file_dict = self._files_dict - out = [] + self._groups = [] for values in self.groups_dataframe.to_dict("index").values(): - out.append( + self._groups.append( metob.Group( name=values["group"], short_name=values["short_name"], @@ -704,7 +719,8 @@ def groups(self): ], ) ) - return out + self._groups_valid = True + return self._groups def store_groups(self, exist_ok=False): """ @@ -867,3 +883,8 @@ def get_atlas(name, username): logger.exception(err) raise err return atlases[0] + + +def quoted_string_list(strings): + """Adds double quotes around each string and seperates with ', '.""" + return ', '.join([f'"{x}"' for x in strings]) diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 1973b1cf..9b9963a2 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -569,3 +569,7 @@ def test_generate_all_outputs01(metatlas_dataset, hits, mocker): metatlas_dataset.generate_all_outputs() assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*")) == 12 assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*/*")) == 23 + + +def test_short_polarity_inverse01(analysis_ids): + assert analysis_ids.short_polarity_inverse == ['NEG', 'ALT'] From f765ff7e4005a19cd63ff99c65cdd6f23977cdaf Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 17 Jun 2021 08:06:49 -0700 Subject: [PATCH 023/177] WIP - improved output generation --- metatlas/datastructures/metatlas_dataset.py | 55 +++++++++++---- metatlas/io/targeted_output.py | 2 +- metatlas/tools/fastanalysis.py | 76 +++++++++++---------- notebooks/reference/Targeted.ipynb | 57 ++-------------- pyproject.toml | 3 + tests/unit/test_metatlas_dataset.py | 9 ++- 6 files changed, 96 insertions(+), 106 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index e9544fb9..05ba6fa9 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -7,9 +7,11 @@ import numbers import os import shutil +import sys import humanize import pandas as pd +import tqdm from metatlas.datastructures import metatlas_objects as metob from metatlas.io import metatlas_get_data_helper_fun as ma_data @@ -18,7 +20,9 @@ from metatlas.plots import dill2plots as dp MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" -POLARITIES = ['positive', 'negative', 'alternating'] +POLARITIES = ["positive", "negative", "fast-polarity-switching"] +SHORT_POLARITIES = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"} + OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC"] logger = logging.getLogger(__name__) @@ -56,7 +60,8 @@ def __init__( def validate(self): """Valid class 
inputs""" - get_atlas(self.source_atlas, self.username) # will raise error if not found or matches multiple + if self._source_atlas is not None: + get_atlas(self.source_atlas, self.username) # will raise error if not found or matches multiple if len(self.experiment.split("_")) != 9: raise ValueError('Parameter experiment does contain 9 fields when split on "_".') if self.output_type not in OUTPUT_TYPES: @@ -117,13 +122,13 @@ def short_experiment_analysis(self): @property def short_polarity(self): - """Short polarity identifier: first 3 letters, upper case""" - return self.polarity[:3].upper() + """Short polarity identifier: 3 letters, upper case""" + return SHORT_POLARITIES[self.polarity] @property def short_polarity_inverse(self): """Returns the short_polarity values not used in this analysis""" - return list({[p[:3].upper() for p in POLARITIES]}-set([self.short_polarity])) + return list(set(SHORT_POLARITIES.values()) - {self.short_polarity}) @property def output_dir(self): @@ -193,10 +198,12 @@ def __init__( self._frag_mz_tolerance = frag_mz_tolerance self._msms_refs_loc = msms_refs_loc self.max_cpus = max_cpus - self._get_atlas() + if ids.source_atlas is not None: + self._get_atlas() if save_metadata: self.write_data_source_files() self.write_lcmsruns_short_names() + self.store_groups(exist_ok=True) def write_data_source_files(self): """Write the data source files if they don't already exist""" @@ -260,6 +267,7 @@ def _get_atlas(self): def _build(self): """Populate self._data from database and h5 files.""" + logger.info("Loading data into MetatlasDataset") start_time = datetime.datetime.now() files = [] for group in self.groups: @@ -274,11 +282,9 @@ def _build(self): self.extra_mz, ) ) - if self.max_cpus > 1 and len(files) > 1: - with multiprocessing.Pool(processes=min(self.max_cpus, len(files))) as pool: - samples = pool.map(ma_data.get_data_for_atlas_df_and_file, files) - else: # skip multiprocessing as this makes for easier debugging - samples = [ma_data.get_data_for_atlas_df_and_file(i) for i in files] + samples = parallel_process( + ma_data.get_data_for_atlas_df_and_file, files, self.max_cpus, unit="sample" + ) self._data = tuple(MetatlasSample(x) for x in samples) logger.info( "MetatlasDataset with %d files built in %s.", @@ -620,6 +626,8 @@ def lcmsruns(self): return self._runs self._runs = dp.get_metatlas_files(experiment=self.ids.experiment, name="%") self._runs_valid = True + for run in self._runs: + logger.info("Run: %s", run.name) logger.info("Number of LCMS output files matching '%s' is: %d.", self.ids.experiment, len(self._runs)) return self._runs @@ -692,7 +700,7 @@ def _files_dict(self): @property def groups_dataframe(self): - """Returns pandas Dataframe with one group per row""" + """Returns pandas Dataframe with one row per file""" out = pd.DataFrame(self._files_dict).T if out.empty: return out @@ -707,7 +715,8 @@ def groups(self): return self._groups file_dict = self._files_dict self._groups = [] - for values in self.groups_dataframe.to_dict("index").values(): + unique_groups = self.groups_dataframe[["group", "short_name"]].drop_duplicates() + for values in unique_groups.to_dict("index").values(): self._groups.append( metob.Group( name=values["group"], @@ -778,6 +787,7 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): msms_fragment_ions: if True, generate msms fragment ions report overwrite: if False, throw error if any output files already exist """ + self.extra_time = 0.5 targeted_output.write_atlas_to_spreadsheet(self, overwrite) 
targeted_output.write_stats_table(self, overwrite) targeted_output.write_chromatograms(self, overwrite) @@ -887,4 +897,21 @@ def get_atlas(name, username): def quoted_string_list(strings): """Adds double quotes around each string and seperates with ', '.""" - return ', '.join([f'"{x}"' for x in strings]) + return ", ".join([f'"{x}"' for x in strings]) + + +def parallel_process(function, data, max_cpus, unit=None): + """ + performs map(function, data) using multiprocessing module but + adds a progress bar and bypasses multiprocessing in the 1 cpu case as this makes debugging easier + inputs: + function: the function to apply + data: iterater containing the inputs to function + max_cpus: number of cpus to use + unit: string label for what is processed in one iteration, default 'it' + """ + kwargs = {"file": sys.stdout, "unit": unit, "colour": "green"} + if max_cpus > 1 and len(data) > 1: + with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool: + return list(tqdm.tqdm(pool.imap(function, data), length=len(data), **kwargs)) + return [function(i) for i in tqdm.tqdm(data, **kwargs)] diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 322cc3ad..4a350134 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -20,7 +20,7 @@ def write_atlas_to_spreadsheet(metatlas_dataset, overwrite=False): """Save atlas as csv file. Will not overwrite existing file unless overwrite is True""" export_atlas_filename = os.path.join( metatlas_dataset.ids.output_dir, - f"{metatlas_dataset.ids.short_polarity}_{metatlas_dataset.atlas.name}_export", + f"{metatlas_dataset.ids.short_polarity}_{metatlas_dataset.atlas.name}_export.csv", ) write_utils.check_existing_file(export_atlas_filename, overwrite) dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas, export_atlas_filename) diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py index de9a4cdc..ed1e6625 100644 --- a/metatlas/tools/fastanalysis.py +++ b/metatlas/tools/fastanalysis.py @@ -58,24 +58,27 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, metatlas_dataset = ma_data.get_dill_data(os.path.expandvars(input_fname)) else: metatlas_dataset = input_dataset - + dataset = dp.filter_runs(metatlas_dataset, include_lcmsruns, include_groups, + exclude_lcmsruns, exclude_groups) metrics = ['msms_score', 'num_frag_matches', 'mz_centroid', 'mz_ppm', 'rt_peak', 'rt_delta', 'peak_height', 'peak_area', 'num_data_points'] dfs = {m: None for m in metrics} for metric in ['peak_height', 'peak_area', 'rt_peak', 'mz_centroid']: - dfs[metric] = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname=metric, use_labels=use_labels, output_loc=os.path.join(output_loc, 'data_sheets'), polarity=polarity) - metatlas_dataset = dp.filter_runs(metatlas_dataset, include_lcmsruns, include_groups, - exclude_lcmsruns, exclude_groups) + dfs[metric] = dp.make_output_dataframe(input_dataset=dataset, + fieldname=metric, + use_labels=use_labels, + output_loc=os.path.join(output_loc, 'data_sheets'), + polarity=polarity) final_df = pd.DataFrame(columns=['index']) - file_names = ma_data.get_file_names(metatlas_dataset) - compound_names = ma_data.get_compound_names(metatlas_dataset, use_labels=use_labels)[0] + file_names = ma_data.get_file_names(dataset) + compound_names = ma_data.get_compound_names(dataset, use_labels=use_labels)[0] passing = {m: np.ones((len(compound_names), len(file_names))).astype(float) for m in metrics} dfs['mz_ppm'] = dfs['peak_height'].copy() 
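# parallel_process above pairs multiprocessing.Pool with a tqdm progress
# bar and bypasses the pool when max_cpus == 1 so breakpoint debugging
# works. A runnable reduction under those same assumptions; note that
# tqdm's keyword for sizing a lazy iterator such as pool.imap is total=.
import multiprocessing
import sys

import tqdm


def _square(value):  # pool workers must be top-level, picklable functions
    return value * value


def parallel_map(function, data, max_cpus):
    kwargs = {"file": sys.stdout, "total": len(data), "unit": "sample"}
    if max_cpus > 1 and len(data) > 1:
        with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool:
            return list(tqdm.tqdm(pool.imap(function, data), **kwargs))
    return [function(i) for i in tqdm.tqdm(data, **kwargs)]  # 1-cpu bypass


if __name__ == "__main__":
    print(parallel_map(_square, list(range(8)), max_cpus=2))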
dfs['mz_ppm'] *= np.nan - dfs['num_data_points'] = pd.DataFrame([[len(ma_data.extract(metatlas_dataset, [i, j, 'data', 'eic', 'intensity'], default=[])) - for i in range(len(metatlas_dataset))] - for j in range(len(metatlas_dataset[0]))]) + dfs['num_data_points'] = pd.DataFrame([[len(ma_data.extract(dataset, [i, j, 'data', 'eic', 'intensity'], default=[])) + for i in range(len(dataset))] + for j in range(len(dataset[0]))]) dfs['num_data_points'].index = dfs['mz_ppm'].index dfs['msms_score'] = dfs['mz_ppm'].copy() dfs['num_frag_matches'] = dfs['mz_ppm'].copy() @@ -92,8 +95,8 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, for compound_idx, compound_name in enumerate(compound_names): - ref_rt_peak = metatlas_dataset[0][compound_idx]['identification'].rt_references[0].rt_peak - ref_mz = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz + ref_rt_peak = dataset[0][compound_idx]['identification'].rt_references[0].rt_peak + ref_mz = dataset[0][compound_idx]['identification'].mz_references[0].mz dfs['rt_delta'].iloc[compound_idx] = abs(ref_rt_peak - dfs['rt_peak'].iloc[compound_idx]) passing['rt_delta'][compound_idx] = (abs(ref_rt_peak - np.nan_to_num(dfs['rt_peak'].iloc[compound_idx].values)) <= rt_tolerance).astype(float) @@ -102,12 +105,12 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, passing['mz_ppm'][compound_idx] = (dfs['mz_ppm'].iloc[compound_idx].values <= ppm_tolerance).astype(float) try: - inchi_key = metatlas_dataset[0][compound_idx]['identification'].compound[0].inchi_key + inchi_key = dataset[0][compound_idx]['identification'].compound[0].inchi_key except: inchi_key = '' - compound_ref_rt_min = metatlas_dataset[0][compound_idx]['identification'].rt_references[0].rt_min - compound_ref_rt_max = metatlas_dataset[0][compound_idx]['identification'].rt_references[0].rt_max - cid = metatlas_dataset[0][compound_idx]['identification'] + compound_ref_rt_min = dataset[0][compound_idx]['identification'].rt_references[0].rt_min + compound_ref_rt_max = dataset[0][compound_idx]['identification'].rt_references[0].rt_max + cid = dataset[0][compound_idx]['identification'] mz_theoretical = cid.mz_references[0].mz comp_msms_hits = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) \ @@ -119,7 +122,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, comp_msms_hits = comp_msms_hits.sort_values('score', ascending=False) file_idxs, scores, msv_sample_list, msv_ref_list, rt_list = [], [], [], [], [] if len(comp_msms_hits) > 0 and not np.isnan(np.concatenate(comp_msms_hits['msv_ref_aligned'].values, axis=1)).all(): - file_idxs = [file_names.index(f) for f in comp_msms_hits['file_name']] + file_idxs = [file_names.index(f) for f in comp_msms_hits['file_name'] if f in file_names] scores = comp_msms_hits['score'].values.tolist() msv_sample_list = comp_msms_hits['msv_query_aligned'].values.tolist() msv_ref_list = comp_msms_hits['msv_ref_aligned'].values.tolist() @@ -130,14 +133,14 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, avg_rt_measured = [] intensities = pd.DataFrame() for file_idx, file_name in enumerate(file_names): - if metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']: - if not np.isnan(metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['mz_centroid']): - avg_mz_measured.append(metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['mz_centroid']) - if not 
np.isnan(metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['rt_peak']): - avg_rt_measured.append(metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['rt_peak']) - if not np.isnan(metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['peak_height']): + if dataset[file_idx][compound_idx]['data']['ms1_summary']: + if not np.isnan(dataset[file_idx][compound_idx]['data']['ms1_summary']['mz_centroid']): + avg_mz_measured.append(dataset[file_idx][compound_idx]['data']['ms1_summary']['mz_centroid']) + if not np.isnan(dataset[file_idx][compound_idx]['data']['ms1_summary']['rt_peak']): + avg_rt_measured.append(dataset[file_idx][compound_idx]['data']['ms1_summary']['rt_peak']) + if not np.isnan(dataset[file_idx][compound_idx]['data']['ms1_summary']['peak_height']): intensities.loc[file_idx, 'file_id'] = file_idx - intensities.loc[file_idx, 'intensity'] = metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['peak_height'] + intensities.loc[file_idx, 'intensity'] = dataset[file_idx][compound_idx]['data']['ms1_summary']['peak_height'] avg_mz_measured = np.mean(avg_mz_measured) avg_rt_measured = np.mean(avg_rt_measured) @@ -159,20 +162,20 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, inchi_key_map = {} for compound_iterator in range(len(compound_names)): if use_labels: - cpd_labels.append(metatlas_dataset[0][compound_iterator]['identification'].name) + cpd_labels.append(dataset[0][compound_iterator]['identification'].name) else: - cpd_labels.append(metatlas_dataset[0][compound_iterator]['identification'].compound[0].name) + cpd_labels.append(dataset[0][compound_iterator]['identification'].compound[0].name) if(len(cid.compound) != 0): #Loop through compounds to identify overlapping compounds for compound_iterator in range(len(compound_names)): - if len(metatlas_dataset[0][compound_iterator]['identification'].compound) == 0: + if len(dataset[0][compound_iterator]['identification'].compound) == 0: continue if use_labels: - cpd_iter_label = metatlas_dataset[0][compound_iterator]['identification'].name + cpd_iter_label = dataset[0][compound_iterator]['identification'].name else: - cpd_iter_label = metatlas_dataset[0][compound_iterator]['identification'].compound[0].name - cpd_iter_id = metatlas_dataset[0][compound_iterator]['identification'] + cpd_iter_label = dataset[0][compound_iterator]['identification'].compound[0].name + cpd_iter_id = dataset[0][compound_iterator]['identification'] cpd_iter_mz = cpd_iter_id.mz_references[0].mz cid_mass = cid.compound[0].mono_isotopic_molecular_weight cpd_iter_mass = cpd_iter_id.compound[0].mono_isotopic_molecular_weight @@ -258,7 +261,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'max_intensity'] = intensities.loc[intensities['intensity'].idxmax()]['intensity'] max_intensity_file_id = int(intensities.loc[intensities['intensity'].idxmax()]['file_id']) final_df.loc[compound_idx, 'max_intensity_file'] = file_names[max_intensity_file_id] - final_df.loc[compound_idx, 'ms1_rt_peak'] = metatlas_dataset[max_intensity_file_id][compound_idx]['identification'].rt_references[0].rt_peak + final_df.loc[compound_idx, 'ms1_rt_peak'] = dataset[max_intensity_file_id][compound_idx]['identification'].rt_references[0].rt_peak else: final_df.loc[compound_idx, 'max_intensity'] = "" final_df.loc[compound_idx, 'max_intensity_file'] = "" @@ -290,16 +293,17 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, 
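# The msms-hit selections in this make_stats_table hunk keep rows whose
# measured precursor m/z sits within the reference's ppm tolerance; the
# comparison distilled into one function:
def within_ppm(measured_mz, reference_mz, tolerance_ppm):
    """True when |measured - reference| / reference is at most tolerance_ppm."""
    return abs(measured_mz - reference_mz) / reference_mz <= tolerance_ppm * 1e-6


assert within_ppm(252.1091, 252.1091393, 20.0)  # about 0.16 ppm apart
assert not within_ppm(252.1200, 252.1091393, 20.0)  # about 43 ppm apart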
final_df.loc[compound_idx, 'rt_measured'] = float("%.2f" % avg_rt_measured) final_df.loc[compound_idx, 'rt_error'] = float("%.2f" % abs(cid.rt_references[0].rt_peak - avg_rt_measured)) - for file_idx, file_name in enumerate(file_names): if len(msms_hits_df) == 0: rows = [] else: - rows = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) & \ - (msms_hits_df['file_name'] == file_name) & \ - (msms_hits_df['msms_scan'] >= compound_ref_rt_min) & (msms_hits_df['msms_scan'] <= compound_ref_rt_max) & \ - ((abs(msms_hits_df['measured_precursor_mz'].values.astype(float) - metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz)/metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz) \ - <= metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz_tolerance*1e-6)] + mz_ref = dataset[0][compound_idx]['identification'].mz_references[0] + rows = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) & + (msms_hits_df['file_name'] == file_name) & + (msms_hits_df['msms_scan'] >= compound_ref_rt_min) & + (msms_hits_df['msms_scan'] <= compound_ref_rt_max) & + ((abs(msms_hits_df['measured_precursor_mz'].values.astype(float) + - mz_ref[0].mz)/mz_ref[0].mz) <= mz_ref[0].mz_tolerance*1e-6)] if len(rows) == 0: dfs['msms_score'].iat[compound_idx, file_idx] = np.nan diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index 32dd87f4..638efea1 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -153,29 +153,6 @@ ")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Select groups of files to operate on\n", - "\n", - "Here, you will assign your database groups to a local variable which will be used downstream in the notebook for analyzing your data with an atlas.\n", - "\n", - "1. in block below, fill out the fields for name, include_list and exclude_list using text strings from the group names you created in the previous step. The include/exlcude lists do not need wildcards. Name is a string unique to all of your groups (ex. fields 0-11 of your filenames)\n", - "\n", - "### Typically, you will run one polarity at a time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Select Atlas to use\n", - "\n", - "1. The first block will retrieve a list of atlases matching the 'name' string that you enter. Also, you must enter your username.\n", - "2. The next block will select one from the list, using the index number. 
Make sure to enter the index number for the atlas you want to use for your analysis by setting in this line: atlas_idx = 0" - ] - }, { "cell_type": "code", "execution_count": null, @@ -184,15 +161,7 @@ "source": [ "metatlas_dataset = mads.MetatlasDataset(\n", " ids, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files\n", - ")\n", - "metatlas_dataset.lcmsruns_dataframe[\"name\"].tolist()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Optional: Filter atlas for compounds with no or low signals" + ")" ] }, { @@ -208,36 +177,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "agui = metatlas_dataset.annotation_gui(compound_idx=0, width=15, height=3, colors=line_colors)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Export results files\n", - "### Filter out compounds with ms1_notes of 'remove'" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "metatlas_dataset.filter_compounds_ms1_notes_remove()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Export Outputs" + "if metatlas_dataset.ids.output_type in [\"FinalEMA-HILIC\"]:\n", + " metatlas_dataset.filter_compounds_ms1_notes_remove()" ] }, { @@ -267,7 +220,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.4" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index ab5095d1..3bc7895d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,5 +7,8 @@ disable = "C0330, C0326" [tool.pylint.format] max-line-length = "110" +[tool.pylint.TYPECHECK] +generated-members = "pandas.*" + [tool.pytest.ini_options] filterwarnings = "ignore::DeprecationWarning:dataset.*:" diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 9b9963a2..01ecf8fd 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -387,7 +387,7 @@ def test_analysis_identifiers01(sqlite): def test_analysis_identifiers02(sqlite_with_atlas, username): with pytest.raises( - ValueError, match='Parameter output_type is not one of "ISTDsEtc" or "FinalEMA-HILIC".' + ValueError, match='Parameter output_type is not one of: "ISTDsEtc", "FinalEMA-HILIC".' 
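# The message this test now matches comes from quoted_string_list in the
# dataset module; a standalone copy showing the exact output format:
def quoted_string_list(strings):
    """Adds double quotes around each string and separates with ', '."""
    return ", ".join([f'"{x}"' for x in strings])


assert quoted_string_list(["ISTDsEtc", "FinalEMA-HILIC"]) == '"ISTDsEtc", "FinalEMA-HILIC"'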
): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", @@ -400,7 +400,10 @@ def test_analysis_identifiers02(sqlite_with_atlas, username): def test_analysis_identifiers03(username, sqlite_with_atlas): - with pytest.raises(ValueError, match='Parameter polarity is not one of "positive" or "negative".'): + with pytest.raises( + ValueError, + match='Parameter polarity is not one of: "positive", "negative", "fast-polarity-switching".', + ): mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -572,4 +575,4 @@ def test_generate_all_outputs01(metatlas_dataset, hits, mocker): def test_short_polarity_inverse01(analysis_ids): - assert analysis_ids.short_polarity_inverse == ['NEG', 'ALT'] + assert set(analysis_ids.short_polarity_inverse) == {"NEG", "FPS"} From 1e8eff48cad82a7f3f867ddea45896c0c51efdcd Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 17 Jun 2021 08:53:25 -0700 Subject: [PATCH 024/177] WIP - add analysis_setup.sh script --- metatlas/scripts/analysis_setup.sh | 26 ++++++++++++++++++++++++++ notebooks/reference/Targeted.ipynb | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100755 metatlas/scripts/analysis_setup.sh diff --git a/metatlas/scripts/analysis_setup.sh b/metatlas/scripts/analysis_setup.sh new file mode 100755 index 00000000..ad30530f --- /dev/null +++ b/metatlas/scripts/analysis_setup.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -euf -o pipefail + +if [[ $# -ne 3 ]]; then + echo "Usage: $0 metatlas_repo_dir base_output_dir experiment_id" + exit 1 +fi + +REPO_DIR="$1" +OUT_DIR="$2" +EXP="$3" + +DATA_DIR="/project/projectdirs/metatlas/raw_data/akuftin/${EXP}" + +if [ ! -d "${DATA_DIR}" ]; then + echo "ERROR: could not find data directory ${DATA_DIR}." 
>&2 + exit 2 +fi + +IFS='_' read -ra EXP_ARRAY <<< "$EXP" +NOTEBOOK_BASE="${EXP_ARRAY[3]}_${EXP_ARRAY[4]}" + +mkdir "$OUT_DIR" +cp "${REPO_DIR}/notebooks/reference/Workflow_Notebook_VS_Auto_RT_Predict_V2.ipynb" "${OUT_DIR}/${NOTEBOOK_BASE}_RT_Predict.ipynb" +cp "${REPO_DIR}/notebooks/reference/Targeted.ipynb" "${OUT_DIR}/${NOTEBOOK_BASE}.ipynb" diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index 638efea1..561009f5 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -22,7 +22,7 @@ "# pylint: disable=invalid-name,missing-module-docstring\n", "\n", "# source atlas name\n", - "source_atlas = \"REPLACE ME\"\n", + "source_atlas = None\n", "\n", "# this atlas will be copied to an atlas named projectId_experimentName_sampleSet_polarity_analysisId\n", "# where projectId is JGI Proposal ID Number\n", From 6a80cc612ed574c22995398c8a11cad96ec9eb7f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 17 Jun 2021 09:12:29 -0700 Subject: [PATCH 025/177] WIP - fix analysis_setup.sh --- metatlas/scripts/analysis_setup.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metatlas/scripts/analysis_setup.sh b/metatlas/scripts/analysis_setup.sh index ad30530f..edc6af91 100755 --- a/metatlas/scripts/analysis_setup.sh +++ b/metatlas/scripts/analysis_setup.sh @@ -21,6 +21,6 @@ fi IFS='_' read -ra EXP_ARRAY <<< "$EXP" NOTEBOOK_BASE="${EXP_ARRAY[3]}_${EXP_ARRAY[4]}" -mkdir "$OUT_DIR" -cp "${REPO_DIR}/notebooks/reference/Workflow_Notebook_VS_Auto_RT_Predict_V2.ipynb" "${OUT_DIR}/${NOTEBOOK_BASE}_RT_Predict.ipynb" -cp "${REPO_DIR}/notebooks/reference/Targeted.ipynb" "${OUT_DIR}/${NOTEBOOK_BASE}.ipynb" +mkdir -p "${OUT_DIR}/${EXP}" +cp "${REPO_DIR}/notebooks/reference/Workflow_Notebook_VS_Auto_RT_Predict_V2.ipynb" "${OUT_DIR}/${EXP}/${NOTEBOOK_BASE}_RT_Predict.ipynb" +cp "${REPO_DIR}/notebooks/reference/Targeted.ipynb" "${OUT_DIR}/${EXP}/${NOTEBOOK_BASE}.ipynb" From 4e144a84778b307da8035ac8fdcc10011d0a11a3 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 17 Jun 2021 15:02:46 -0700 Subject: [PATCH 026/177] WIP - fix indexing error in fastanalysis --- metatlas/tools/fastanalysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py index ed1e6625..9d02cbbd 100644 --- a/metatlas/tools/fastanalysis.py +++ b/metatlas/tools/fastanalysis.py @@ -303,7 +303,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, (msms_hits_df['msms_scan'] >= compound_ref_rt_min) & (msms_hits_df['msms_scan'] <= compound_ref_rt_max) & ((abs(msms_hits_df['measured_precursor_mz'].values.astype(float) - - mz_ref[0].mz)/mz_ref[0].mz) <= mz_ref[0].mz_tolerance*1e-6)] + - mz_ref.mz)/mz_ref.mz) <= mz_ref.mz_tolerance*1e-6)] if len(rows) == 0: dfs['msms_score'].iat[compound_idx, file_idx] = np.nan From 66cf0906ebf57e668d6e8dead435192d77143279 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 17 Jun 2021 15:13:35 -0700 Subject: [PATCH 027/177] WIP - add output_type field to dest atlas name --- metatlas/datastructures/metatlas_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 05ba6fa9..1588beb4 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -102,7 +102,7 @@ def analysis_number(self): def atlas(self): """Atlas identifier (name)""" exp_tokens = 
self.experiment.split("_")
-        return f"{'_'.join(exp_tokens[3:6])}_{self.short_polarity}_{self.analysis}"
+        return f"{'_'.join(exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}"
 
     @property
     def username(self):

From aa3d5e939394ea14e1015ee672140123b87f35d1 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 17 Jun 2021 15:38:42 -0700
Subject: [PATCH 028/177] WIP - create tar.gz output file

---
 metatlas/datastructures/metatlas_dataset.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 1588beb4..74c2be5f 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -8,6 +8,7 @@
 import os
 import shutil
 import sys
+import tarfile
 
 import humanize
 import pandas as pd
@@ -118,7 +119,7 @@ def analysis(self):
     def short_experiment_analysis(self):
         """Short experiment analysis identifier"""
         exp_tokens = self.experiment.split("_")
-        return f"{exp_tokens[0]}_{exp_tokens[3]}_{self.analysis}"
+        return f"{exp_tokens[0]}_{exp_tokens[3]}_{self.output_type}_{self.analysis}"
 
     @property
     def short_polarity(self):
@@ -788,6 +789,7 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False):
             overwrite: if False, throw error if any output files already exist
         """
         self.extra_time = 0.5
+        logger.info("extra_time set to 0.5 minutes for output generation.")
         targeted_output.write_atlas_to_spreadsheet(self, overwrite)
         targeted_output.write_stats_table(self, overwrite)
         targeted_output.write_chromatograms(self, overwrite)
@@ -795,6 +797,13 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False):
         targeted_output.write_metrics_and_boxplots(self, overwrite)
         if msms_fragment_ions:
             targeted_output.write_msms_fragment_ions(self, overwrite)
+        logger.info("Generation of output files completed successfully.")
+        logger.info("Generating archive of output files.")
+        output_path = os.path.join(
+            self.ids.project_directory, self.experiment, f"{self.ids.short_experiment_analysis}.tar.gz"
+        )
+        with tarfile.open(output_path, "w:gz") as tar:
+            tar.add(self.ids.output_dir, arcname=os.path.basename(self.ids.output_dir))
 
 
 class MetatlasSample:

From 15d994e07a55478e457c2464b7c2f53a4fe8c7c3 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Mon, 21 Jun 2021 10:24:24 -0700
Subject: [PATCH 029/177] WIP - add notebook variable setup scripting

---
 metatlas/datastructures/metatlas_dataset.py |   2 +-
 metatlas/scripts/analysis_setup.sh          |  21 +++-
 metatlas/tools/environment.py               | 114 ++++++++++++++++++++
 tests/unit/test_metatlas_dataset.py         |   8 +-
 4 files changed, 136 insertions(+), 9 deletions(-)
 create mode 100644 metatlas/tools/environment.py

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 74c2be5f..5285cccb 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -800,7 +800,7 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False):
         logger.info("Generation of output files completed successfully.")
         logger.info("Generating archive of output files.")
         output_path = os.path.join(
-            self.ids.project_directory, self.experiment, f"{self.ids.short_experiment_analysis}.tar.gz"
+            self.ids.project_directory, self.ids.experiment, f"{self.ids.short_experiment_analysis}.tar.gz"
         )
         with tarfile.open(output_path, "w:gz") as tar:
             tar.add(self.ids.output_dir, arcname=os.path.basename(self.ids.output_dir))
diff --git 
a/metatlas/scripts/analysis_setup.sh b/metatlas/scripts/analysis_setup.sh index edc6af91..3c472cf1 100755 --- a/metatlas/scripts/analysis_setup.sh +++ b/metatlas/scripts/analysis_setup.sh @@ -3,21 +3,34 @@ set -euf -o pipefail if [[ $# -ne 3 ]]; then - echo "Usage: $0 metatlas_repo_dir base_output_dir experiment_id" - exit 1 + echo "Usage: $0 metatlas_repo_dir base_output_dir experiment_id" + exit 1 fi REPO_DIR="$1" OUT_DIR="$2" EXP="$3" +function install_kernel () { + local REPO_DIR="$1" + local SOURCE="${REPO_DIR}/notebooks/kernels/metatlas-targeted.kernel.json" + local DEST="${HOME}/.local/share/jupyter/kernels/metatlas-targeted/kernel.json" + if [[ ! -f "$DEST" ]]; then + mkdir -p $(dirname "$DEST") + cp "$SOURCE" "$DEST" + fi +} + +function validate_data_dir () { + local EXP="$1" DATA_DIR="/project/projectdirs/metatlas/raw_data/akuftin/${EXP}" if [ ! -d "${DATA_DIR}" ]; then - echo "ERROR: could not find data directory ${DATA_DIR}." >&2 - exit 2 + echo "ERROR: could not find data directory ${DATA_DIR}." >&2 + exit 2 fi +function populate_experiment_dir () { IFS='_' read -ra EXP_ARRAY <<< "$EXP" NOTEBOOK_BASE="${EXP_ARRAY[3]}_${EXP_ARRAY[4]}" diff --git a/metatlas/tools/environment.py b/metatlas/tools/environment.py new file mode 100644 index 00000000..aabcc48f --- /dev/null +++ b/metatlas/tools/environment.py @@ -0,0 +1,114 @@ +"""Environment setup functions""" + +import getpass +import json +import logging +import os +import re +import shutil +import sys + +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def install_kernel(): + """ + Copies kernel.json from repo to active location under home directory. + Only for use on NERC! + """ + logger.info('Installing kernel.json for "Metatlas Targeted".') + repo_path = Path(__file__).resolve().parent.parent.parent + source = repo_path / "notebooks" / "kernels" / "metatlas-targeted.kernel.json" + dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" + os.makedirs(dest_dir, exist_ok=True) + shutil.copyfile(source, dest_dir / "kernel.json") + logger.info( + 'Kernel installation complete. Reload Jupyter notebook page to see new kernel". 
' + ) + + +def repo_dir(): + """Returns a string with the path to the root of the Metatlas git repo""" + return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +def create_all_notebooks(output_type, base_output_dir, experiment_id, analysis_number): + """ + Creates Jupyter notebooks with appropriate filename and pre-populated parameters + inputs: + output_type: one of 'RT-Predict', 'ISTDsEtc', 'FinalEMA-HILIC' + base_output_dir: project directory containing the experiment directories + experiment_id: '_' delimited experiment identifier + analysis_number: increment to not overwrite existing analysis outputs + """ + possible_outputs = ['RT-Predict', 'ISTDsEtc', 'FinalEMA-HILIC'] + outputs = possible_outputs[:(1+possible_outputs.index(output_type))] + source_notebook = {'RT-Predict': 'Workflow_Notebook_VS_Auto_RT_Predict_V2.ipynb', + 'ISTDsEtc': 'Targeted.ipynb', + 'FinalEMA-HILIC': 'Targeted.ipynb'} + source_atlas_prefix = { + 'RT-Predict': None, + 'ISTDsEtc': 'HILICz150_ANT20190824_PRD_IS_LabUnlab2_', + 'FinalEMA-HILIC': 'HILICz150_ANT20190824_TPL_EMA_Unlab_'} + parameters = { + 'experiment': experiment_id, + 'metatlas_repo_path': repo_dir(), + 'output_directory': base_output_dir, + 'analysis_number': analysis_number} + analysis_id = f"{getpass.getuser()}{parameters['analysis_number']}" + tokens = parameters['experiment'].split('_') + output_dir = os.path.join(base_output_dir, experiment_id) + os.makedirs(output_dir, exist_ok=True) + for output in outputs: + parameters['output_type'] = output + for polarity in (['positive', 'negative'] if output != 'RT-Predict' else [None]): + source = os.path.join(repo_dir(), 'notebooks', 'reference', source_notebook[output]) + if polarity is not None: + parameters['polarity'] = polarity + pol = polarity[:3].upper() + parameters['source_atlas'] = f"{source_atlas_prefix[output]}_{pol}_{tokens[3]}_{analysis_id}" + generate_notebook(source, output_dir, parameters) + + +def generate_notebook(source, output_dir, parameters): + """ + Creates a notebook from source in output_dir that has updated parameters. 
+ inputs: + source: path of input Jupyter notebook + output_dir: directory to write output Jupyter notebook + parameters: dict of parameters to update in the notebook + parameters must have atleast the following keys: analysis_number, experiment, output_type + """ + if 'polarity' in parameters: + pol = parameters['polarity'][:3].upper() + suffix = f"{parameters['output_type']}_{pol}" + else: + suffix = 'RT-Predict' + tokens = parameters['experiment'].split('_') + dest = os.path.join(output_dir, '_'.join(tokens[3:5]+[suffix])+'.ipynb') + create_notebook_with_parameters(source, dest, parameters) + + +def create_notebook_with_parameters(source, dest, parameters): + with open(source) as source_fh: + data = json.load(source_fh) + eq_pat = re.compile('^[^#]') + for line in data['cells'][1]['source']: + if '=' in line: + print(line) + + +def validate_data_dir(base_data_dir, experiment_id): + """Raise FileNotFoundError if base_data_dir / experiment_id is not an existing directory""" + experiment_dir = os.path.join(base_data_dir, experiment_id) + try: + if not os.path.isdir(experiment_dir): + raise FileNotFoundError(f"Data directory does not exist at {experiment_dir}.") + except FileNotFoundError as err: + logger.exception(err) + raise err + +source = '/global/homes/w/wjholtz/metatlas-dev/notebooks/reference/Targeted.ipynb' +create_notebook_with_parameters(source, 'foo', {}) diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 01ecf8fd..3b7c83c1 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -463,13 +463,13 @@ def test_analysis_identifiers07(username, sqlite_with_atlas): def test_analysis_identifiers_atlas01(analysis_ids, username): - assert analysis_ids.atlas == f"505892_OakGall_final_POS_{username}0" + assert analysis_ids.atlas == f"505892_OakGall_final_FinalEMA-HILIC_POS_{username}0" def test_analysis_identifiers_atlas02(analysis_ids, username): # call .atlas twice to get cached value analysis_ids.atlas # pylint: disable=pointless-statement - assert analysis_ids.atlas == f"505892_OakGall_final_POS_{username}0" + assert analysis_ids.atlas == f"505892_OakGall_final_FinalEMA-HILIC_POS_{username}0" def test_write_data_source_files01(metatlas_dataset, mocker, caplog): @@ -495,7 +495,7 @@ def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun, atlas, usernam mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) mocker.patch("glob.glob", return_value=range(10)) metatlas_dataset = mads.MetatlasDataset(analysis_ids) - assert metatlas_dataset.atlas.name == f"505892_OakGall_final_POS_{username}0" + assert metatlas_dataset.atlas.name == f"505892_OakGall_final_FinalEMA-HILIC_POS_{username}0" def test_get_atlas02(mocker, analysis_ids, caplog): @@ -511,7 +511,7 @@ def test_get_atlas03(mocker, analysis_ids, caplog, username): with pytest.raises(ValueError): mads.MetatlasDataset(analysis_ids) assert ( - f"2 atlases with name 505892_OakGall_final_POS_{username}0 and owned by {username} already exist." + f"2 atlases with name 505892_OakGall_final_FinalEMA-HILIC_POS_{username}0 and owned by {username} already exist." 
in caplog.text ) From 0673e207549e0099d237fed80323ebb94418b66a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 21 Jun 2021 10:22:01 -0700 Subject: [PATCH 030/177] WIP - move rt_predict code to package --- metatlas/tools/predict_rt.py | 320 +++++++++++++++++++++++++++++++++++ 1 file changed, 320 insertions(+) create mode 100644 metatlas/tools/predict_rt.py diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py new file mode 100644 index 00000000..401ab6e0 --- /dev/null +++ b/metatlas/tools/predict_rt.py @@ -0,0 +1,320 @@ +"""Generate Retention Time Correction Model""" + +import os +import sys +import time +import multiprocessing +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from metatlas.datastructures import metatlas_dataset as mads +from metatlas.datastructures import metatlas_objects as metob +from metatlas.io import metatlas_get_data_helper_fun as ma_data +from metatlas.plots import dill2plots as dp + + +def generate_rt_correction_models(ids, groups_controlled_vocab, exclude_files, include_groups): + metatlas_dataset = mads.MetatlasDataset(ids, groups_controlled_vocab, exclude_files, save_metadata=False) + groups = get_groups(metatlas_dataset, include_groups) + files_df = get_files_df(groups) + qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids) + + +def get_groups(metatlas_dataset, include_groups): + metatlas_dataset.store_groups(exist_ok=True) + ids = metatlas_dataset.ids + groups = dp.select_groups_for_analysis( + name=f"{ids.experiment}_{ids.short_polarity}_%", + most_recent=True, + remove_empty=True, + include_list=include_groups, + exclude_list=ids.short_polarity_inverse, + ) + return sorted(groups, key=lambda x: x.name) + + +def get_files_df(groups): + files_df = pd.DataFrame(columns=["file", "time", "group"]) + for group in groups: + for run in group.items: + time = run.accquistion_time if hasattr(run, "acquisition_time") else 0 + files_df = files_df.append({"file": run, "time": time, "group": group}, ignore_index=True) + return files_df.sort_values(by=["time"]) + + +def get_qc_atlas(ids): + qc_atlas_name = f"HILICz150_ANT20190824_TPL_QCv3_Unlab_{ids.short_polarity}" + atlas = metob.retrieve('Atlas', name=qc_atlas_name, username='vrsingan')[0] + atlas_df = ma_data.make_atlas_df(atlas) + atlas_df['label'] = [cid.name for cid in atlas.compound_identifications] + return atlas, atlas_df + + +def load_runs(metatlas_dataset, files_df, qc_atlas_df, qc_atlas): + files = [] + for file_data in files_df.iterrows(): + files.append((file_data[1].file, file_data[1].group, qc_atlas_df, qc_atlas)) + if metatlas_dataset.max_cpus > 1 and len(files) > 1: + with multiprocessing.Pool(processes=min(metatlas_dataset.max_cpus, len(files))) as pool: + data = pool.map(ma_data.get_data_for_atlas_df_and_file, files) + else: # skip multiprocessing as this makes for easier debugging + data = [ma_data.get_data_for_atlas_df_and_file(i) for i in files] + return data + + +rts_df = dp.make_output_dataframe( + input_dataset=metatlas_dataset, + fieldname="rt_peak", + use_labels=True, + output_loc=output_data_qc, + summarize=True, +) +rts_df.to_csv(os.path.join(output_data_qc, "QC_Measured_RTs.csv")) + +import itertools +import math +from __future__ import division +from matplotlib import gridspec +import matplotlib.ticker as mticker + +rts_df["atlas RT peak"] = [ + compound["identification"].rt_references[0].rt_peak for compound in metatlas_dataset[0] +] +# number of columns in rts_df that are not values from a specific input file +num_not_files = 
len(rts_df.columns) - len(metatlas_dataset) +rts_df_plot = ( + rts_df.sort_values(by="standard deviation", ascending=False, na_position="last") + .drop(["#NaNs"], axis=1) + .dropna(axis=0, how="all", subset=rts_df.columns[:-num_not_files]) +) + +fontsize = 2 +pad = 0.1 +cols = 8 +rows = int(math.ceil((rts_df.shape[0] + 1) / 8)) + +fig = plt.figure() +gs = gridspec.GridSpec(rows, cols, figure=fig, wspace=0.2, hspace=0.4) +for i, (index, row) in enumerate(rts_df_plot.iterrows()): + ax = fig.add_subplot(gs[i]) + ax.tick_params(direction="in", length=1, pad=pad, width=0.1, labelsize=fontsize) + ax.scatter(range(rts_df_plot.shape[1] - num_not_files), row[:-num_not_files], s=0.2) + ticks_loc = np.arange(0, len(rts_df_plot.columns) - num_not_files, 1.0) + ax.axhline(y=row["atlas RT peak"], color="r", linestyle="-", linewidth=0.2) + ax.set_xlim(-0.5, len(rts_df_plot.columns) - num_not_files + 0.5) + ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) + range_columns = list(rts_df_plot.columns[:-num_not_files]) + ["atlas RT peak"] + ax.set_ylim(np.nanmin(row.loc[range_columns]) - 0.12, np.nanmax(row.loc[range_columns]) + 0.12) + [s.set_linewidth(0.1) for s in ax.spines.values()] + # truncate name so it fits above a single subplot + ax.set_title(row.name[:33], pad=pad, fontsize=fontsize) + ax.set_xlabel("Files", labelpad=pad, fontsize=fontsize) + ax.set_ylabel("Actual RTs", labelpad=pad, fontsize=fontsize) + +plt.savefig(os.path.join(output_data_qc, "Compound_Atlas_RTs.pdf"), bbox_inches="tight") + +for i, a in enumerate(rts_df.columns): + print(i, a) + +selected_column = 9 + +from sklearn.linear_model import LinearRegression, RANSACRegressor +from sklearn.preprocessing import PolynomialFeatures +from sklearn.metrics import mean_absolute_error as mae + +actual_rts, pred_rts, polyfit_rts = [], [], [] + +current_actual_df = rts_df.loc[:, rts_df.columns[selected_column]] +bad_qc_compounds = np.where(~np.isnan(current_actual_df)) +current_actual_df = current_actual_df.iloc[bad_qc_compounds] +current_pred_df = atlas_df.iloc[bad_qc_compounds][["rt_peak"]] +actual_rts.append(current_actual_df.values.tolist()) +pred_rts.append(current_pred_df.values.tolist()) + +ransac = RANSACRegressor(random_state=42) +rt_model_linear = ransac.fit(current_pred_df, current_actual_df) +coef_linear = rt_model_linear.estimator_.coef_[0] +intercept_linear = rt_model_linear.estimator_.intercept_ + +poly_reg = PolynomialFeatures(degree=2) +X_poly = poly_reg.fit_transform(current_pred_df) +rt_model_poly = LinearRegression().fit(X_poly, current_actual_df) +coef_poly = rt_model_poly.coef_ +intercept_poly = rt_model_poly.intercept_ + +for i in range(rts_df.shape[1] - 5): + current_actual_df = rts_df.loc[:, rts_df.columns[i]] + bad_qc_compounds = np.where(~np.isnan(current_actual_df)) + current_actual_df = current_actual_df.iloc[bad_qc_compounds] + current_pred_df = atlas_df.iloc[bad_qc_compounds][["rt_peak"]] + actual_rts.append(current_actual_df.values.tolist()) + pred_rts.append(current_pred_df.values.tolist()) + +# User can change to use particular qc file +import itertools +import math +from __future__ import division +from matplotlib import gridspec + +x = list(itertools.chain(*pred_rts)) +y = list(itertools.chain(*actual_rts)) + +rows = int(math.ceil((rts_df.shape[1] + 1) / 5)) +cols = 5 +fig = plt.figure(constrained_layout=False) + +gs = gridspec.GridSpec(rows, cols, figure=fig) +plt.rc("font", size=6) +plt.rc("axes", labelsize=6) +plt.rc("xtick", labelsize=3) +plt.rc("ytick", labelsize=3) + + +for i in 
range(rts_df.shape[1] - 5): + x = list(itertools.chain(*pred_rts[i])) + y = actual_rts[i] + + ax = fig.add_subplot(gs[i]) + ax.scatter(x, y, s=2) + ax.plot( + np.linspace(0, max(x), 100), + coef_linear * np.linspace(0, max(x), 100) + intercept_linear, + linewidth=0.5, + color="red", + ) + ax.plot( + np.linspace(0, max(x), 100), + (coef_poly[1] * np.linspace(0, max(x), 100)) + + (coef_poly[2] * (np.linspace(0, max(x), 100) ** 2)) + + intercept_poly, + linewidth=0.5, + color="green", + ) + ax.set_title("File: " + str(i)) + ax.set_xlabel("predicted RTs") + ax.set_ylabel("actual RTs") + +fig_legend = "FileIndex FileName" +for i in range(rts_df.shape[1] - 5): + fig_legend = fig_legend + "\n" + str(i) + " " + rts_df.columns[i] + +fig.tight_layout(pad=0.5) +plt.text(0, -0.03 * rts_df.shape[1], fig_legend, transform=plt.gcf().transFigure) +plt.savefig(os.path.join(output_data_qc, "Actual_vs_Predicted_RTs.pdf"), bbox_inches="tight") + +qc_df = rts_df[[rts_df.columns[selected_column]]] +qc_df = qc_df.copy() +print("Linear Parameters :", coef_linear, intercept_linear) +print("Polynomial Parameters :", coef_poly, intercept_poly) + +qc_df.columns = ["RT Measured"] +atlas_df.index = qc_df.index +qc_df["RT Reference"] = atlas_df["rt_peak"] +qc_df["RT Linear Pred"] = qc_df["RT Reference"].apply(lambda rt: coef_linear * rt + intercept_linear) +qc_df["RT Polynomial Pred"] = qc_df["RT Reference"].apply( + lambda rt: (coef_poly[1] * rt) + (coef_poly[2] * (rt ** 2)) + intercept_poly +) +qc_df["RT Diff Linear"] = qc_df["RT Measured"] - qc_df["RT Linear Pred"] +qc_df["RT Diff Polynomial"] = qc_df["RT Measured"] - qc_df["RT Polynomial Pred"] +qc_df.to_csv(os.path.join(output_data_qc, "RT_Predicted_Model_Comparison.csv")) + +qc_df + +# CHOOSE YOUR MODEL HERE (linear / polynomial). 
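# A sketch of how one might make this choice numerically instead of hard-coding
# it below: the mean_absolute_error imported above as `mae` is otherwise unused,
# and the qc_df columns computed above already hold measured and predicted RTs.
# This is an editorial example, not part of the original notebook code; rows
# with no measured RT are dropped first since mae() cannot handle NaN values.
valid_qc = qc_df.dropna(subset=["RT Measured"])
print("linear MAE:", mae(valid_qc["RT Measured"], valid_qc["RT Linear Pred"]))
print("polynomial MAE:", mae(valid_qc["RT Measured"], valid_qc["RT Polynomial Pred"]))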
+# model = 'linear' +model = "polynomial" + +# Save model + +with open(os.path.join(output_data_qc, "rt_model.txt"), "w") as f: + if model == "linear": + f.write( + "coef = {}\nintercept = {}\nqc_actual_rts = {}\nqc_predicted_rts = {}".format( + coef_linear, intercept_linear, ", ".join([g.name for g in groups]), myAtlas.name + ) + ) + f.write("\n" + repr(rt_model_linear.set_params())) + + else: + f.write( + "coef = {}\nintercept = {}\nqc_actual_rts = {}\nqc_predicted_rts = {}".format( + coef_poly, intercept_poly, ", ".join([g.name for g in groups]), myAtlas.name + ) + ) + f.write("\n" + repr(rt_model_poly.set_params())) + + +pos_atlas_indices = [0, 1, 2, 3, 4] +neg_atlas_indices = [0, 1, 2, 3, 4] +free_text = "" # this will be appended to the end of the csv filename exported +save_to_db = False + +for ix in pos_atlas_indices: + atlases = metob.retrieve("Atlas", name=pos_templates[ix], username="vrsingan") + prd_atlas_name = pos_templates[ix].replace("TPL", "PRD") + if free_text != "": + prd_atlas_name = prd_atlas_name + "_" + free_text + prd_atlas_filename = prd_atlas_name + ".csv" + myAtlas = atlases[-1] + PRD_atlas_df = ma_data.make_atlas_df(myAtlas) + PRD_atlas_df["label"] = [cid.name for cid in myAtlas.compound_identifications] + if model == "linear": + PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( + lambda rt: coef_linear * rt + intercept_linear + ) + else: + PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( + lambda rt: (coef_poly[1] * rt) + (coef_poly[2] * (rt ** 2)) + intercept_poly + ) + PRD_atlas_df["rt_min"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5) + PRD_atlas_df["rt_max"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5) + + PRD_atlas_df.to_csv(os.path.join(output_data_qc, prd_atlas_filename), index=False) + + if save_to_db: + dp.make_atlas_from_spreadsheet( + PRD_atlas_df, + prd_atlas_name, + filetype="dataframe", + sheetname="", + polarity="positive", + store=True, + mz_tolerance=12, + ) + print(prd_atlas_name + " Created!") + +for ix in neg_atlas_indices: + atlases = metob.retrieve("Atlas", name=neg_templates[ix], username="vrsingan") + prd_atlas_name = neg_templates[ix].replace("TPL", "PRD") + if free_text != "": + prd_atlas_name = prd_atlas_name + "_" + free_text + prd_atlas_filename = prd_atlas_name + ".csv" + myAtlas = atlases[-1] + PRD_atlas_df = ma_data.make_atlas_df(myAtlas) + PRD_atlas_df["label"] = [cid.name for cid in myAtlas.compound_identifications] + if model == "linear": + PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( + lambda rt: coef_linear * rt + intercept_linear + ) + else: + PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( + lambda rt: (coef_poly[1] * rt) + (coef_poly[2] * (rt ** 2)) + intercept_poly + ) + PRD_atlas_df["rt_min"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5) + PRD_atlas_df["rt_max"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5) + + PRD_atlas_df.to_csv(os.path.join(output_data_qc, prd_atlas_filename), index=False) + + if save_to_db: + dp.make_atlas_from_spreadsheet( + PRD_atlas_df, + prd_atlas_name, + filetype="dataframe", + sheetname="", + polarity="negative", + store=True, + mz_tolerance=12, + ) + + print(prd_atlas_name + " Created!") From 607021550505415fbfa635b62c16534c20f33d86 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 25 Jun 2021 22:16:37 -0700 Subject: [PATCH 031/177] WIP - BROKEN, move rt predict code to package Try to fix DB issues, but unit tests still failing. 
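The object_helpers.py rewrite below replaces the short-lived `with dataset.connect(...)`
contexts with one cached connection plus explicit begin/commit/rollback calls. A minimal
sketch of that pattern, using the real `dataset` API but an illustrative SQLite URL and
table name rather than code from this repository:

    import dataset

    db = dataset.connect("sqlite:///example.db")  # one cached, reused connection
    db.begin()
    try:
        db["compounds"].insert({"name": "caffeine"})  # table name is illustrative
        db.commit()
    except Exception:
        db.rollback()  # leave the database unchanged when a write fails

Rolling back on failure is what keeps partially-written objects out of the workspace
database when a save raises midway.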
---
 docker/requirements.txt                    |   1 +
 metatlas/datastructures/object_helpers.py  | 127 +++--
 metatlas/tools/environment.py              |  85 +--
 metatlas/tools/notebook.py                 |  24 +-
 metatlas/tools/predict_rt.py               | 625 ++++++++++++----------
 noxfile.py                                 |   2 +
 tests/system/test_targeted.py              |   4 +-
 tests/unit/conftest.py                     |  31 +-
 tests/unit/test_environment.py             |  71 +++
 tests/unit/test_metatlas_dataset.py        |  20 +-
 10 files changed, 616 insertions(+), 374 deletions(-)
 create mode 100644 tests/unit/test_environment.py

diff --git a/docker/requirements.txt b/docker/requirements.txt
index 0922070a..88188325 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -16,6 +16,7 @@ pip==21.1.1
 pymysql==1.0.2
 pyyaml==5.4.1
 rdkit-pypi==2021.3.1.5
+scikit-learn==0.24.2
 scipy==1.6.3
 sqlalchemy==1.4.11
 tables==3.6.1
diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py
index ae0c816a..f1fa81ce 100644
--- a/metatlas/datastructures/object_helpers.py
+++ b/metatlas/datastructures/object_helpers.py
@@ -146,6 +146,7 @@ def __init__(self):
         self.path = f"sqlite:///{filename}"
         if os.path.exists(filename):
             os.chmod(filename, 0o775)
+        logging.warning('Using database at: %s', self.path)
 
         self.tablename_lut = dict()
         self.subclass_lut = dict()
@@ -163,16 +164,38 @@ def __init__(self):
         self.seen = dict()
         Workspace.instance = self
 
+    def get_connection(self):
+        """
+        Get a reusable connection to the database.
+        Each activity that queries the database needs to have this function preceding it.
+        """
+        try:
+            if self.db.engine.name == 'mysql':
+                self.db.query('show tables')
+            else:
+                self.db.query('SELECT name FROM sqlite_master WHERE type = "table"')
+        except Exception:
+            self.db = dataset.connect(self.path)
+
+    def close_connection(self):
+        self.db.close()
+        self.db = None
+
     def convert_to_double(self, table, entry):
         """Convert a table column to double type."""
-        with dataset.connect(self.path) as trans:
-            try:
-                trans.query('alter table `%s` modify `%s` double' % (table, entry))
-            except Exception as e:
-                print(e)
+        self.get_connection()
+        self.db.begin()
+        try:
+            self.db.query('alter table `%s` modify `%s` double' % (table, entry))
+            self.db.commit()
+        except Exception as e:
+            self.db.rollback()
+            print(e)
+            logging.error('Transaction rollback within convert_to_double()')
 
     def save_objects(self, objects, _override=False):
         """Save objects to the database"""
+        logging.warning('Entering Workspace.save_objects')
         if not isinstance(objects, (list, set)):
             objects = [objects]
         self._seen = dict()
@@ -181,44 +204,60 @@ def save_objects(self, objects, _override=False):
         self._inserts = defaultdict(list)
         for obj in objects:
             self._get_save_data(obj, _override)
-        with dataset.connect(self.path) as trans:
+        logging.warning('Workspace._inserts=%s', self._inserts)
+        self.get_connection()
+        self.db.begin()
+        try:
             for (table_name, updates) in self._link_updates.items():
-                if table_name not in trans:
+                if table_name not in self.db:
                     continue
                 for (uid, prev_uid) in updates:
-                    trans.query('update `%s` set source_id = "%s" where source_id = "%s"' % (table_name, prev_uid, uid))
+                    self.db.query('update `%s` set source_id = "%s" where source_id = "%s"' %
+                                  (table_name, prev_uid, uid))
             for (table_name, updates) in self._updates.items():
-                if '_' not in table_name and table_name not in trans:
-                    trans.create_table(table_name, primary_id='unique_id',
-                                       primary_type=trans.types.string(32))
+                if '_' not in table_name and table_name not in self.db:
+                    self.db.create_table(table_name, primary_id='unique_id',
primary_type=self.db.types.string(32)) if 'sqlite' not in self.path: self.fix_table(table_name) for (uid, prev_uid) in updates: - trans.query('update `%s` set unique_id = "%s" where unique_id = "%s"' % (table_name, prev_uid, uid)) + self.db.query('update `%s` set unique_id = "%s" where unique_id = "%s"' % + (table_name, prev_uid, uid)) for (table_name, inserts) in self._inserts.items(): - if '_' not in table_name and table_name not in trans: - trans.create_table(table_name, primary_id='unique_id', - primary_type=trans.types.string(32)) + if '_' not in table_name and table_name not in self.db: + self.db.create_table(table_name, primary_id='unique_id', + primary_type=self.db.types.string(32)) if 'sqlite' not in self.path: self.fix_table(table_name) - trans[table_name].insert_many(inserts) + self.db[table_name].insert_many(inserts) + logging.warning('inserting %s', inserts) + self.db.commit() + except Exception: + self.db.rollback() + logging.error('Transaction rollback within save_objects()') def create_link_tables(self, klass): """ Create a link table in the database of the given trait klass """ name = self.table_name[klass] - with dataset.connect(self.path) as trans: + self.get_connection() + self.db.begin() + try: for (tname, trait) in klass.class_traits().items(): if isinstance(trait, MetList): table_name = '_'.join([name, tname]) - if table_name not in trans: - trans.create_table(table_name) + if table_name not in self.db: + self.db.create_table(table_name) link = dict(source_id=uuid.uuid4().hex, head_id=uuid.uuid4().hex, target_id=uuid.uuid4().hex, target_table=uuid.uuid4().hex) - trans[table_name].insert(link) + self.db[table_name].insert(link) + self.db.commit() + except Exception: + self.db.rollback() + logging.error('Transaction rollback within create_link_tables()') def _get_save_data(self, obj, override=False): """Get the data that will be used to save an object to the database""" @@ -290,8 +329,11 @@ def retrieve(self, object_type, **kwargs): """Retrieve an object from the database.""" object_type = object_type.lower() klass = self.subclass_lut.get(object_type, None) - with dataset.connect(self.path) as trans: - if object_type not in trans: + items = [] + self.get_connection() + self.db.begin() + try: + if object_type not in self.db: if not klass: raise ValueError('Unknown object type: %s' % object_type) object_type = self.tablename_lut[klass] @@ -329,7 +371,7 @@ def retrieve(self, object_type, **kwargs): if not clauses: query = query.replace(' where ()', '') try: - items = [i for i in trans.query(query)] + items = list(self.db.query(query)) except Exception as e: if 'Unknown column' in str(e): keys = [k for k in klass.class_traits().keys() @@ -345,13 +387,13 @@ def retrieve(self, object_type, **kwargs): for (tname, trait) in items[0].traits().items(): if isinstance(trait, List): table_name = '_'.join([object_type, tname]) - if table_name not in trans: + if table_name not in self.db: for i in items: setattr(i, tname, []) continue querystr = 'select * from `%s` where source_id in ("' % table_name querystr += '" , "'.join(uids) - result = trans.query(querystr + '")') + result = self.db.query(querystr + '")') sublist = defaultdict(list) for r in result: stub = Stub(unique_id=r['target_id'], @@ -366,7 +408,10 @@ def retrieve(self, object_type, **kwargs): i.prev_uid = 'origin' i._changed = False items.sort(key=lambda x: x.last_modified) - + self.db.commit() + except Exception: + self.db.rollback() + logging.error('Transaction rollback within retrieve()') return items def remove(self, 
object_type, **kwargs):
@@ -404,32 +449,38 @@
             query += ')'
         if not clauses:
             query = query.replace(' where ()', '')
-        with dataset.connect(self.path) as trans:
+        self.get_connection()
+        self.db.begin()
+        try:
             # check for lists items that need removal
             if any([isinstance(i, MetList) for i in klass.class_traits().values()]):
                 uid_query = query.replace('delete ', 'select unique_id ')
-                uids = [i['unique_id'] for i in trans.query(uid_query)]
+                uids = [i['unique_id'] for i in self.db.query(uid_query)]
                 sub_query = 'delete from `%s` where source_id in ("%s")'
                 for (tname, trait) in klass.class_traits().items():
                     table_name = '%s_%s' % (object_type, tname)
-                    if not uids or table_name not in trans:
+                    if not uids or table_name not in self.db:
                         continue
                     if isinstance(trait, MetList):
                         table_query = sub_query % (table_name, '", "'.join(uids))
                         try:
-                            trans.query(table_query)
+                            self.db.query(table_query)
                         except Exception as e:
                             print(e)
         try:
-            trans.query(query)
+            self.db.query(query)
         except Exception as e:
             if 'Unknown column' in str(e):
                 keys = [k for k in klass.class_traits().keys()
                         if not k.startswith('_')]
                 raise ValueError('Invalid column name, valid columns: %s' % keys)
             else:
-                raise(e)
+                raise e
         print('Removed')
+            self.db.commit()
+        except Exception:
+            self.db.rollback()
+            logging.error('Transaction rollback within remove()')
 
     def remove_objects(self, objects, all_versions=True, **kwargs):
         """Remove a list of objects from the database."""
@@ -449,7 +500,9 @@ def remove_objects(self, objects, all_versions=True, **kwargs):
         ids = defaultdict(list)
         username = getpass.getuser()
         attr = 'head_id' if all_versions else 'unique_id'
-        with dataset.connect(self.path) as trans:
+        self.get_connection()
+        self.db.begin()
+        try:
             for obj in objects:
                 if not override and obj.username != username:
                     continue
@@ -461,14 +514,18 @@ def remove_objects(self, objects, all_versions=True, **kwargs):
                     subname = '%s_%s' % (name, tname)
                     ids[subname].append(getattr(obj, attr))
             for (table_name, uids) in ids.items():
-                if table_name not in trans:
+                if table_name not in self.db:
                     continue
                 query = 'delete from `%s` where %s in ("'
                 query = query % (table_name, attr)
                 query += '" , "'.join(uids)
                 query += '")'
-                trans.query(query)
+                self.db.query(query)
             print(('Removed %s object(s)' % len(objects)))
+            self.db.commit()
+        except Exception:
+            self.db.rollback()
+            logging.error('Transaction rollback within remove_objects()')
 
 
 def format_timestamp(tstamp):
diff --git a/metatlas/tools/environment.py b/metatlas/tools/environment.py
index aabcc48f..d6dd5fe7 100644
--- a/metatlas/tools/environment.py
+++ b/metatlas/tools/environment.py
@@ -6,12 +6,23 @@
 import os
 import re
 import shutil
-import sys
 
 from pathlib import Path
 
 logger = logging.getLogger(__name__)
 
+SOURCE_NOTEBOOK = {
+    "RT-Predict": "Workflow_Notebook_VS_Auto_RT_Predict_V2.ipynb",
+    "ISTDsEtc": "Targeted.ipynb",
+    "FinalEMA-HILIC": "Targeted.ipynb",
+}
+
+SOURCE_ATLAS_PREFIX = {
+    "RT-Predict": None,
+    "ISTDsEtc": "HILICz150_ANT20190824_PRD_IS_LabUnlab2_",
+    "FinalEMA-HILIC": "HILICz150_ANT20190824_TPL_EMA_Unlab_",
+}
+
 
 def install_kernel():
     """
@@ -24,9 +35,7 @@ def install_kernel():
     dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted"
     os.makedirs(dest_dir, exist_ok=True)
     shutil.copyfile(source, dest_dir / "kernel.json")
-    logger.info(
-        'Kernel installation complete. Reload Jupyter notebook page to see new kernel". '
-    )
+    logger.info('Kernel installation complete. Reload Jupyter notebook page to see new kernel". 
') def repo_dir(): @@ -43,32 +52,26 @@ def create_all_notebooks(output_type, base_output_dir, experiment_id, analysis_n experiment_id: '_' delimited experiment identifier analysis_number: increment to not overwrite existing analysis outputs """ - possible_outputs = ['RT-Predict', 'ISTDsEtc', 'FinalEMA-HILIC'] - outputs = possible_outputs[:(1+possible_outputs.index(output_type))] - source_notebook = {'RT-Predict': 'Workflow_Notebook_VS_Auto_RT_Predict_V2.ipynb', - 'ISTDsEtc': 'Targeted.ipynb', - 'FinalEMA-HILIC': 'Targeted.ipynb'} - source_atlas_prefix = { - 'RT-Predict': None, - 'ISTDsEtc': 'HILICz150_ANT20190824_PRD_IS_LabUnlab2_', - 'FinalEMA-HILIC': 'HILICz150_ANT20190824_TPL_EMA_Unlab_'} + possible_outputs = ["RT-Predict", "ISTDsEtc", "FinalEMA-HILIC"] + outputs = possible_outputs[: (1 + possible_outputs.index(output_type))] parameters = { - 'experiment': experiment_id, - 'metatlas_repo_path': repo_dir(), - 'output_directory': base_output_dir, - 'analysis_number': analysis_number} + "experiment": experiment_id, + "metatlas_repo_path": repo_dir(), + "output_directory": base_output_dir, + "analysis_number": analysis_number, + } analysis_id = f"{getpass.getuser()}{parameters['analysis_number']}" - tokens = parameters['experiment'].split('_') + tokens = parameters["experiment"].split("_") output_dir = os.path.join(base_output_dir, experiment_id) os.makedirs(output_dir, exist_ok=True) for output in outputs: - parameters['output_type'] = output - for polarity in (['positive', 'negative'] if output != 'RT-Predict' else [None]): - source = os.path.join(repo_dir(), 'notebooks', 'reference', source_notebook[output]) + parameters["output_type"] = output + for polarity in ["positive", "negative"] if output != "RT-Predict" else [None]: + source = os.path.join(repo_dir(), "notebooks", "reference", SOURCE_NOTEBOOK[output]) if polarity is not None: - parameters['polarity'] = polarity + parameters["polarity"] = polarity pol = polarity[:3].upper() - parameters['source_atlas'] = f"{source_atlas_prefix[output]}_{pol}_{tokens[3]}_{analysis_id}" + parameters["source_atlas"] = f"{SOURCE_ATLAS_PREFIX[output]}_{pol}_{tokens[3]}_{analysis_id}" generate_notebook(source, output_dir, parameters) @@ -81,23 +84,38 @@ def generate_notebook(source, output_dir, parameters): parameters: dict of parameters to update in the notebook parameters must have atleast the following keys: analysis_number, experiment, output_type """ - if 'polarity' in parameters: - pol = parameters['polarity'][:3].upper() + if "polarity" in parameters: + pol = parameters["polarity"][:3].upper() suffix = f"{parameters['output_type']}_{pol}" else: - suffix = 'RT-Predict' - tokens = parameters['experiment'].split('_') - dest = os.path.join(output_dir, '_'.join(tokens[3:5]+[suffix])+'.ipynb') + suffix = "RT-Predict" + tokens = parameters["experiment"].split("_") + dest = os.path.join(output_dir, "_".join(tokens[3:5] + [suffix]) + ".ipynb") create_notebook_with_parameters(source, dest, parameters) def create_notebook_with_parameters(source, dest, parameters): + """ + Copies source notebook to dest and updates parameters + inputs: + source: path of input notebook + dest: path of destination notebook + parameters: dict with name of parameter in key and new value in value + """ with open(source) as source_fh: data = json.load(source_fh) - eq_pat = re.compile('^[^#]') - for line in data['cells'][1]['source']: - if '=' in line: - print(line) + eq_pat = re.compile(r"^([^#= ]+)\s*=.+$") + param_source = data["cells"][1]["source"] + for i, line in 
enumerate(param_source): + re_match = eq_pat.match(line) + if re_match: + param_name = re_match.group(1) + if param_name in parameters: + new_value = parameters[param_name] + out_value = f"'{new_value}'" if isinstance(new_value, str) else new_value + param_source[i] = f"{param_name} = {out_value}\n" + with open(dest, "w") as out_fh: + json.dump(data, out_fh) def validate_data_dir(base_data_dir, experiment_id): @@ -109,6 +127,3 @@ def validate_data_dir(base_data_dir, experiment_id): except FileNotFoundError as err: logger.exception(err) raise err - -source = '/global/homes/w/wjholtz/metatlas-dev/notebooks/reference/Targeted.ipynb' -create_notebook_with_parameters(source, 'foo', {}) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index d9a8c21d..8d44da7f 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -2,14 +2,14 @@ import logging import os -import shutil import sys -from pathlib import Path import pandas as pd from IPython.core.display import display, HTML from metatlas.tools.logging import activate_logging from metatlas.tools.logging import activate_module_logging +from metatlas.tools.environment import install_kernel + logger = logging.getLogger(__name__) @@ -48,26 +48,6 @@ def validate_kernel(): raise ModuleNotFoundError from module_error -def install_kernel(): - """ - Copies kernel.json from repo to active location under home directory. - Only for use on NERC! - """ - logger.info('Installing kernel.json for "Metatlas Targeted".') - repo_path = Path(__file__).resolve().parent.parent.parent - source = repo_path / "notebooks" / "kernels" / "metatlas-targeted.kernel.json" - dest_dir = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" - os.makedirs(dest_dir, exist_ok=True) - shutil.copyfile(source, dest_dir / "kernel.json") - logger.info( - ( - 'Reload the page and then change kernel to "Metatlas Targeted". ' - "On the menu bar at the top of this page select 'Kernel'>'Change Kernel..' " - "then find 'Metatlas Targeted' in the drop down list." 
-        )
-    )
-
-
 def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100):
     """Set pandas display options"""
     pd.set_option("display.max_rows", max_rows)
diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index 401ab6e0..44af1fa6 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -1,27 +1,128 @@
 """Generate Retention Time Correction Model"""
 
+# pylint: disable=too-many-arguments
+import itertools
+import math
 import os
-import sys
-import time
-import multiprocessing
+
 import matplotlib.pyplot as plt
+import matplotlib.ticker as mticker
 import numpy as np
 import pandas as pd
 
+from matplotlib import gridspec
+from sklearn.linear_model import LinearRegression, RANSACRegressor
+from sklearn.preprocessing import PolynomialFeatures
+
 from metatlas.datastructures import metatlas_dataset as mads
 from metatlas.datastructures import metatlas_objects as metob
 from metatlas.io import metatlas_get_data_helper_fun as ma_data
 from metatlas.plots import dill2plots as dp
 
 
-def generate_rt_correction_models(ids, groups_controlled_vocab, exclude_files, include_groups):
+TEMPLATES = {
+    "positive": [
+        "HILICz150_ANT20190824_TPL_EMA_Unlab_POS",
+        "HILICz150_ANT20190824_TPL_QCv3_Unlab_POS",
+        "HILICz150_ANT20190824_TPL_ISv5_Unlab_POS",
+        "HILICz150_ANT20190824_TPL_ISv5_13C15N_POS",
+        "HILICz150_ANT20190824_TPL_IS_LabUnlab2_POS",
+    ],
+    "negative": [
+        "HILICz150_ANT20190824_TPL_EMA_Unlab_NEG",
+        "HILICz150_ANT20190824_TPL_QCv3_Unlab_NEG",
+        "HILICz150_ANT20190824_TPL_ISv5_Unlab_NEG",
+        "HILICz150_ANT20190824_TPL_ISv5_13C15N_NEG",
+        "HILICz150_ANT20190824_TPL_IS_LabUnlab2_NEG",
+    ],
+}
+
+
+class Model:
+    """Encapsulate both linear and polynomial models in a consistent interface"""
+
+    def __init__(self, sk_model, intercept, coefficents):
+        """
+        inputs:
+            sk_model: scikit-learn model object
+            intercept: y-intercept value
+            coefficents: a list of coefficents, with x^n coefficent at index n-1
+        """
+        self.sk_model = sk_model
+        self.intercept = intercept
+        if isinstance(coefficents, list):
+            self.coefficents = coefficents
+        else:
+            self.coefficents = [coefficents]
+
+    def __repr__(self):
+        """Text description of the model function"""
+        if self.order == 1:
+            return f"Linear model with intercept={self.intercept:.3f} and slope={self.coefficents[0]:.5f}"
+        coef_str = ", ".join([f"{c:.5f}" for c in self.coefficents])
+        return f"Polynomial model with intercept={self.intercept:.3f} and coefficents=[{coef_str}]"
+
+    @property
+    def order(self):
+        """Polynomial order of the model"""
+        return len(self.coefficents)
+
+    @property
+    def name(self):
+        """Type of model as string"""
+        return "linear" if self.order == 1 else "polynomial"
+
+    def predict(self, x_values):
+        """Returns y values for input x"""
+        return self.sk_model.predict(x_values)
+
+
+def generate_rt_correction_models(
+    ids, groups_controlled_vocab, exclude_files, include_groups, cpus, save_to_db=True
+):
+    """
+    Generate the RT correction models and associated atlases with adjusted RT values
+    inputs:
+        ids: an AnalysisIds object matching the one used in the main notebook
+        groups_controlled_vocab: list of strings that will group together when creating groups
+                                 application of groups_controlled_vocab is case insensitive
+        exclude_files: list of strings that will exclude files if they are substrings of the filename
+        include_groups: group will only be used in correction if their name has a substring match
+                        to this list of strings
+        cpus: max number of cpus to use
+        save_to_db: If True, save the new 
atlases to the database
+    """
+    # pylint: disable=too-many-locals
     metatlas_dataset = mads.MetatlasDataset(ids, groups_controlled_vocab, exclude_files, save_metadata=False)
+    qc_dir = os.path.join(ids.output_dir, "data_QC")
     groups = get_groups(metatlas_dataset, include_groups)
     files_df = get_files_df(groups)
     qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids)
+    metatlas_dataset = load_runs(files_df, qc_atlas_df, qc_atlas, cpus)
+    save_measured_rts(metatlas_dataset, os.path.join(qc_dir, "QC_Measured_RTs.csv"))
+    rts_df = get_rts(metatlas_dataset)
+    plot_compound_atlas_rts(len(metatlas_dataset), rts_df, os.path.join(qc_dir, "Compound_Atlas_RTs.pdf"))
+    selected_column = 9  # need to deal with this parameter, index from rts_df.columns
+    actual_df, pred_df = actual_and_predicted_df(selected_column, rts_df, qc_atlas_df)
+    linear, poly = generate_models(actual_df, pred_df)
+    actual_rts, pred_rts = actual_and_predicted_rts(rts_df, qc_atlas_df, actual_df, pred_df)
+    actual_vs_pred_file_name = os.path.join(qc_dir, "Actual_vs_Predicted_RTs.pdf")
+    plot_actual_vs_pred_rts(pred_rts, actual_rts, rts_df, actual_vs_pred_file_name, linear, poly)
+    rt_comparison_file_name = os.path.join(qc_dir, "RT_Predicted_Model_Comparison.csv")
+    save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, rt_comparison_file_name)
+    models_file_name = os.path.join(qc_dir, "rt_model.txt")
+    write_models(models_file_name, linear, poly, groups, qc_atlas)
+    create_adjusted_atlases(linear, poly, qc_dir, save_to_db=save_to_db)
 
 
 def get_groups(metatlas_dataset, include_groups):
+    """
+    Create all experiment groups if they don't already exist and return the subset matching include_groups
+    inputs:
+        metatlas_dataset: instance of MetatlasDataset
+        include_groups: group will only be used in correction if their name has a substring match
+                        to this list of strings
+    """
     metatlas_dataset.store_groups(exist_ok=True)
     ids = metatlas_dataset.ids
     groups = dp.select_groups_for_analysis(
@@ -35,286 +136,270 @@ def get_groups(metatlas_dataset, include_groups):
 
 
 def get_files_df(groups):
+    """Pandas DataFrame with one row per file plus columns for acquisition time and group name"""
     files_df = pd.DataFrame(columns=["file", "time", "group"])
     for group in groups:
         for run in group.items:
-            time = run.accquistion_time if hasattr(run, "acquisition_time") else 0
+            try:
+                time = run.accquistion_time
+            except AttributeError:
+                time = 0
             files_df = files_df.append({"file": run, "time": time, "group": group}, ignore_index=True)
     return files_df.sort_values(by=["time"])
 
 
 def get_qc_atlas(ids):
+    """Retrieves template QC atlas and returns tuple (atlas, atlas_df)"""
     qc_atlas_name = f"HILICz150_ANT20190824_TPL_QCv3_Unlab_{ids.short_polarity}"
-    atlas = metob.retrieve('Atlas', name=qc_atlas_name, username='vrsingan')[0]
+    atlas = metob.retrieve("Atlas", name=qc_atlas_name, username="vrsingan")[0]
     atlas_df = ma_data.make_atlas_df(atlas)
-    atlas_df['label'] = [cid.name for cid in atlas.compound_identifications]
+    atlas_df["label"] = [cid.name for cid in atlas.compound_identifications]
     return atlas, atlas_df
 
 
-def load_runs(metatlas_dataset, files_df, qc_atlas_df, qc_atlas):
-    files = []
-    for file_data in files_df.iterrows():
-        files.append((file_data[1].file, file_data[1].group, qc_atlas_df, qc_atlas))
-    if metatlas_dataset.max_cpus > 1 and len(files) > 1:
-        with multiprocessing.Pool(processes=min(metatlas_dataset.max_cpus, len(files))) as pool:
-            data = pool.map(ma_data.get_data_for_atlas_df_and_file, files)
-    else:  # skip multiprocessing as this makes 
for easier debugging - data = [ma_data.get_data_for_atlas_df_and_file(i) for i in files] - return data - - -rts_df = dp.make_output_dataframe( - input_dataset=metatlas_dataset, - fieldname="rt_peak", - use_labels=True, - output_loc=output_data_qc, - summarize=True, -) -rts_df.to_csv(os.path.join(output_data_qc, "QC_Measured_RTs.csv")) - -import itertools -import math -from __future__ import division -from matplotlib import gridspec -import matplotlib.ticker as mticker - -rts_df["atlas RT peak"] = [ - compound["identification"].rt_references[0].rt_peak for compound in metatlas_dataset[0] -] -# number of columns in rts_df that are not values from a specific input file -num_not_files = len(rts_df.columns) - len(metatlas_dataset) -rts_df_plot = ( - rts_df.sort_values(by="standard deviation", ascending=False, na_position="last") - .drop(["#NaNs"], axis=1) - .dropna(axis=0, how="all", subset=rts_df.columns[:-num_not_files]) -) - -fontsize = 2 -pad = 0.1 -cols = 8 -rows = int(math.ceil((rts_df.shape[0] + 1) / 8)) - -fig = plt.figure() -gs = gridspec.GridSpec(rows, cols, figure=fig, wspace=0.2, hspace=0.4) -for i, (index, row) in enumerate(rts_df_plot.iterrows()): - ax = fig.add_subplot(gs[i]) - ax.tick_params(direction="in", length=1, pad=pad, width=0.1, labelsize=fontsize) - ax.scatter(range(rts_df_plot.shape[1] - num_not_files), row[:-num_not_files], s=0.2) - ticks_loc = np.arange(0, len(rts_df_plot.columns) - num_not_files, 1.0) - ax.axhline(y=row["atlas RT peak"], color="r", linestyle="-", linewidth=0.2) - ax.set_xlim(-0.5, len(rts_df_plot.columns) - num_not_files + 0.5) - ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - range_columns = list(rts_df_plot.columns[:-num_not_files]) + ["atlas RT peak"] - ax.set_ylim(np.nanmin(row.loc[range_columns]) - 0.12, np.nanmax(row.loc[range_columns]) + 0.12) - [s.set_linewidth(0.1) for s in ax.spines.values()] - # truncate name so it fits above a single subplot - ax.set_title(row.name[:33], pad=pad, fontsize=fontsize) - ax.set_xlabel("Files", labelpad=pad, fontsize=fontsize) - ax.set_ylabel("Actual RTs", labelpad=pad, fontsize=fontsize) - -plt.savefig(os.path.join(output_data_qc, "Compound_Atlas_RTs.pdf"), bbox_inches="tight") - -for i, a in enumerate(rts_df.columns): - print(i, a) - -selected_column = 9 - -from sklearn.linear_model import LinearRegression, RANSACRegressor -from sklearn.preprocessing import PolynomialFeatures -from sklearn.metrics import mean_absolute_error as mae - -actual_rts, pred_rts, polyfit_rts = [], [], [] - -current_actual_df = rts_df.loc[:, rts_df.columns[selected_column]] -bad_qc_compounds = np.where(~np.isnan(current_actual_df)) -current_actual_df = current_actual_df.iloc[bad_qc_compounds] -current_pred_df = atlas_df.iloc[bad_qc_compounds][["rt_peak"]] -actual_rts.append(current_actual_df.values.tolist()) -pred_rts.append(current_pred_df.values.tolist()) - -ransac = RANSACRegressor(random_state=42) -rt_model_linear = ransac.fit(current_pred_df, current_actual_df) -coef_linear = rt_model_linear.estimator_.coef_[0] -intercept_linear = rt_model_linear.estimator_.intercept_ - -poly_reg = PolynomialFeatures(degree=2) -X_poly = poly_reg.fit_transform(current_pred_df) -rt_model_poly = LinearRegression().fit(X_poly, current_actual_df) -coef_poly = rt_model_poly.coef_ -intercept_poly = rt_model_poly.intercept_ - -for i in range(rts_df.shape[1] - 5): - current_actual_df = rts_df.loc[:, rts_df.columns[i]] - bad_qc_compounds = np.where(~np.isnan(current_actual_df)) - current_actual_df = current_actual_df.iloc[bad_qc_compounds] 
-    current_pred_df = atlas_df.iloc[bad_qc_compounds][["rt_peak"]]
-    actual_rts.append(current_actual_df.values.tolist())
-    pred_rts.append(current_pred_df.values.tolist())
-
-# User can change to use particular qc file
-import itertools
-import math
-from __future__ import division
-from matplotlib import gridspec
-
-x = list(itertools.chain(*pred_rts))
-y = list(itertools.chain(*actual_rts))
-
-rows = int(math.ceil((rts_df.shape[1] + 1) / 5))
-cols = 5
-fig = plt.figure(constrained_layout=False)
-
-gs = gridspec.GridSpec(rows, cols, figure=fig)
-plt.rc("font", size=6)
-plt.rc("axes", labelsize=6)
-plt.rc("xtick", labelsize=3)
-plt.rc("ytick", labelsize=3)
-
+def load_runs(files_df, qc_atlas_df, qc_atlas, cpus):
+    """
+    Loads MSMS data files
+    inputs:
+        files_df: files to load
+        qc_atlas_df: dataframe form of the QC atlas
+        qc_atlas: atlas of QC compounds
+        cpus: number of cpus to use
+    """
+    files = [(i[1].file, i[1].group, qc_atlas_df, qc_atlas) for i in files_df.iterrows()]
+    return mads.parallel_process(ma_data.get_data_for_atlas_df_and_file, files, cpus, unit="sample")
+
+
+def save_measured_rts(metatlas_dataset, filename):
+    """Save RT values in csv format file"""
+    rts_df = dp.make_output_dataframe(
+        input_dataset=metatlas_dataset,
+        fieldname="rt_peak",
+        use_labels=True,
+        summarize=True,
+    )
+    rts_df.to_csv(filename)
 
-for i in range(rts_df.shape[1] - 5):
-    x = list(itertools.chain(*pred_rts[i]))
-    y = actual_rts[i]
 
-    ax = fig.add_subplot(gs[i])
-    ax.scatter(x, y, s=2)
-    ax.plot(
-        np.linspace(0, max(x), 100),
-        coef_linear * np.linspace(0, max(x), 100) + intercept_linear,
-        linewidth=0.5,
-        color="red",
+
+def get_rts(metatlas_dataset):
+    """Returns RT values in DataFrame format"""
+    rts_df = dp.make_output_dataframe(
+        input_dataset=metatlas_dataset,
+        fieldname="rt_peak",
+        use_labels=True,
+        summarize=True,
+    )
+    rts_df["atlas RT peak"] = [
+        compound["identification"].rt_references[0].rt_peak for compound in metatlas_dataset[0]
+    ]
+    return rts_df
+
+
+def plot_compound_atlas_rts(num_files, rts_df, file_name):
+    """
+    Writes plot of RT peak vs file for each compound
+    inputs:
+        num_files: number of files in data set, ie len(metatlas_dataset)
+        rts_df: Dataframe with RT values
+        file_name: where to save plot
+    """
+    # pylint: disable=too-many-locals
+    # number of columns in rts_df that are not values from a specific input file
+    num_not_files = len(rts_df.columns) - num_files
+    rts_df_plot = (
+        rts_df.sort_values(by="standard deviation", ascending=False, na_position="last")
+        .drop(["#NaNs"], axis=1)
+        .dropna(axis=0, how="all", subset=rts_df.columns[:-num_not_files])
     )
-    ax.plot(
-        np.linspace(0, max(x), 100),
-        (coef_poly[1] * np.linspace(0, max(x), 100))
-        + (coef_poly[2] * (np.linspace(0, max(x), 100) ** 2))
-        + intercept_poly,
-        linewidth=0.5,
-        color="green",
+    fontsize = 2
+    pad = 0.1
+    cols = 8
+    rows = int(math.ceil((rts_df.shape[0] + 1) / 8))
+    fig = plt.figure()
+    grid = gridspec.GridSpec(rows, cols, figure=fig, wspace=0.2, hspace=0.4)
+    for i, (_, row) in enumerate(rts_df_plot.iterrows()):
+        a_x = fig.add_subplot(grid[i])
+        a_x.tick_params(direction="in", length=1, pad=pad, width=0.1, labelsize=fontsize)
+        a_x.scatter(range(rts_df_plot.shape[1] - num_not_files), row[:-num_not_files], s=0.2)
+        ticks_loc = np.arange(0, len(rts_df_plot.columns) - num_not_files, 1.0)
+        a_x.axhline(y=row["atlas RT peak"], color="r", linestyle="-", linewidth=0.2)
+        a_x.set_xlim(-0.5, len(rts_df_plot.columns) - num_not_files + 0.5)
+        a_x.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc))
+        range_columns = list(rts_df_plot.columns[:-num_not_files]) + ["atlas RT peak"]
+        a_x.set_ylim(np.nanmin(row.loc[range_columns]) - 0.12, np.nanmax(row.loc[range_columns]) + 0.12)
+        _ = [s.set_linewidth(0.1) for s in a_x.spines.values()]
+        # truncate name so it fits above a single subplot
+        a_x.set_title(row.name[:33], pad=pad, fontsize=fontsize)
+        a_x.set_xlabel("Files", labelpad=pad, fontsize=fontsize)
+        a_x.set_ylabel("Actual RTs", labelpad=pad, fontsize=fontsize)
+    plt.savefig(file_name, bbox_inches="tight")
+
+
+def generate_models(actual_df, pred_df):
+    """
+    inputs:
+        actual_df: dataframe with experimental RTs
+        pred_df: dataframe with predicted RTs
+    returns tuple containing two Model classes of order 1 and 2
+    """
+    ransac = RANSACRegressor(random_state=42)
+    rt_model_linear = ransac.fit(pred_df, actual_df)
+    linear = Model(
+        rt_model_linear, rt_model_linear.estimator_.intercept_, rt_model_linear.estimator_.coef_[0]
+    )
-# model = 'linear' -model = "polynomial" - -# Save model - -with open(os.path.join(output_data_qc, "rt_model.txt"), "w") as f: - if model == "linear": - f.write( - "coef = {}\nintercept = {}\nqc_actual_rts = {}\nqc_predicted_rts = {}".format( - coef_linear, intercept_linear, ", ".join([g.name for g in groups]), myAtlas.name - ) - ) - f.write("\n" + repr(rt_model_linear.set_params())) - - else: - f.write( - "coef = {}\nintercept = {}\nqc_actual_rts = {}\nqc_predicted_rts = {}".format( - coef_poly, intercept_poly, ", ".join([g.name for g in groups]), myAtlas.name - ) - ) - f.write("\n" + repr(rt_model_poly.set_params())) - - -pos_atlas_indices = [0, 1, 2, 3, 4] -neg_atlas_indices = [0, 1, 2, 3, 4] -free_text = "" # this will be appended to the end of the csv filename exported -save_to_db = False - -for ix in pos_atlas_indices: - atlases = metob.retrieve("Atlas", name=pos_templates[ix], username="vrsingan") - prd_atlas_name = pos_templates[ix].replace("TPL", "PRD") - if free_text != "": - prd_atlas_name = prd_atlas_name + "_" + free_text - prd_atlas_filename = prd_atlas_name + ".csv" - myAtlas = atlases[-1] - PRD_atlas_df = ma_data.make_atlas_df(myAtlas) - PRD_atlas_df["label"] = [cid.name for cid in myAtlas.compound_identifications] - if model == "linear": - PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( - lambda rt: coef_linear * rt + intercept_linear - ) - else: - PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( - lambda rt: (coef_poly[1] * rt) + (coef_poly[2] * (rt ** 2)) + intercept_poly - ) - PRD_atlas_df["rt_min"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5) - PRD_atlas_df["rt_max"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5) - - PRD_atlas_df.to_csv(os.path.join(output_data_qc, prd_atlas_filename), index=False) - - if save_to_db: - dp.make_atlas_from_spreadsheet( - PRD_atlas_df, - prd_atlas_name, - filetype="dataframe", - sheetname="", - polarity="positive", - store=True, - mz_tolerance=12, - ) - print(prd_atlas_name + " Created!") - -for ix in neg_atlas_indices: - atlases = metob.retrieve("Atlas", name=neg_templates[ix], username="vrsingan") - prd_atlas_name = neg_templates[ix].replace("TPL", "PRD") - if free_text != "": - prd_atlas_name = prd_atlas_name + "_" + free_text - prd_atlas_filename = prd_atlas_name + ".csv" - myAtlas = atlases[-1] - PRD_atlas_df = ma_data.make_atlas_df(myAtlas) - PRD_atlas_df["label"] = [cid.name for cid in myAtlas.compound_identifications] - if model == "linear": - PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( - lambda rt: coef_linear * rt + intercept_linear - ) - else: - PRD_atlas_df["rt_peak"] = PRD_atlas_df["rt_peak"].apply( - lambda rt: (coef_poly[1] * rt) + (coef_poly[2] * (rt ** 2)) + intercept_poly - ) - PRD_atlas_df["rt_min"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5) - PRD_atlas_df["rt_max"] = PRD_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5) - - PRD_atlas_df.to_csv(os.path.join(output_data_qc, prd_atlas_filename), index=False) - - if save_to_db: - dp.make_atlas_from_spreadsheet( - PRD_atlas_df, - prd_atlas_name, - filetype="dataframe", - sheetname="", - polarity="negative", - store=True, - mz_tolerance=12, - ) - - print(prd_atlas_name + " Created!") + + poly_reg = PolynomialFeatures(degree=2) + x_poly = poly_reg.fit_transform(pred_df) + rt_model_poly = LinearRegression().fit(x_poly, actual_df) + poly = Model(rt_model_poly, rt_model_poly.intercept_, rt_model_poly.coef_) + return linear, poly + + +def actual_and_predicted_df(selected_column, rts_df, atlas_df): + """ + inputs: + 
selected_column: column number in rts_df to use for actual values + rts_df: dataframe of RT values + atlas_df: QC atlas in dataframe format + return a tuple of (actual_df, pred_df) + """ + actual_df = rts_df.loc[:, rts_df.columns[selected_column]] + bad_qc_compounds = np.where(~np.isnan(actual_df)) + actual_df = actual_df.iloc[bad_qc_compounds] + pred_df = atlas_df.iloc[bad_qc_compounds][["rt_peak"]] + return actual_df, pred_df + + +def actual_and_predicted_rts(rts_df, atlas_df, actual_df, pred_df): + """ + inputs: + rts_df: dataframe of RT values + atlas_df: QC atlas in dataframe format + actual_df: dataframe of actual RT values + pred_df: dataframe of predicted RT values + return a tuple of lists of lists: (actual_rts, pred_rts) + """ + actual_rts = [actual_df.values.tolist()] + pred_rts = [pred_df.values.tolist()] + for i in range(rts_df.shape[1] - 5): + current_actual_df = rts_df.loc[:, rts_df.columns[i]] + bad_qc_compounds = np.where(~np.isnan(current_actual_df)) + current_actual_df = current_actual_df.iloc[bad_qc_compounds] + current_pred_df = atlas_df.iloc[bad_qc_compounds][["rt_peak"]] + actual_rts.append(current_actual_df.values.tolist()) + pred_rts.append(current_pred_df.values.tolist()) + return actual_rts, pred_rts + + +def plot_actual_vs_pred_rts(pred_rts, actual_rts, rts_df, file_name, linear, poly): + """Write scatter plot showing linear vs polynomial fit""" + # pylint: disable=too-many-locals + rows = int(math.ceil((rts_df.shape[1] + 1) / 5)) + cols = 5 + fig = plt.figure(constrained_layout=False) + grid = gridspec.GridSpec(rows, cols, figure=fig) + plt.rc("font", size=6) + plt.rc("axes", labelsize=6) + plt.rc("xtick", labelsize=3) + plt.rc("ytick", labelsize=3) + for i in range(rts_df.shape[1] - 5): + sub = fig.add_subplot(grid[i]) + x_values = list(itertools.chain(*pred_rts[i])) + y_values = actual_rts[i] + sub.scatter(x_values, y_values, s=2) + spaced_x = np.linspace(0, max(x_values), 100) + sub.plot(spaced_x, linear.predict(spaced_x), linewidth=0.5, color="red") + sub.plot(spaced_x, poly.predict(spaced_x), linewidth=0.5, color="green") + sub.set_title("File: " + str(i)) + sub.set_xlabel("predicted RTs") + sub.set_ylabel("actual RTs") + fig_legend = "FileIndex FileName" + for i in range(rts_df.shape[1] - 5): + fig_legend = fig_legend + "\n" + str(i) + " " + rts_df.columns[i] + fig.tight_layout(pad=0.5) + plt.text(0, -0.03 * rts_df.shape[1], fig_legend, transform=plt.gcf().transFigure) + plt.savefig(file_name, bbox_inches="tight") + + +def save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, file_name): + """ + Save csv format file with per-compound comparison of linear vs polynomial models + inputs: + selected_column: column number in rts_df to use for actual values + qc_atlas_df: QC atlas in dataframe format + rts_df: dataframe with RT values + linear: instance of class Model with first order model + poly: instance of class Model with second order model + file_name: where to save the csv file + """ + qc_df = rts_df[[rts_df.columns[selected_column]]].copy() + qc_df.columns = ["RT Measured"] + qc_df["RT Reference"] = qc_atlas_df["rt_peak"] + qc_df["RT Linear Pred"] = linear.predict(qc_df["RT Reference"]) + qc_df["RT Polynomial Pred"] = poly.predict(qc_df["RT Reference"]) + qc_df["RT Diff Linear"] = qc_df["RT Measured"] - qc_df["RT Linear Pred"] + qc_df["RT Diff Polynomial"] = qc_df["RT Measured"] - qc_df["RT Polynomial Pred"] + qc_df.to_csv(file_name) + + +def write_models(file_name, linear_model, poly_model, groups, atlas): + """ + inputs: + file_name: 
text file to save model information + linear_model: instance of class Model with first order model + poly_model: instance of class Model with second order model + groups: list of groups used in model generation + atlas: QC atlas + """ + with open(file_name, "w") as out_fh: + for model in [linear_model, poly_model]: + out_fh.write(f"{model.sk_model.set_params()}\n") + out_fh.write(f"{model}\n") + group_names = ", ".join([g.name for g in groups]) + out_fh.write(f"groups = {group_names}\n") + out_fh.write(f"atlas = {atlas.name}\n\n") + + +def create_adjusted_atlases(linear, poly, qc_dir, atlas_indices=None, free_text="", save_to_db=True): + """ + inputs: + linear: instance of class Model with first order model + poly: instance of class Model with second order model + qc_dir: directory to write csv files to + atlas_indices: list of integers for which adjusted atlases to create + 0: EMA_Unlab + 1: QCv3_Unlab + 2: ISv5_Unlab + 3: ISv5_13C15N + 4: IS_LabUnlab2 + free_text: arbitrary string to append to atlas name + save_to_db: if True, save the atlases to the database + """ + if atlas_indices is None: + atlas_indices = [0, 4] + for polarity in ["positive", "negative"]: + for idx in atlas_indices: + for model in [linear, poly]: + template_name = TEMPLATES[polarity][idx] + atlas = metob.retrieve("Atlas", name=template_name, username="vrsingan")[-1] + prd_atlas_name = template_name.replace("TPL", "PRD") + f"_{model.name}" + if free_text != "": + prd_atlas_name = prd_atlas_name + "_" + free_text + prd_atlas_filename = prd_atlas_name + ".csv" + prd_atlas_df = ma_data.make_atlas_df(atlas) + prd_atlas_df["label"] = [cid.name for cid in atlas.compound_identifications] + prd_atlas_df["rt_peak"] = model.predict(prd_atlas_df["rt_peak"]) + prd_atlas_df["rt_min"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5) + prd_atlas_df["rt_max"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5) + prd_atlas_df.to_csv(os.path.join(qc_dir, prd_atlas_filename), index=False) + if save_to_db: + dp.make_atlas_from_spreadsheet( + prd_atlas_df, + prd_atlas_name, + filetype="dataframe", + sheetname="", + polarity=polarity, + store=True, + mz_tolerance=12, + ) + print(prd_atlas_name + " Created!") diff --git a/noxfile.py b/noxfile.py index 15b46ec0..537406a9 100644 --- a/noxfile.py +++ b/noxfile.py @@ -23,8 +23,10 @@ "metatlas/io/targeted_output.py", "metatlas/io/write_utils.py", "metatlas/datastructures/metatlas_dataset.py", + "metatlas/tools/environment.py", "metatlas/tools/logging.py", "metatlas/tools/notebook.py", + "metatlas/tools/predict_rt.py", "tests", ] diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index c72e8c57..76076708 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -62,10 +62,12 @@ def test_targeted_by_line01_with_remove(tmp_path): ], check=True, ) + files = subprocess.check_output(f"find {str(tmp_path)} -type f", shell=True, text=True).strip() + print(files) num_files_created = int( subprocess.check_output(f"find {str(tmp_path)} -type f | wc -l", shell=True, text=True).strip() ) - assert num_files_created == 38 + assert num_files_created == 39 with open(out_file, "r") as handle: for num, line in enumerate(handle.readlines()): clean_line = line.rstrip("\n") diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 91caf508..fedaa63f 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -5,9 +5,12 @@ # pylint: disable=missing-function-docstring,unused-argument,line-too-long,too-many-lines,too-many-arguments import 
getpass +import logging import os import sqlite3 +from importlib import reload + import pytest import numpy as np import pandas as pd @@ -16,6 +19,10 @@ from metatlas.datastructures import metatlas_dataset as mads from metatlas.datastructures import metatlas_objects as metob +from metatlas.datastructures import object_helpers as metoh + + +logger = logging.getLogger(__name__) @pytest.fixture(name="username", scope="session") @@ -24,57 +31,67 @@ def fixture_username(): @pytest.fixture(name="analysis_ids") -def fixture_analysis_ids(tmp_path, sqlite_with_atlas, username): +def fixture_analysis_ids(sqlite_with_atlas, username): return mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", 0, - str(tmp_path), + str(os.getcwd()), ) @pytest.fixture(name="analysis_ids_with_2_cids") -def fixture_analysis_ids_with_2_cids(tmp_path, sqlite_with_atlas_with_2_cids, username): +def fixture_analysis_ids_with_2_cids(sqlite_with_atlas_with_2_cids, username): return mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}1", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", "FinalEMA-HILIC", "positive", 0, - str(tmp_path), + str(os.getcwd()), ) @pytest.fixture(name="sqlite") -def fixture_sqlite(username): +def fixture_sqlite(username, change_test_dir, atlas): + logging.debug("creating database file in %s", os.getcwd()) + assert not os.path.exists(f"{username}_workspace.db") sqlite3.connect(f"{username}_workspace.db").close() + logger.debug("reloading metoh") + reload(metoh) + logger.debug("Storing empty objects to create tables") metob.store(metob.Atlas()) metob.store(metob.CompoundIdentification()) metob.store(metob.Compound()) metob.store(metob.MzReference()) metob.store(metob.RtReference()) metob.store(metob.LcmsRun()) + logger.debug("Done storing empty objects to create tables") yield + metob.workspace.close_connection() close_all_sessions() @pytest.fixture(name="sqlite_with_atlas") def fixture_sqlite_with_atlas(sqlite, atlas, username): atlas.name = f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0" + logger.debug("Saving atlas %s", atlas.name) metob.store(atlas) @pytest.fixture(name="sqlite_with_atlas_with_2_cids") def fixture_sqlite_with_atlas_with_2_cids(sqlite, atlas_with_2_cids, username): atlas_with_2_cids.name = f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}1" + logger.debug("Saving atlas %s", atlas_with_2_cids.name) metob.store(atlas_with_2_cids) @pytest.fixture(name="change_test_dir", scope="function", autouse=True) def fixture_change_test_dir(request, tmp_path): os.chdir(tmp_path) + logger.debug("changing dir to %s", tmp_path) yield os.chdir(request.config.invocation_dir) @@ -526,7 +543,7 @@ def fixture_metatlas_dataset( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - return mads.MetatlasDataset(analysis_ids, groups_controlled_vocab) + return mads.MetatlasDataset(analysis_ids, groups_controlled_vocab, save_metadata=False) @pytest.fixture(name="metatlas_dataset_with_2_cids") @@ -542,7 +559,7 @@ def fixture_metatlas_dataset_with_2_cids( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", 
return_value=[lcmsrun]) - return mads.MetatlasDataset(analysis_ids_with_2_cids, groups_controlled_vocab) + return mads.MetatlasDataset(analysis_ids_with_2_cids, groups_controlled_vocab, save_metadata=False) @pytest.fixture(name="eic") diff --git a/tests/unit/test_environment.py b/tests/unit/test_environment.py new file mode 100644 index 00000000..1b7d25c2 --- /dev/null +++ b/tests/unit/test_environment.py @@ -0,0 +1,71 @@ +"""Test of environment setup functions""" +# pylint: disable=missing-function-docstring + +import json + +from metatlas.tools import environment + + +def test_create_notebook_with_parameters01(): + orig_data = { + "cells": [ + None, + { + "source": [ + "# this is a comment\n", + "param1 = 0\n", + "\n", + "param2 = []\n", + 'param3 = "REPLACE ME"\n', + ] + }, + ] + } + with open("test.json", "w") as out_fh: + json.dump(orig_data, out_fh) + environment.create_notebook_with_parameters( + "test.json", "out.json", {"param1": 1, "param2": ["foo", "bar"], "param3": "My_Exp_Name"} + ) + with open("out.json") as in_fh: + data = json.load(in_fh) + assert data["cells"][1]["source"][1] == "param1 = 1\n" + assert data["cells"][1]["source"][3] == "param2 = ['foo', 'bar']\n" + assert data["cells"][1]["source"][4] == "param3 = 'My_Exp_Name'\n" + + +def test_create_notebook_with_parameters02(): + orig_data = { + "cells": [ + None, + { + "source": [ + "# this is a comment\n", + "param1 = 0\n", + "\n", + "param2 = []\n", + 'param3 = "REPLACE ME"\n', + ] + }, + ] + } + with open("test.json", "w") as out_fh: + json.dump(orig_data, out_fh) + environment.create_notebook_with_parameters("test.json", "out.json", {}) + with open("out.json") as in_fh: + data = json.load(in_fh) + assert data["cells"][1]["source"][1] == "param1 = 0\n" + assert data["cells"][1]["source"][3] == "param2 = []\n" + assert data["cells"][1]["source"][4] == 'param3 = "REPLACE ME"\n' + + +def test_create_notebook_with_parameters03(): + orig_data = { + "cells": [None, {"source": ["# this is a comment\n", "param1 = True\n", "\n", "param2 = None\n"]}] + } + with open("test.json", "w") as out_fh: + json.dump(orig_data, out_fh) + environment.create_notebook_with_parameters("test.json", "out.json", {"param1": None, "param2": True}) + with open("out.json") as in_fh: + data = json.load(in_fh) + assert data["cells"][1]["source"][1] == "param1 = None\n" + assert data["cells"][1]["source"][3] == "param2 = True\n" diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 3b7c83c1..4f131c16 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -373,6 +373,20 @@ def test_store_atlas04(metatlas_dataset, sqlite, username): metatlas_dataset.store_atlas() +def test_store_atlas05(atlas, sqlite, username): + atlas.name = "test atlas" + metob.store(atlas) + second = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(second) == 1 + + +def test_store_atlas06(atlas, sqlite_with_atlas, username): + atlas.name = "test atlas 06" + metob.store(atlas) + second = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(second) == 1 + + def test_analysis_identifiers01(sqlite): with pytest.raises(ValueError, match=r"Database does not contain an atlas.*"): mads.AnalysisIdentifiers( @@ -510,10 +524,8 @@ def test_get_atlas03(mocker, analysis_ids, caplog, username): mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[0, 0]) with pytest.raises(ValueError): mads.MetatlasDataset(analysis_ids) - assert ( - f"2 atlases with 
name 505892_OakGall_final_FinalEMA-HILIC_POS_{username}0 and owned by {username} already exist." - in caplog.text - ) + atlas = f"505892_OakGall_final_FinalEMA-HILIC_POS_{username}0" + assert f"2 atlases with name {atlas} and owned by {username} already exist." in caplog.text def test_get_atlas04(metatlas_dataset, username): From 5c3c224c4c22ec9acae0ad2244d4e52d45932354 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 29 Jun 2021 09:54:30 -0700 Subject: [PATCH 032/177] WIP - add debug logging to metatlas_objects.py --- metatlas/datastructures/metatlas_objects.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/metatlas/datastructures/metatlas_objects.py b/metatlas/datastructures/metatlas_objects.py index de48bda1..590e7520 100644 --- a/metatlas/datastructures/metatlas_objects.py +++ b/metatlas/datastructures/metatlas_objects.py @@ -1,10 +1,12 @@ from __future__ import absolute_import from __future__ import print_function import getpass -import uuid -import time +import logging import os import pprint +import time +import uuid + from pwd import getpwuid from tabulate import tabulate import pandas as pd @@ -16,6 +18,8 @@ ) from six.moves import zip +logger = logging.getLogger(__name__) + #Making a new table means adding a new class to metatlas_objects.py. #Floats are set as single precision by default, unfortunately, so here is the best way to create a table containing floats: #Create a new table @@ -122,6 +126,7 @@ class MetatlasObject(HasTraits): def __init__(self, **kwargs): """Set the default attributes.""" + logger.debug('Creating new instance of %s with parameters %s', self.__class__.__name__, kwargs) kwargs.setdefault('unique_id', uuid.uuid4().hex) kwargs.setdefault('head_id', kwargs['unique_id']) kwargs.setdefault('username', getpass.getuser()) @@ -169,6 +174,7 @@ def clone(self, recursive=False): obj: MetatlasObject Cloned object. """ + logger.debug('Cloning instance of %s with recursive=', self.__class__.__name__, recursive) obj = self.__class__() for (tname, trait) in self.traits().items(): if tname.startswith('_') or trait.metadata.get('readonly', False): @@ -244,6 +250,7 @@ def __getattribute__(self, name): """Automatically resolve stubs on demand. 
""" # value = super(MetatlasObject, self).__getattribute__(name) + logger.debug('Automatically resolving stub via %s.__getattribute__(%s)', self.__class__.__name__, name) value = super().__getattribute__(name) if isinstance(value, Stub) and FETCH_STUBS: From a0602ff92db552bfde7ad9aa8a34a1c4a58fd28a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 29 Jun 2021 09:55:04 -0700 Subject: [PATCH 033/177] WIP - add tests around resolving stubs to classes --- tests/unit/test_metatlas_dataset.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 4f131c16..587301a5 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -588,3 +588,21 @@ def test_generate_all_outputs01(metatlas_dataset, hits, mocker): def test_short_polarity_inverse01(analysis_ids): assert set(analysis_ids.short_polarity_inverse) == {"NEG", "FPS"} + + +def test_access_data_compound_name(metatlas_dataset): + assert metatlas_dataset.data[0][0]["identification"].name == "2'-deoxyadenosine" + + +def test_cid_type01(atlas): + assert isinstance(atlas.compound_identifications[0], metob.CompoundIdentification) + + +def test_load_atlas01(atlas, sqlite_with_atlas, username): + atlases = metob.retrieve("Atlas", name=atlas.name, username=username) + assert isinstance(atlases[0].compound_identifications[0], metob.CompoundIdentification) + + +def test_load_atlas02(atlas, sqlite_with_atlas, username): + results = metob.retrieve("Atlas", name=atlas.name, username=username) + assert isinstance(results[0].compound_identifications[0], metob.CompoundIdentification) From 317bfcee178ee604bae37839ab4a1dada907467b Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 1 Jul 2021 17:11:15 -0700 Subject: [PATCH 034/177] WIP - remove reload(). All tests passing. Uses db connection pooling with transactions. clean up logging. --- metatlas/datastructures/metatlas_dataset.py | 2 +- metatlas/datastructures/metatlas_objects.py | 15 +++++++-------- metatlas/datastructures/object_helpers.py | 17 ++++++++++++----- metatlas/io/metatlas_get_data_helper_fun.py | 4 ---- tests/unit/conftest.py | 11 +++++------ tests/unit/test_metatlas_dataset.py | 9 +++++++-- tests/unit/test_metatlas_get_data_helper_fun.py | 2 +- 7 files changed, 33 insertions(+), 27 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 5285cccb..abbbad05 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -923,4 +923,4 @@ def parallel_process(function, data, max_cpus, unit=None): if max_cpus > 1 and len(data) > 1: with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool: return list(tqdm.tqdm(pool.imap(function, data), length=len(data), **kwargs)) - return [function(i) for i in tqdm.tqdm(data, **kwargs)] + return [function(i) for i in data] diff --git a/metatlas/datastructures/metatlas_objects.py b/metatlas/datastructures/metatlas_objects.py index 590e7520..64397e76 100644 --- a/metatlas/datastructures/metatlas_objects.py +++ b/metatlas/datastructures/metatlas_objects.py @@ -63,7 +63,7 @@ def retrieve(object_type, **kwargs): List of Metatlas Objects meeting the criteria. Will return the latest version of each object. 
""" - return workspace.retrieve(object_type, **kwargs) + return Workspace.get_instance().retrieve(object_type, **kwargs) def remove(object_type, **kwargs): @@ -81,7 +81,7 @@ def remove(object_type, **kwargs): if not isinstance(object_type, str): print('remove() expects a string argument, use remove_objects() to' 'delete actual objects.') - return workspace.remove(object_type, **kwargs) + return Workspace.get_instance().remove(object_type, **kwargs) def remove_objects(objects, all_versions=True, **kwargs): @@ -96,7 +96,7 @@ def remove_objects(objects, all_versions=True, **kwargs): if isinstance(objects, str): print('remove_objects() expects actual objects, use remove() to' 'remove objects by type.') - return workspace.remove_objects(objects, all_versions, **kwargs) + return Workspace.get_instance().remove_objects(objects, all_versions, **kwargs) def store(objects, **kwargs): @@ -107,7 +107,7 @@ def store(objects, **kwargs): objects: Metatlas object or list of Metatlas Objects Object(s) to store in the database. """ - workspace.save_objects(objects, **kwargs) + Workspace.get_instance().save_objects(objects, **kwargs) @set_docstring @@ -174,7 +174,7 @@ def clone(self, recursive=False): obj: MetatlasObject Cloned object. """ - logger.debug('Cloning instance of %s with recursive=', self.__class__.__name__, recursive) + logger.debug('Cloning instance of %s with recursive=%s', self.__class__.__name__, recursive) obj = self.__class__() for (tname, trait) in self.traits().items(): if tname.startswith('_') or trait.metadata.get('readonly', False): @@ -250,7 +250,6 @@ def __getattribute__(self, name): """Automatically resolve stubs on demand. """ # value = super(MetatlasObject, self).__getattribute__(name) - logger.debug('Automatically resolving stub via %s.__getattribute__(%s)', self.__class__.__name__, name) value = super().__getattribute__(name) if isinstance(value, Stub) and FETCH_STUBS: @@ -486,7 +485,7 @@ def validate(self, obj, value): elif isinstance(value, str): if value.upper() in ID_GRADES: return ID_GRADES[value.upper()] - objects = workspace.retrieve('identificationgrade', name=value.upper()) + objects = Workspace.get_instance().retrieve('identificationgrade', name=value.upper()) if objects: ID_GRADES[value.upper()] = objects[-1] return objects[-1] @@ -662,7 +661,7 @@ def find_invalid_runs(**kwargs): # Singleton Workspace object # Must be instantiated after all of the Metatlas Objects # are defined so we can get all of the subclasses. 
-workspace = Workspace() +# workspace = Workspace() def to_dataframe(objects): diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py index f1fa81ce..9f8b4ff4 100644 --- a/metatlas/datastructures/object_helpers.py +++ b/metatlas/datastructures/object_helpers.py @@ -115,6 +115,7 @@ def _get_subclasses(cls): for g in _get_subclasses(s)] class Workspace(object): + instance = None def __init__(self): # get metatlas directory since notebooks and scripts could be launched @@ -146,7 +147,7 @@ def __init__(self): self.path = f"sqlite:///{filename}" if os.path.exists(filename): os.chmod(filename, 0o775) - logging.warning('Using database at: %s', self.path) + logging.debug('Using database at: %s', self.path) self.tablename_lut = dict() self.subclass_lut = dict() @@ -164,6 +165,12 @@ def __init__(self): self.seen = dict() Workspace.instance = self + @classmethod + def get_instance(cls): + if Workspace.instance is None: + return Workspace() + return Workspace.instance + def get_connection(self): """ Get a re-useable connection to the database. @@ -195,7 +202,7 @@ def convert_to_double(self, table, entry): def save_objects(self, objects, _override=False): """Save objects to the database""" - logging.warning('Entering Workspace.save_objects') + logging.debug('Entering Workspace.save_objects') if not isinstance(objects, (list, set)): objects = [objects] self._seen = dict() @@ -204,7 +211,7 @@ def save_objects(self, objects, _override=False): self._inserts = defaultdict(list) for obj in objects: self._get_save_data(obj, _override) - logging.warning('Workspace._inserts=%s', self._inserts) + logging.debug('Workspace._inserts=%s', self._inserts) self.get_connection() self.db.begin() try: @@ -230,7 +237,7 @@ def save_objects(self, objects, _override=False): if 'sqlite' not in self.path: self.fix_table(table_name) self.db[table_name].insert_many(inserts) - logging.warning('inserting %s', inserts) + logging.debug('inserting %s', inserts) self.db.commit() except Exception: self.db.rollback() @@ -580,7 +587,7 @@ def __repr__(self): self.unique_id) def __str__(self): - return self.unique_id + return str(self.unique_id) class MetInstance(Instance): diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index 2009a62d..bd311714 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -176,15 +176,11 @@ def remove_ms1_data_not_in_atlas(atlas_df, data): has_current_polarity = atlas_df.detected_polarity == polarity if any(has_current_polarity): atlas_mz = atlas_df[has_current_polarity].mz.copy().sort_values().values - logger.debug("atlas_mz=%s", atlas_mz) max_mz_tolerance = atlas_df[has_current_polarity].mz_tolerance.max() - logger.debug("atlas_mz=%s, max_mz_tolerance=%.6f", atlas_mz, max_mz_tolerance) if data[name].shape[0] > 1: original_mz = data[name].mz.values nearest_mz = fast_nearest_interp(original_mz, atlas_mz, atlas_mz) - logger.debug("nearest_mz=%s", nearest_mz) data[name]['ppm_difference'] = abs(original_mz - nearest_mz) / original_mz * 1e6 - logger.debug("ppm_difference=%s", data[name]['ppm_difference']) query_str = 'ppm_difference < %f' % max_mz_tolerance data[name] = data[name].query(query_str) return data diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index fedaa63f..0fb6251c 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -8,8 +8,7 @@ import logging import os import sqlite3 - -from importlib import reload +import threading 
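[Aside, not part of the patch: how the lazy singleton combines with a per-test directory in the unit tests. The close_connection() call and the Workspace.instance = None reset are taken from the conftest.py hunks in this patch; the fixture name and body below are an illustrative recombination, not code from the repo.]

    import pytest

    from metatlas.datastructures import object_helpers as metoh

    @pytest.fixture
    def fresh_workspace(tmp_path, monkeypatch):
        monkeypatch.chdir(tmp_path)  # the sqlite file lands under the test dir
        yield metoh.Workspace.get_instance()
        metoh.Workspace.get_instance().close_connection()
        metoh.Workspace.instance = None  # the next test builds a new Workspace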
import pytest import numpy as np @@ -59,8 +58,6 @@ def fixture_sqlite(username, change_test_dir, atlas): logging.debug("creating database file in %s", os.getcwd()) assert not os.path.exists(f"{username}_workspace.db") sqlite3.connect(f"{username}_workspace.db").close() - logger.debug("reloading metoh") - reload(metoh) logger.debug("Storing empty objects to create tables") metob.store(metob.Atlas()) metob.store(metob.CompoundIdentification()) @@ -70,8 +67,8 @@ def fixture_sqlite(username, change_test_dir, atlas): metob.store(metob.LcmsRun()) logger.debug("Done storing empty objects to create tables") yield - metob.workspace.close_connection() - close_all_sessions() + metoh.Workspace.get_instance().close_connection() + metoh.Workspace.instance = None @pytest.fixture(name="sqlite_with_atlas") @@ -90,10 +87,12 @@ def fixture_sqlite_with_atlas_with_2_cids(sqlite, atlas_with_2_cids, username): @pytest.fixture(name="change_test_dir", scope="function", autouse=True) def fixture_change_test_dir(request, tmp_path): + logger.info("Incoming thread count %d", threading.active_count()) os.chdir(tmp_path) logger.debug("changing dir to %s", tmp_path) yield os.chdir(request.config.invocation_dir) + logger.info("Outgoing thread count %d", threading.active_count()) @pytest.fixture(name="ms1_pos") diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 587301a5..20fb1822 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -582,8 +582,8 @@ def test_annotation_gui01(metatlas_dataset, hits, mocker): def test_generate_all_outputs01(metatlas_dataset, hits, mocker): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) metatlas_dataset.generate_all_outputs() - assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*")) == 12 - assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*/*")) == 23 + assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*")) == 10 + assert len(glob.glob(metatlas_dataset.ids.output_dir + "/*/*")) == 19 def test_short_polarity_inverse01(analysis_ids): @@ -606,3 +606,8 @@ def test_load_atlas01(atlas, sqlite_with_atlas, username): def test_load_atlas02(atlas, sqlite_with_atlas, username): results = metob.retrieve("Atlas", name=atlas.name, username=username) assert isinstance(results[0].compound_identifications[0], metob.CompoundIdentification) + + +def test_load_atlas03(sqlite_with_atlas, atlas, username): + results = metob.retrieve("Atlas", name=atlas.name, username=username) + assert results[0].compound_identifications[0].rt_references[0].rt_peak == 2.1964640053707174 diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index a2475d86..20d62778 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -22,7 +22,7 @@ def test_transfer_identification_data_to_atlas(metatlas_dataset, atlas): assert updated.identification_notes == out.compound_identifications[0].identification_notes -def test_set_nested_term_attr(metatlas_dataset): +def test_set_nested_term_attr01(metatlas_dataset): gdhf.set_nested( metatlas_dataset, [0, 0, "identification", "mz_references", 0, "adduct"], From 13fe6d3428e15155553f77c1e78dd95fa5e4856a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 1 Jul 2021 21:58:28 -0700 Subject: [PATCH 035/177] Fix incorrect values in system test Add additional assertions in system test --- metatlas/datastructures/metatlas_dataset.py | 1 + 
 tests/system/test_targeted.py | 31 ++++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index abbbad05..c9b8d81a 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -804,6 +804,7 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): ) with tarfile.open(output_path, "w:gz") as tar: tar.add(self.ids.output_dir, arcname=os.path.basename(self.ids.output_dir)) + logger.info("Generation of archive completed successfully: %s", output_path) diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index 76076708..e56828c5 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -7,8 +7,10 @@ def test_targeted_by_line01_with_remove(tmp_path): image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.2.0" experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" - out_file = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" - expected = [ + out_files = {} + expected = {} + out_files['peak_height'] = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" + expected['peak_height'] = [ f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", # noqa: E501 "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t304761.90625\t416788.03125\t837662.0625\t2359861.25", - "0001_adenine_positive_M+H136p0618_2p52\t1880780.125\t12096485.0\t51774956.0\t91955488.0", + "0001_adenine_positive_M+H136p0618_2p52\t1594753.875\t12096485.0\t51774956.0\t91955488.0", "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", "", ] + out_files['rt_peak'] = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_rt_peak.tab" + expected['rt_peak'] = [ + f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", 
f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", + "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", + "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", + "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 + "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 + "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t2.2775044441223145\t2.2806363105773926\t2.2833268642425537\t2.2922415733337402", + "0001_adenine_positive_M+H136p0618_2p52\t2.6164748668670654\t2.639369249343872\t2.6182913780212402\t2.657374620437622", + "0002_adenosine_positive_M+H268p1041_3p02\t3.098848819732666\t3.1250929832458496\t3.1176068782806396\t3.139331817626953", + ] subprocess.run( [ "docker", @@ -67,8 +81,11 @@ def test_targeted_by_line01_with_remove(tmp_path): num_files_created = int( subprocess.check_output(f"find {str(tmp_path)} -type f | wc -l", shell=True, text=True).strip() ) + for metric_name, path in out_files.items(): + os.system(f"cat {path}") assert num_files_created == 39 - with open(out_file, "r") as handle: - for num, line in enumerate(handle.readlines()): - clean_line = line.rstrip("\n") - assert expected[num] == clean_line + for metric_name, path in out_files.items(): + with open(path, "r") as handle: + for num, line in enumerate(handle.readlines()): + clean_line = line.rstrip("\n") + assert expected[metric_name][num] == clean_line From 96c296c0168db52da1290c32e3bc5af163d2f190 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 1 Jul 2021 22:26:08 -0700 Subject: [PATCH 036/177] WIP - linting fixes --- tests/system/test_targeted.py | 12 +++++++----- tests/unit/conftest.py | 2 -- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index e56828c5..d95f5199 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -9,8 +9,10 @@ def test_targeted_by_line01_with_remove(tmp_path): experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" out_files = {} expected = {} - out_files['peak_height'] = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" - expected['peak_height'] = [ + out_files["peak_height"] = ( + tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" + ) + expected["peak_height"] = [ f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 
f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", # noqa: E501 "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", @@ -22,8 +24,8 @@ def test_targeted_by_line01_with_remove(tmp_path): "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", "", ] - out_files['rt_peak'] = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_rt_peak.tab" - expected['rt_peak'] = [ + out_files["rt_peak"] = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_rt_peak.tab" + expected["rt_peak"] = [ f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", @@ -81,7 +83,7 @@ def test_targeted_by_line01_with_remove(tmp_path): num_files_created = int( subprocess.check_output(f"find {str(tmp_path)} -type f | wc -l", shell=True, text=True).strip() ) - for metric_name, path in out_files.items(): + for _, path in out_files.items(): os.system(f"cat {path}") assert num_files_created == 39 for metric_name, path in out_files.items(): diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 0fb6251c..802761a1 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -14,8 +14,6 @@ import numpy as np import pandas as pd -from sqlalchemy.orm import close_all_sessions - from metatlas.datastructures import metatlas_dataset as mads from metatlas.datastructures import metatlas_objects as metob from metatlas.datastructures import object_helpers as metoh From 6639620b081733006ae3a53d7c9d1ce9d10cac8c Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 2 Jul 2021 07:52:07 -0700 Subject: [PATCH 037/177] Increased logging in MetatlasDataset --- metatlas/datastructures/metatlas_dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index c9b8d81a..58012197 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -61,6 +61,7 @@ def __init__( def validate(self): """Valid class inputs""" + logging.debug('Validating inputs to AnalysisIdentifiers') if self._source_atlas is not None: get_atlas(self.source_atlas, self.username) # will raise error if not found or matches multiple if len(self.experiment.split("_")) != 9: @@ -73,6 +74,7 @@ def validate(self): raise TypeError("Parameter analysis_number is not an integer.") if self.analysis_number < 0: raise ValueError("Parameter analysis_number cannot be negative.") + logging.debug('Inputs to AnalysisIdentifiers passed validation.') @property def source_atlas(self): @@ -178,6 +180,7 @@ def __init__( exclude_files: array of strings that will exclude files if they are substrings of the filename save_metadata: if True, write metadata files 
containing data sources and LCMS runs short name """ + logging.debug('Creating new MetatlasDataset instance...') self.ids = ids self._atlas = None self._atlas_valid = False @@ -202,6 +205,7 @@ def __init__( if ids.source_atlas is not None: self._get_atlas() if save_metadata: + logging.debug('Writing MetatlasDataset metadata files') self.write_data_source_files() self.write_lcmsruns_short_names() self.store_groups(exist_ok=True) @@ -260,7 +264,9 @@ def _get_atlas(self): logger.exception(err) raise err else: + logging.info('Retrieving source atlas: %s', self.ids.source_atlas) source = get_atlas(self.ids.source_atlas, self.ids.username) + logging.info('Cloning source atlas') self._atlas = source.clone() self._atlas.name = self.ids.atlas self._atlas_valid = True @@ -397,6 +403,7 @@ def store_atlas(self, even_if_exists=False): Saves the altas to the database. Raises ValueError if even_if_exists==False and name is already in the database with your username """ + start_time = datetime.datetime.now() name = self.atlas.name username = self.ids.username try: @@ -406,7 +413,7 @@ logger.exception(err) raise err metob.store(self.atlas) - logger.info("Atlas %s stored in database with owner %s.", self.ids.atlas, self.ids.username) + logger.info("Atlas %s stored in database with owner %s in %s.", self.ids.atlas, self.ids.username, _duration_since(start_time)) def export_atlas_to_csv(self, filename=None): """ @@ -752,6 +759,7 @@ def store_groups(self, exist_ok=False): except ValueError as err: logger.exception(err) raise err + logging.debug('Storing %d groups in the database', len(self.groups)) metob.store(self.groups) def compound_idxs_not_evaluated(self): From e6854d8c95da5b5caa69c7cd2a7197277aac656e Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 2 Jul 2021 11:45:13 -0700 Subject: [PATCH 038/177] add msms_refs fixture and test get_msms_hits --- metatlas/datastructures/metatlas_dataset.py | 21 +- metatlas/plots/dill2plots.py | 40 +-- tests/unit/conftest.py | 291 ++++++++++++++++++++ tests/unit/test_dill2plot.py | 7 + 4 files changed, 334 insertions(+), 25 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 58012197..b3cab3b3 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -61,7 +61,7 @@ def __init__( def validate(self): """Valid class inputs""" - logging.debug('Validating inputs to AnalysisIdentifiers') + logging.debug("Validating inputs to AnalysisIdentifiers") if self._source_atlas is not None: get_atlas(self.source_atlas, self.username) # will raise error if not found or matches multiple if len(self.experiment.split("_")) != 9: @@ -74,7 +74,7 @@ raise TypeError("Parameter analysis_number is not an integer.") if self.analysis_number < 0: raise ValueError("Parameter analysis_number cannot be negative.") - logging.debug('Inputs to AnalysisIdentifiers passed validation.') + logging.debug("Inputs to AnalysisIdentifiers passed validation.") @property def source_atlas(self): @@ -180,7 +180,7 @@ def __init__( exclude_files: array of strings that will exclude files if they are substrings of the filename save_metadata: if True, write metadata files containing data sources and LCMS runs short name """ - logging.debug('Creating new MetatlasDataset instance...') + logging.debug("Creating new MetatlasDataset instance...") self.ids = ids self._atlas = None self._atlas_valid = False @@ -202,6 +205,7 @@ def __init__( if 
ids.source_atlas is not None: self._get_atlas() if save_metadata: - logging.debug('Writing MetatlasDataset metadata files') + logging.debug("Writing MetatlasDataset metadata files") self.write_data_source_files() self.write_lcmsruns_short_names() self.store_groups(exist_ok=True) @@ -264,9 +264,9 @@ def _get_atlas(self): logger.exception(err) raise err else: - logging.info('Retrieving source atlas: %s', self.ids.source_atlas) + logging.info("Retrieving source atlas: %s", self.ids.source_atlas) source = get_atlas(self.ids.source_atlas, self.ids.username) - logging.info('Cloning source atlas') + logging.info("Cloning source atlas") self._atlas = source.clone() self._atlas.name = self.ids.atlas self._atlas_valid = True @@ -413,7 +413,12 @@ logger.exception(err) raise err metob.store(self.atlas) - logger.info("Atlas %s stored in database with owner %s in %s.", self.ids.atlas, self.ids.username, _duration_since(start_time)) + logger.info( + "Atlas %s stored in database with owner %s in %s.", + self.ids.atlas, + self.ids.username, + _duration_since(start_time), + ) def export_atlas_to_csv(self, filename=None): """ @@ -759,7 +764,7 @@ def store_groups(self, exist_ok=False): except ValueError as err: logger.exception(err) raise err - logging.debug('Storing %d groups in the database', len(self.groups)) + logging.debug("Storing %d groups in the database", len(self.groups)) metob.store(self.groups) def compound_idxs_not_evaluated(self): diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 24c3120f..902ed702 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -2033,6 +2033,28 @@ def plot_score_and_ref_file(ax, score, rt, ref): fontsize=2, transform=ax.transAxes) + +def get_refs(file_name, **kwargs): + """Load msms refs from file_name, returns pandas DataFrame""" + # Reference parameters + ref_dtypes = kwargs.pop('ref_dtypes', {'database': str, 'id': str, 'name': str, + 'spectrum': object, 'decimal': int, 'precursor_mz': float, + 'polarity': str, 'adduct': str, 'fragmentation_method': str, + 'collision_energy': str, 'instrument': str, 'instrument_type': str, + 'formula': str, 'exact_mass': float, + 'inchi_key': str, 'inchi': str, 'smiles': str}) + + ref_index = kwargs.pop('ref_index', ['database', 'id']) + if 'ref_df' in kwargs: + ref_df = kwargs.pop('ref_df') + else: + ref_df = 
pd.read_csv(ref_loc, - sep='\t', - dtype=ref_dtypes - ).set_index(ref_index) - ref_df = ref_df.query(pre_query, local_dict=dict(locals(), **kwargs)).copy() if ref_df['spectrum'].apply(type).eq(str).all(): diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 802761a1..d06b668b 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1788,3 +1788,294 @@ def fixture_hits(): ) hits_plus.drop(columns=["copy_index"], inplace=True) return hits_plus + + +@pytest.fixture(name="msms_refs") +def fixture_msms_refs(): + return ( + pd.DataFrame( + data={ + "name": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): "2'-deoxyadenosine", + ("mona", "KO002730"): "2'-Deoxyadenosine", + ("mona", "KO002729"): "2'-Deoxyadenosine", + ("mona", "KO008947"): "2'-Deoxyadenosine", + ("mona", "KO002727"): "2'-Deoxyadenosine", + ("mona", "KO002728"): "2'-Deoxyadenosine", + ("mona", "KO002726"): "2'-Deoxyadenosine", + ("mona", "PR100081"): "2'-Deoxyadenosine", + ("mona", "PR100080"): "2'-Deoxyadenosine", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): "2'-deoxyadenosine", + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): "2'-deoxyadenosine", + }, + "spectrum": { + ( + "metatlas", + "c7dddd297e104ca79caea72a90150532", + ): "[[57.0345, 63.3177, 63.3205, 69.0344, 71.0499, 73.0292, 84.9778, 99.0447, 117.055, 118.059, 136.062, 137.066, 236.709, 252.109, 253.112], [176328.0, 328818.0, 274432.0, 197637.0, 896360.0, 1192020.0, 378547.0, 3921880.0, 15737700.0, 266131.0, 144220000.0, 3455270.0, 185227.0, 20960800.0, 1284450.0]]", + ( + "mona", + "KO002730", + ): "[[40.9, 43.1, 45.0, 57.1, 67.1, 69.1, 71.1, 72.7, 76.8, 79.0, 80.8, 83.2, 91.8, 92.4, 93.2, 94.1, 95.0, 102.8, 105.3, 107.3, 109.1, 116.8, 119.2, 123.0, 129.9, 136.2, 165.9], [3.501946, 10.700389, 5.447471, 16.536965, 1.945525, 9.727626, 5.642023, 8.171206, 24.513619, 66.731518, 2.918288, 4.474708, 2.529183, 1.750973, 0.583658, 9.533074, 3.891051, 0.972763, 12.062257, 2.140078, 5.058366, 0.389105, 48.44358, 2.529183, 14.007782, 100.0, 0.389105]]", + ( + "mona", + "KO002729", + ): "[[35.8, 41.0, 43.1, 45.2, 52.9, 55.2, 57.4, 59.1, 61.4, 69.2, 71.1, 73.0, 77.0, 79.0, 81.3, 83.1, 91.2, 94.0, 99.3, 99.9, 101.1, 103.1, 105.0, 106.7, 107.4, 108.9, 111.1, 115.0, 117.2, 119.1, 120.4, 123.1, 130.1, 135.1, 136.0, 136.9, 141.3, 147.1, 166.0, 170.7], [0.170503, 0.383632, 3.665814, 0.937766, 0.127877, 0.895141, 9.079284, 0.852515, 0.341006, 4.390452, 7.1185, 5.242967, 1.960784, 32.139812, 1.875533, 2.429668, 1.278772, 1.491901, 2.216539, 1.364024, 1.364024, 0.511509, 8.01364, 0.468883, 0.255754, 1.321398, 0.426257, 0.255754, 1.193521, 6.734868, 0.170503, 6.990622, 8.823529, 0.213129, 100.0, 0.468883, 0.085251, 0.29838, 0.639386, 0.127877]]", + ( + "mona", + "KO008947", + ): "[[71.1, 73.2, 81.1, 89.2, 94.1, 99.1, 101.0, 109.0, 117.1, 119.1, 128.9, 130.0, 133.3, 136.1, 136.9, 137.8, 149.3, 156.5, 165.1, 187.1, 195.1, 213.8, 215.1, 216.1, 217.1, 223.9, 234.1, 251.0, 252.1, 253.0, 270.9], [0.01998, 0.014577, 0.003889, 0.047639, 0.031539, 0.085402, 0.011502, 0.010675, 0.361156, 0.125255, 0.051259, 0.022955, 0.011046, 100.0, 0.116678, 0.01325, 0.029859, 0.006369, 0.003048, 0.01887, 0.066214, 0.003726, 0.011393, 0.013584, 0.013105, 0.010913, 0.080999, 0.012124, 0.179916, 0.010441, 0.005516]]", + ( + "mona", + "KO002727", + ): "[[54.2, 57.3, 59.1, 69.2, 71.1, 72.2, 72.8, 74.9, 78.9, 80.1, 80.8, 83.1, 85.4, 87.0, 88.9, 91.1, 93.8, 95.2, 99.0, 100.0, 101.0, 105.0, 107.0, 109.0, 111.5, 113.0, 115.2, 116.3, 117.2, 119.1, 121.3, 122.2, 123.2, 124.4, 129.1, 
130.0, 133.0, 135.1, 136.1, 139.4, 145.7, 149.4, 153.0, 157.4, 158.4, 163.0, 165.3, 166.4, 175.1, 176.4, 179.3, 181.1, 184.0, 184.7, 189.2, 191.5, 199.3, 203.5, 207.2, 217.3, 220.1, 235.3, 252.2], [2.60144, 3.583115, 0.098168, 0.179974, 9.080497, 0.294503, 0.507199, 0.081806, 1.014398, 0.13089, 0.114529, 0.13089, 0.098168, 0.212696, 0.229058, 0.490838, 0.065445, 0.196335, 0.998037, 5.039267, 4.744764, 1.210733, 0.147251, 0.376309, 1.963351, 1.259817, 0.081806, 0.065445, 5.611911, 0.114529, 0.556283, 1.194372, 35.02945, 0.049084, 0.91623, 1.996073, 0.114529, 0.556283, 100.0, 0.114529, 0.081806, 0.147251, 0.098168, 0.081806, 0.179974, 0.114529, 0.147251, 0.768979, 6.25, 0.114529, 0.343586, 0.032723, 0.310864, 0.163613, 0.310864, 0.278141, 0.65445, 0.39267, 0.212696, 1.897906, 0.294503, 7.509817, 3.043194]]", + ( + "mona", + "KO002728", + ): "[[36.0, 42.8, 55.4, 57.3, 59.3, 60.8, 68.8, 71.0, 72.8, 76.2, 77.4, 79.1, 80.9, 83.4, 85.3, 87.3, 88.9, 91.0, 93.2, 95.0, 97.0, 99.1, 100.2, 101.1, 102.4, 105.1, 107.0, 109.2, 111.2, 112.9, 117.0, 119.4, 121.0, 122.5, 123.2, 128.9, 130.2, 133.2, 136.2, 150.9, 158.0, 161.1, 163.0, 166.3, 175.2, 179.2, 189.0, 191.2, 207.1, 217.5, 235.3], [0.804783, 0.66682, 0.229938, 6.829156, 0.459876, 0.091975, 2.230398, 10.255231, 3.173143, 0.137963, 0.160957, 13.152449, 0.896758, 1.425615, 0.206944, 0.091975, 0.436882, 0.413888, 0.137963, 0.551851, 0.18395, 3.885951, 2.644286, 2.943205, 0.091975, 4.828696, 0.275926, 0.505863, 1.241665, 0.229938, 4.621752, 0.804783, 0.252932, 0.252932, 20.303518, 0.298919, 6.36928, 0.229938, 100.0, 0.045988, 0.321913, 0.229938, 0.068981, 1.172683, 1.057714, 1.034721, 0.298919, 0.068981, 0.114969, 0.344907, 2.023454]]", + ( + "mona", + "KO002726", + ): "[[54.0, 57.2, 71.1, 72.2, 73.5, 77.7, 80.2, 82.4, 87.0, 90.3, 100.0, 101.2, 104.6, 106.0, 108.3, 109.4, 111.1, 112.3, 113.3, 116.4, 117.3, 118.2, 121.3, 122.3, 123.2, 125.9, 129.0, 129.9, 131.2, 135.1, 136.2, 137.4, 139.4, 140.9, 143.8, 146.3, 148.2, 152.5, 153.1, 159.7, 162.1, 166.3, 171.1, 175.2, 177.1, 178.0, 179.0, 180.1, 184.1, 185.5, 188.0, 192.2, 198.2, 199.2, 202.6, 203.1, 206.9, 207.4, 216.3, 217.6, 220.2, 224.2, 234.3, 235.2, 252.3], [2.518936, 0.334684, 3.399683, 11.044566, 0.052845, 0.334684, 0.193764, 0.088075, 0.07046, 2.096178, 7.02836, 1.514885, 0.10569, 0.052845, 0.546063, 0.140919, 0.140919, 0.10569, 24.255769, 0.140919, 0.352299, 0.211379, 0.334684, 4.192355, 38.400564, 0.176149, 0.123305, 0.052845, 0.140919, 0.123305, 37.819271, 0.07046, 0.052845, 0.123305, 0.228994, 0.07046, 0.10569, 0.669368, 1.638189, 0.07046, 0.123305, 1.092126, 0.334684, 10.991721, 0.10569, 0.07046, 0.07046, 0.211379, 2.378017, 0.052845, 0.123305, 5.302096, 0.246609, 0.387529, 0.211379, 0.634138, 0.123305, 0.123305, 0.07046, 7.592038, 1.46204, 0.088075, 1.726264, 59.098115, 100.0]]", + ("mona", "PR100081"): "[[117.0574, 136.0651, 252.1096], [15.868531, 100.0, 48.929209]]", + ("mona", "PR100080"): "[[136.0631, 252.1096], [39.169289, 100.0]]", + ( + "metatlas", + "e0025042a1a844d6b6926252edce91e5", + ): "[[66.7578, 70.38, 73.6972, 73.9685, 82.2146, 92.3969, 102.12, 104.312, 111.673, 136.062, 139.036, 158.043, 161.337, 168.39, 202.526, 235.987, 246.005, 274.002, 274.091, 274.273], [2649.93, 1977.51, 2080.95, 2643.01, 2450.61, 2214.72, 2214.78, 2349.55, 2163.28, 2982.16, 9507.9, 29909.8, 2525.4, 2199.08, 2170.93, 2443.12, 3793.61, 24676.1, 534389.0, 2775.85]]", + ( + "metatlas", + "0568278b45d244fcb5787792fc17b3ec", + ): "[[51.5615, 57.0342, 64.0128, 69.0341, 71.0498, 73.029, 73.9804, 81.0338, 
82.4275, 88.5237, 93.5638, 99.0444, 105.478, 117.055, 118.698, 126.793, 136.062, 252.108, 252.133], [845648.0, 896704.0, 912599.0, 2052520.0, 5955880.0, 8407590.0, 965782.0, 1548360.0, 1093910.0, 924679.0, 809760.0, 17986900.0, 949617.0, 56688000.0, 1347680.0, 891451.0, 468230000.0, 73715000.0, 1526730.0]]", + }, + "decimal": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): 4, + ("mona", "KO002730"): 3, + ("mona", "KO002729"): 3, + ("mona", "KO008947"): 1, + ("mona", "KO002727"): 3, + ("mona", "KO002728"): 3, + ("mona", "KO002726"): 3, + ("mona", "PR100081"): 4, + ("mona", "PR100080"): 4, + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): 4, + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): 4, + }, + "precursor_mz": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): 252.109, + ("mona", "KO002730"): 252.0, + ("mona", "KO002729"): 252.0, + ("mona", "KO008947"): 252.0, + ("mona", "KO002727"): 252.0, + ("mona", "KO002728"): 252.0, + ("mona", "KO002726"): 252.0, + ("mona", "PR100081"): 252.10963, + ("mona", "PR100080"): 252.10963, + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): 274.091, + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): 252.109, + }, + "polarity": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): "positive", + ("mona", "KO002730"): "positive", + ("mona", "KO002729"): "positive", + ("mona", "KO008947"): "positive", + ("mona", "KO002727"): "positive", + ("mona", "KO002728"): "positive", + ("mona", "KO002726"): "positive", + ("mona", "PR100081"): "positive", + ("mona", "PR100080"): "positive", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): "positive", + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): "positive", + }, + "adduct": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): np.nan, + ("mona", "KO002730"): "[M+H]+", + ("mona", "KO002729"): "[M+H]+", + ("mona", "KO008947"): "[M+H]+", + ("mona", "KO002727"): "[M+H]+", + ("mona", "KO002728"): "[M+H]+", + ("mona", "KO002726"): "[M+H]+", + ("mona", "PR100081"): "[M+H]+", + ("mona", "PR100080"): "[M+H]+", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): "[M+Na]+", + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): "[M+H]+", + }, + "fragmentation_method": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): "cid", + ("mona", "KO002730"): np.nan, + ("mona", "KO002729"): np.nan, + ("mona", "KO008947"): np.nan, + ("mona", "KO002727"): np.nan, + ("mona", "KO002728"): np.nan, + ("mona", "KO002726"): np.nan, + ("mona", "PR100081"): "LOW-ENERGY CID", + ("mona", "PR100080"): "LOW-ENERGY CID", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): "cid", + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): "cid", + }, + "collision_energy": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): "0", + ("mona", "KO002730"): "50 V", + ("mona", "KO002729"): "40 V", + ("mona", "KO008947"): "0.65", + ("mona", "KO002727"): "20 V", + ("mona", "KO002728"): "30 V", + ("mona", "KO002726"): "10 V", + ("mona", "PR100081"): "30 V", + ("mona", "PR100080"): "Ramp 5-60 V", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): np.nan, + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): np.nan, + }, + "instrument": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): np.nan, + ("mona", "KO002730"): np.nan, + ("mona", "KO002729"): np.nan, + ("mona", "KO008947"): np.nan, + ("mona", "KO002727"): np.nan, + ("mona", "KO002728"): np.nan, + ("mona", "KO002726"): np.nan, + ("mona", "PR100081"): np.nan, + ("mona", "PR100080"): np.nan, + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): np.nan, + ("metatlas", 
"0568278b45d244fcb5787792fc17b3ec"): np.nan, + }, + "instrument_type": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): np.nan, + ("mona", "KO002730"): "LC-ESI-QQ", + ("mona", "KO002729"): "LC-ESI-QQ", + ("mona", "KO008947"): "LC-ESI-IT", + ("mona", "KO002727"): "LC-ESI-QQ", + ("mona", "KO002728"): "LC-ESI-QQ", + ("mona", "KO002726"): "LC-ESI-QQ", + ("mona", "PR100081"): "LC-ESI-QTOF", + ("mona", "PR100080"): "LC-ESI-QTOF", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): np.nan, + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): np.nan, + }, + "formula": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): "C10H13N5O3", + ("mona", "KO002730"): "C10H13N5O3", + ("mona", "KO002729"): "C10H13N5O3", + ("mona", "KO008947"): "C10H13N5O3", + ("mona", "KO002727"): "C10H13N5O3", + ("mona", "KO002728"): "C10H13N5O3", + ("mona", "KO002726"): "C10H13N5O3", + ("mona", "PR100081"): "C10H13N5O3", + ("mona", "PR100080"): "C10H13N5O3", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): "C10H13N5O3", + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): "C10H13N5O3", + }, + "exact_mass": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): 251.101839276, + ("mona", "KO002730"): 251.101839276, + ("mona", "KO002729"): 251.101839276, + ("mona", "KO008947"): 251.101839276, + ("mona", "KO002727"): 251.101839276, + ("mona", "KO002728"): 251.101839276, + ("mona", "KO002726"): 251.101839276, + ("mona", "PR100081"): 251.101839276, + ("mona", "PR100080"): 251.101839276, + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): 251.101839276, + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): 251.101839276, + }, + "inchi_key": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "KO002730"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "KO002729"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "KO008947"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "KO002727"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "KO002728"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "KO002726"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "PR100081"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("mona", "PR100080"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + }, + "inchi": { + ( + "metatlas", + "c7dddd297e104ca79caea72a90150532", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "KO002730", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "KO002729", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "KO008947", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "KO002727", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "KO002728", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "KO002726", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "PR100081", + ): 
"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "mona", + "PR100080", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "metatlas", + "e0025042a1a844d6b6926252edce91e5", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ( + "metatlas", + "0568278b45d244fcb5787792fc17b3ec", + ): "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + }, + "smiles": { + ("metatlas", "c7dddd297e104ca79caea72a90150532"): np.nan, + ( + "mona", + "KO002730", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ( + "mona", + "KO002729", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ( + "mona", + "KO008947", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ( + "mona", + "KO002727", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ( + "mona", + "KO002728", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ( + "mona", + "KO002726", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ( + "mona", + "PR100081", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ( + "mona", + "PR100080", + ): "[H]OC([H])([H])C1([H])OC([H])(N2C([H])=NC=3C(=NC([H])=NC32)N([H])[H])C([H])([H])C1([H])O[H]", + ("metatlas", "e0025042a1a844d6b6926252edce91e5"): np.nan, + ("metatlas", "0568278b45d244fcb5787792fc17b3ec"): np.nan, + }, + } + ) + .rename_axis(index=["database", "id"]) + .iloc[0:1] + ) diff --git a/tests/unit/test_dill2plot.py b/tests/unit/test_dill2plot.py index 6aee1a09..3989ff6b 100644 --- a/tests/unit/test_dill2plot.py +++ b/tests/unit/test_dill2plot.py @@ -98,3 +98,10 @@ def test_strong_signal_compound_idxs(metatlas_dataset): assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 1, 2.36e6) == [] assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 73, 1e4) == [0] assert dill2plots.strong_signal_compound_idxs(metatlas_dataset, 74, 1e4) == [] + + +def test_get_msms_hits01(metatlas_dataset, msms_refs, mocker): + mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs) + hits = dill2plots.get_msms_hits(metatlas_dataset) + expected = """{"score":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":0.0,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":0.0},"num_matches":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":1,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 
2.3452186584)":1},"msv_query_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1087036133,null,252.1572875977],[null,null,null,null,null,null,null,null,null,null,null,null,null,93112.0859375,null,7624.11328125]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1090698242,null,252.1557617188],[null,null,null,null,null,null,null,null,null,null,null,null,null,76976.7265625,null,6090.6440429688]]},"msv_ref_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]]},"name":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"2\'-deoxyadenosine","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"2\'-deoxyadenosine"},"adduct":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"[M+H]+","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"[M+H]+"},"inchi_key":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.1091393,"(\'metatlas\', 
\'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1091393},"measured_precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.10887146,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1089477539},"measured_precursor_intensity":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":3046732.75}}"""
+    assert expected == hits.to_json()

From 936858e060fb06041882da04b7ddb23190c1b371 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Fri, 2 Jul 2021 18:01:12 -0700
Subject: [PATCH 039/177] force hit generation if RT min/max change

Before generating outputs, msms hits must be regenerated if RT min/max
changed.
---
 metatlas/datastructures/metatlas_dataset.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index b3cab3b3..721abc2f 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -191,7 +191,8 @@ def __init__(
         self._data = None
         self._data_valid = False
         self._hits = None
-        self._hits_valid = False
+        self._hits_valid = False  # based on all hits dependencies except RT min/max values
+        self._hits_valid_for_rt_bounds = False  # based only on RT min/max changes
         self._groups = None
         self._groups_valid = False
         self._groups_controlled_vocab = [] if groups_controlled_vocab is None else groups_controlled_vocab
@@ -568,6 +569,7 @@ def hits(self):
         )
         logger.info("Generated %d hits in %s.", len(self._hits), _duration_since(start_time))
         self._hits_valid = True
+        self._hits_valid_for_rt_bounds = True
         return self._hits

     def __len__(self):
@@ -606,6 +608,8 @@ def set_rt(self, compound_idx, which, time):
             setattr(sample[compound_idx]["identification"].rt_references[0], which, time)
         self.atlas_df.loc[compound_idx, which] = time
         metob.store(atlas_rt_ref)
+        if which in ["rt_min", "rt_max"]:
+            self._hits_valid_for_rt_bounds = False

     def set_note(self, compound_idx, which, value):
         """
@@ -801,6 +805,8 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False):
             msms_fragment_ions: if True, generate msms fragment ions report
             overwrite: if False, throw error if any output files already exist
         """
+        if not self._hits_valid_for_rt_bounds:
+            self._hits_valid = False  # force hits to be regenerated
        self.extra_time = 0.5
         logger.info("extra_time set to 0.5 minutes for output generation.")
         targeted_output.write_atlas_to_spreadsheet(self, overwrite)
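The two validity flags above implement a dirty-flag cache: editing an RT bound is cheap and only marks the hits as stale, while the expensive MSMS re-scoring is deferred until outputs are actually generated. A minimal sketch of the pattern, assuming the rest of the MetatlasDataset machinery and using a hypothetical _compute_hits() in place of the real hit generation:

class HitsCache:
    """Sketch of MetatlasDataset's two-flag hit invalidation; not the real class."""

    def __init__(self):
        self._hits = None
        self._hits_valid = False  # all hits dependencies except RT min/max
        self._hits_valid_for_rt_bounds = False  # RT min/max edits only

    def set_rt(self, which, time):
        # cheap edit: record that RT bounds moved, but do not re-score yet
        if which in ("rt_min", "rt_max"):
            self._hits_valid_for_rt_bounds = False

    @property
    def hits(self):
        if not self._hits_valid:
            self._hits = self._compute_hits()  # the expensive step
            self._hits_valid = True
            self._hits_valid_for_rt_bounds = True
        return self._hits

    def generate_all_outputs(self):
        # outputs must reflect the final RT bounds, so force regeneration here
        if not self._hits_valid_for_rt_bounds:
            self._hits_valid = False
        return self.hits

    def _compute_hits(self):
        return []  # placeholder for dp.get_msms_hits(...)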
From eda072bed78de877068b07957443b56e27544dfb Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 8 Jul 2021 09:10:08 -0700
Subject: [PATCH 040/177] Update system test image name

---
 docker/local_jupyter.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/local_jupyter.sh b/docker/local_jupyter.sh
index 4dc0dda6..eda62d18 100755
--- a/docker/local_jupyter.sh
+++ b/docker/local_jupyter.sh
@@ -5,7 +5,7 @@ set -euf -o pipefail
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 REPO_DIR=$(dirname "$SCRIPT_DIR")
 OUT_DIR="${SCRIPT_DIR}/out"
-IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.2.0'
+IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.3.0'
 PORT=8888

 while [[ "$#" -gt 0 ]]; do

From 1e65c8f04ee8a46baa6f26cec5c3e6358d8f639b Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 8 Jul 2021 09:11:36 -0700
Subject: [PATCH 041/177] Add markdown block about skipping GUI step

---
 notebooks/reference/Targeted.ipynb | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb
index 561009f5..087049bb 100644
--- a/notebooks/reference/Targeted.ipynb
+++ b/notebooks/reference/Targeted.ipynb
@@ -174,6 +174,14 @@
     "    metatlas_dataset.filter_compounds_by_signal(num_points=num_points, peak_height=peak_height)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Annotation GUI\n",
+    "If you are re-running this notebook and do not need to make additional changes to RT min/max bounds, then you can skip running the next code cell. Skipping will save you from calculating MSMS hits twice."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -206,7 +214,7 @@
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
-  "display_name": "Python 3",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -220,7 +228,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.7.4"
+  "version": "3.8.11"
  }
 },
 "nbformat": 4,

From c7bee2768910fa3a9fb156d91f41f61fa2a48a12 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 8 Jul 2021 09:39:00 -0700
Subject: [PATCH 042/177] WIP - Add RT_Prediction notebook

---
 metatlas/tools/predict_rt.py            |  14 +++
 notebooks/reference/RT_Prediction.ipynb | 153 ++++++++++++++++++++++++
 noxfile.py                              |   6 +-
 3 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/reference/RT_Prediction.ipynb

diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index 44af1fa6..e88c80ce 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -403,3 +403,17 @@ def create_adjusted_atlases(linear, poly, qc_dir, atlas_indices=None, free_text=
         mz_tolerance=12,
     )
     print(prd_atlas_name + " Created!")
+
+
+def get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_number=0, polarity="positive"):
+    """
+    Simplified interface for generating an AnalysisIdentifiers instance for use in RT prediction
+    inputs:
+        experiment: name of experiment as given in LCMS run names
+        project_directory: directory where per-experiment output directory will be created
+        analysis_number: integer, defaults to 0, increment if redoing analysis
+        polarity: defaults to 'positive', set to 'negative' if you only have neg mode data
+    Returns an AnalysisIdentifiers instance
+    """
+    ids = mads.AnalysisIdentifiers(None, experiment, "ISTDsEtc", polarity, analysis_number, project_directory)
+    return ids

diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb
new file mode 100644
index 00000000..0952c263
--- /dev/null
+++ b/notebooks/reference/RT_Prediction.ipynb
@@ -0,0 +1,153 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": 
[ + "# Parameters\n", + "\n", + "The next code block sets parameters that are used throughout the remainder of the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# pylint: disable=invalid-name,missing-module-docstring\n", + "\n", + "# an integer, increment if you need to redo your analysis\n", + "# will be appended to your username to create analysis_id\n", + "analysis_number = 0\n", + "\n", + "# experiment ID that must match the parent folder containing the LCMS output files\n", + "# An example experiment ID is '20201116_JGI-AK_LH_506489_SoilWarm_final_QE-HF_HILICZ_USHXG01530'\n", + "experiment = \"REPLACE ME\"\n", + "\n", + "# group will only be used in RT prediction if their name has a substring match to this list of strings\n", + "include_groups = [\"S1\"]\n", + "\n", + "# Exclude files with names containing any of the substrings in this list. Eg., ['peas', 'beans']\n", + "exclude_files = []\n", + "\n", + "# list of substrings that will group together when creating groups\n", + "# this provides additional grouping beyond the default grouping on field #12\n", + "groups_controlled_vocab = [\"QC\", \"InjBl\", \"ISTD\"]\n", + "\n", + "# The rest of this block contains project independent parameters\n", + "\n", + "# Full path to the directory where you have cloned the metatlas git repo.\n", + "# If you ran the 'git clone ...' command in your home directory on Cori,\n", + "# then you'll want '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas'\n", + "# where the uppercase letters are replaced based on your NERSC username.\n", + "metatlas_repo_path = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas\"\n", + "\n", + "# Full path to the directory where you want this notebook to store data.\n", + "# A subdirectory will be auto created within this directory for each project.\n", + "# You can place this anywhere on cori's filesystem, but placing it within your\n", + "# global home directory is recommended so that you do not need to worry about\n", + "# your data being purged. 
Each project will take on the order of 100 MB.\n", + "project_directory = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects\"\n", + "\n", + "# maximum number of CPUs to use\n", + "# when running on jupyter.nersc.gov, you are not allowed to set this above 4\n", + "max_cpus = 4\n", + "\n", + "# Threshold for how much status information metatlas functions print in the notebook\n", + "# levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'\n", + "log_level = \"INFO\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pylint: disable=wrong-import-position,import-error\n", + "import logging # noqa: E402\n", + "import os # noqa: E402\n", + "import sys # noqa: E402\n", + "\n", + "sys.path.insert(0, metatlas_repo_path)\n", + "logger = logging.getLogger(\"metatlas.jupyter\")\n", + "logger.debug(\"sys.executable=%s\", sys.executable)\n", + "logger.debug(\"sys.path=%s.\", sys.path)\n", + "logger.debug(\"metatlas_repo_path=%s.\", metatlas_repo_path)\n", + "if not os.path.exists(metatlas_repo_path):\n", + " logging.critical(\n", + " \"Directory set for metatlas_repo_path parameter (%s) does not exist or is not accessible.\",\n", + " metatlas_repo_path,\n", + " )\n", + " raise ValueError(\"Invalid metatlas_repo_path parameter in Jupyter Notebook.\")\n", + "try:\n", + " from metatlas.tools import notebook # noqa: E402\n", + " from metatlas.tools import predict_rt # noqa: E402\n", + "except ModuleNotFoundError as err:\n", + " logging.critical(\n", + " (\n", + " \"Could not find metatlas module at %s. \"\n", + " \"In the Parameters block, please check the value of metatlas_repo_path.\"\n", + " ),\n", + " metatlas_repo_path,\n", + " )\n", + " raise ModuleNotFoundError from err\n", + "notebook.setup(log_level)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ids = predict_rt.get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_number)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predict_rt.generate_rt_correction_models(\n", + " ids, groups_controlled_vocab, exclude_files, include_groups, max_cpus\n", + ")" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/noxfile.py b/noxfile.py index 537406a9..6a934218 100644 --- a/noxfile.py +++ b/noxfile.py @@ -34,6 +34,7 @@ # has not yet been updated to pass all checks. 
 notebooks = [
     "notebooks/reference/Targeted.ipynb",
+    "notebooks/reference/RT_Prediction.ipynb",
 ]

 pytest_deps = [
@@ -122,7 +123,10 @@ def pylint(session):
 @nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV)
 def pylint_nb(session):
     session.install("-r", "docker/requirements.txt", *nbqa_deps, *pylint_deps)
-    session.run("nbqa", "pylint", *notebooks)
+    # duplicate code cannot be disabled on a per-cell level https://github.com/PyCQA/pylint/issues/214
+    # Some duplicate code is required to set up the notebook and do error handling.
+    # So turn off duplicate code for the whole session -- not ideal.
+    session.run("nbqa", "pylint", "--disable=duplicate-code", *notebooks)


 @nox.session(python=py_versions[0])

From be171c1da2b33e55823941a571b6e3d039708140 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 8 Jul 2021 09:52:53 -0700
Subject: [PATCH 043/177] Increased debug logging on notebook startup

---
 metatlas/tools/notebook.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py
index 8d44da7f..5546c9e7 100644
--- a/metatlas/tools/notebook.py
+++ b/metatlas/tools/notebook.py
@@ -21,6 +21,8 @@ def configure_environment(log_level):
         log_level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
     """
     activate_logging(console_level=log_level)
+    logger.debug('Running import and environment setup block of notebook.')
+    logger.debug('Configuring notebook environment with console log level of %s.', log_level)
     os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
@@ -46,10 +48,12 @@ def validate_kernel():
             'Could not find dataset module. Please check that the kernel is set to "Metatlas Targeted".'
         )
         raise ModuleNotFoundError from module_error
+    logger.debug('Kernel validation passed. Using python from %s.', sys.executable)


 def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100):
     """Set pandas display options"""
+    logger.debug('Setting pandas display options')
     pd.set_option("display.max_rows", max_rows)
     pd.set_option("display.max_columns", max_columns)
     pd.set_option("display.max_colwidth", max_colwidth)
@@ -80,4 +84,5 @@ def activate_sql_logging(console_level="INFO", console_format=None, file_level="
         filename: logging destination
     """
+    logger.debug('Activating SQL logging with console_level=%s and file_level=%s.', console_level, file_level)
     activate_module_logging("sqlalchemy.engine", console_level, console_format, file_level, filename)

From dacdf46213ed7552e8dbc5b63918a7b95a7315ca Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 8 Jul 2021 11:04:44 -0700
Subject: [PATCH 044/177] fix logging without logger

---
 metatlas/datastructures/metatlas_dataset.py | 16 ++++++++--------
 metatlas/datastructures/object_helpers.py   | 20 ++++++++++----------
 metatlas/plots/dill2plots.py                |  6 +++---
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 721abc2f..c7ed7238 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -61,7 +61,7 @@ def __init__(

     def validate(self):
         """Valid class inputs"""
-        logging.debug("Validating inputs to AnalysisIdentifiers")
+        logger.debug("Validating inputs to AnalysisIdentifiers")
         if self._source_atlas is not None:
             get_atlas(self.source_atlas, self.username)  # will raise error if not found or matches multiple
         if len(self.experiment.split("_")) != 9:
@@ -74,7 +74,7 @@
             raise TypeError("Parameter analysis_number is not an integer.")
         if self.analysis_number < 0:
             raise ValueError("Parameter analysis_number cannot be negative.")
-        logging.debug("Inputs to AnalysisIdentifiers passed validation.")
+        logger.debug("Inputs to AnalysisIdentifiers passed validation.")

     @property
     def source_atlas(self):
@@ -180,7 +180,7 @@ def __init__(
             exclude_files: array of strings that will exclude files if they are substrings of the filename
             save_metadata: if True, write metadata files containing data sources and LCMS runs short name
         """
-        logging.debug("Creating new MetatlasDataset instance...")
+        logger.debug("Creating new MetatlasDataset instance...")
         self.ids = ids
         self._atlas = None
         self._atlas_valid = False
@@ -206,7 +206,7 @@ def __init__(
         if ids.source_atlas is not None:
             self._get_atlas()
         if save_metadata:
-            logging.debug("Writing MetatlasDataset metadata files")
+            logger.debug("Writing MetatlasDataset metadata files")
             self.write_data_source_files()
             self.write_lcmsruns_short_names()
             self.store_groups(exist_ok=True)
@@ -265,9 +265,9 @@ def _get_atlas(self):
             logger.exception(err)
             raise err
         else:
-            logging.info("Retriving source atlas: %s", self.ids.source_atlas)
+            logger.info("Retrieving source atlas: %s", self.ids.source_atlas)
             source = get_atlas(self.ids.source_atlas, self.ids.username)
-            logging.info("Cloning source atlas")
+            logger.info("Cloning source atlas")
             self._atlas = source.clone()
             self._atlas.name = self.ids.atlas
             self._atlas_valid = True
@@ -390,7 +390,7 @@ def filter_compounds_by_signal(self, num_points, peak_height, name=None):
                 in order for the compound to remain in the atlas
             name: the name for the new atlas, defaults to current name
         """
-        logger.debug("Filtering atlas on num_points=%d, peak_height=%d.")
+        logger.debug("Filtering atlas on num_points=%d, peak_height=%d.", num_points, peak_height)
         name = self.atlas.name if name is None else name
         keep_idxs = dp.strong_signal_compound_idxs(self, num_points, peak_height)
         self.filter_compounds(keep_idxs=keep_idxs, name=name)
@@ -768,7 +768,7 @@ def store_groups(self, exist_ok=False):
             except ValueError as err:
                 logger.exception(err)
                 raise err
-        logging.debug("Storing %d groups in the database", len(self.groups))
+        logger.debug("Storing %d groups in the database", len(self.groups))
         metob.store(self.groups)

     def compound_idxs_not_evaluated(self):
diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py
index 9f8b4ff4..04b43fe6 100644
--- a/metatlas/datastructures/object_helpers.py
+++ b/metatlas/datastructures/object_helpers.py
@@ -147,7 +147,7 @@ def __init__(self):
         self.path = f"sqlite:///{filename}"
         if os.path.exists(filename):
             os.chmod(filename, 0o775)
-        logging.debug('Using database at: %s', self.path)
+        logger.debug('Using database at: %s', self.path)

         self.tablename_lut = dict()
         self.subclass_lut = dict()
@@ -198,11 +198,11 @@ def convert_to_double(self, table, entry):
         except Exception as e:
             self.db.rollback()
             print(e)
-            logging.error('Transaction rollback within convert_to_double()')
+            logger.error('Transaction rollback within convert_to_double()')

     def save_objects(self, objects, _override=False):
         """Save objects to the database"""
-        logging.debug('Entering Workspace.save_objects')
+        logger.debug('Entering Workspace.save_objects')
         if not isinstance(objects, (list, set)):
             objects = [objects]
         self._seen = dict()
@@ -211,7 +211,7 @@ def save_objects(self, objects, _override=False):
         self._inserts = defaultdict(list)
         for obj in objects:
             self._get_save_data(obj, _override)
-        logging.debug('Workspace._inserts=%s', self._inserts)
+        logger.debug('Workspace._inserts=%s', self._inserts)
         self.get_connection()
         self.db.begin()
         try:
@@ -237,11 +237,11 @@
                 if 'sqlite' not in self.path:
                     self.fix_table(table_name)
                 self.db[table_name].insert_many(inserts)
-                logging.debug('inserting %s', inserts)
+                logger.debug('inserting %s', inserts)
             self.db.commit()
         except Exception:
             self.db.rollback()
-            logging.error('Transaction rollback within save_objects()')
+            logger.error('Transaction rollback within save_objects()')

     def create_link_tables(self, klass):
         """
@@ -264,7 +264,7 @@ def create_link_tables(self, klass):
             self.db.commit()
         except Exception:
             self.db.rollback()
-            logging.error('Transaction rollback within create_link_tables()')
+            logger.error('Transaction rollback within create_link_tables()')

     def _get_save_data(self, obj, override=False):
         """Get the data that will be used to save an object to the database"""
@@ -418,7 +418,7 @@ def retrieve(self, object_type, **kwargs):
             self.db.commit()
         except Exception:
             self.db.rollback()
-            logging.error('Transaction rollback within retrieve()')
+            logger.error('Transaction rollback within retrieve()')
         return items

     def remove(self, object_type, **kwargs):
@@ -487,7 +487,7 @@ def remove(self, object_type, **kwargs):
             self.db.commit()
         except Exception:
             self.db.rollback()
-            logging.error('Transaction rollback within retrieve()')
+            logger.error('Transaction rollback within remove()')

     def remove_objects(self, objects, all_versions=True, **kwargs):
         """Remove a list of objects from the database."""
@@ -532,7 +532,7 @@ def remove_objects(self, objects, all_versions=True, **kwargs):
             self.db.commit()
         except Exception:
             self.db.rollback()
-            logging.error('Transaction rollback within remove_objects()')
+            logger.error('Transaction rollback within remove_objects()')


 def format_timestamp(tstamp):
diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py
index 902ed702..099cc196 100644
--- a/metatlas/plots/dill2plots.py
+++ b/metatlas/plots/dill2plots.py
@@ -797,7 +797,7 @@ def __init__(self,
         # warn the user if they do not own the atlas; and can not edit its values
         self.enable_edit = True
         self.atlas = metob.retrieve('Atlas',unique_id = self.data[0][0]['atlas_unique_id'],username='*')[-1]
-        logging.info("loaded file for username = %s", self.atlas.username)
+        logger.info("loaded file for username = %s", self.atlas.username)
         if getpass.getuser() != self.atlas.username:
             self.ax.set_title("YOUR ARE %s YOU ARE NOT ALLOWED TO EDIT VALUES THE RT CORRECTOR. USERNAMES ARE NOT THE SAME"%getpass.getuser())
             self.enable_edit = False
@@ -3058,7 +3058,7 @@ def make_atlas_from_spreadsheet(filename, atlas_name, filetype, sheetname=None,
         specify polarity as 'positive' or 'negative'

     '''
-    logging.debug('Generating atlas named %s from %s source.', atlas_name, filetype)
+    logger.debug('Generating atlas named %s from %s source.', atlas_name, filetype)
     atlas_df = _get_dataframe(filename, filetype, sheetname)
     _clean_dataframe(atlas_df, required_columns=['inchi_key', 'label'])
     _add_columns(atlas_df, column_names=['adduct'], default_values=[np.NaN])
@@ -3066,7 +3066,7 @@ def make_atlas_from_spreadsheet(filename, atlas_name, filetype, sheetname=None,
     check_filenames(atlas_df, 'file_msms')
     atlas = get_atlas(atlas_name, atlas_df, polarity, mz_tolerance)
     if store:
-        logging.debug('Saving atlas named %s to DB.', atlas_name)
+        logger.debug('Saving atlas named %s to DB.', atlas_name)
         metob.store(atlas)
     return atlas
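The patch above replaces calls on the root logging module with the module-level logger. A named logger tags each message with its module and respects per-module level configuration, which bare logging.debug()/logging.info() calls bypass by going straight to the root logger. A minimal sketch of the pattern, with a hypothetical module name:

import logging

# one named logger per module; normally created as logging.getLogger(__name__)
logger = logging.getLogger("metatlas.example_module")


def do_work():
    # routed through the "metatlas" logger hierarchy, so handlers and
    # levels configured for metatlas modules apply to this message
    logger.debug("doing work")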
""" - Workspace.get_instance().save_objects(objects, **kwargs) + workspace = Workspace.get_instance() + workspace.save_objects(objects, **kwargs) + workspace.close_connection() @set_docstring diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py index 04b43fe6..6515cadf 100644 --- a/metatlas/datastructures/object_helpers.py +++ b/metatlas/datastructures/object_helpers.py @@ -164,6 +164,7 @@ def __init__(self): # handle circular references self.seen = dict() Workspace.instance = self + self.db = None @classmethod def get_instance(cls): @@ -177,16 +178,17 @@ def get_connection(self): Each activity that queries the database needs to have this function preceeding it. """ try: - if self.db.engine.name == 'mysql': - self.db.query('show tables') - else: - self.db.query('SELECT name FROM sqlite_master WHERE type = "table"') + self.db.begin() + self.db.query('SELECT 1') + self.db.commit() except Exception: self.db = dataset.connect(self.path) + assert self.db is not None def close_connection(self): - self.db.close() - self.db = None + if self.db is not None: + self.db.close() + self.db = None def convert_to_double(self, table, entry): """Convert a table column to double type.""" @@ -364,7 +366,7 @@ def retrieve(self, object_type, **kwargs): query = 'select * from `%s` where (' % object_type clauses = [] for (key, value) in kwargs.items(): - if type(value) is list and len(value)>0: + if isinstance(value, list) and len(value) > 0: clauses.append('%s in ("%s")' % (key, '", "'.join(value))) elif not isinstance(value, six.string_types): clauses.append("%s = %s" % (key, value)) @@ -379,13 +381,12 @@ def retrieve(self, object_type, **kwargs): query = query.replace(' where ()', '') try: items = list(self.db.query(query)) - except Exception as e: - if 'Unknown column' in str(e): + except Exception as err: + if 'Unknown column' in str(err): keys = [k for k in klass.class_traits().keys() if not k.startswith('_')] - raise ValueError('Invalid column name, valid columns: %s' % keys) - else: - raise(e) + raise ValueError('Invalid column name, valid columns: %s' % keys) from err + raise err items = [klass(**i) for i in items] uids = [i.unique_id for i in items] if not items: @@ -416,8 +417,10 @@ def retrieve(self, object_type, **kwargs): i._changed = False items.sort(key=lambda x: x.last_modified) self.db.commit() - except Exception: + except Exception as err: + logger.exception(err) self.db.rollback() + raise err logger.error('Transaction rollback within retrieve()') return items diff --git a/metatlas/datastructures/tests/test_metatlas_objects.py b/metatlas/datastructures/tests/test_metatlas_objects.py index 943d5f64..c7388997 100644 --- a/metatlas/datastructures/tests/test_metatlas_objects.py +++ b/metatlas/datastructures/tests/test_metatlas_objects.py @@ -1,173 +1,4 @@ - -from __future__ import absolute_import -from metatlas import metatlas_objects as mo -from metatlas.mzml_loader import get_test_data -import getpass -import dill - - -def test_simple(): - test = mo.Group() - uid = test.unique_id - mo.store(test) - assert test.unique_id == uid - assert test.prev_uid != '' - test.name = 'hello' - mo.store(test) - assert test.unique_id == uid - assert test.prev_uid != '' - - -def test_nested(): - test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - assert len(test.items) == 2 - test.items[1].name = 'hello' - orig_sub_version = test.items[1].unique_id - assert len(test.items) == 2 - mo.store(test) - assert test.items[1].unique_id == orig_sub_version - 
- -def test_recover(): - test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - test.name = 'howdy' - top_version = test.unique_id - sub_version = test.items[1].unique_id - - mo.store(test) - mo.store(test) # should have no effect - assert len(test.items) == 2 - assert test.unique_id == top_version - - # make sure we can recover the previous version - test.items = [] - assert test.unique_id == top_version - test = mo.retrieve('group', unique_id=top_version)[0] - assert test.unique_id == top_version - assert len(test.items) == 2, len(test.items) - assert test.unique_id == top_version - assert test.items[1].unique_id == sub_version - - -def test_unique_links(): - test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - sub_version = test.items[1].unique_id - test.items = [test.items[1]] - mo.store(test) - - test.items = [] - test = mo.retrieve('group', unique_id=test.unique_id)[0] - assert len(test.items) == 1, len(test.items) - assert test.items[0].unique_id == sub_version - - -def test_circular_reference(): - test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - orig_id = test.unique_id - test.items[0].items.append(test) - mo.store(test) - test.items = [] - test = mo.retrieve('group', unique_id=test.unique_id)[0] - sub0 = test.items[0] - assert len(sub0.items) == 2, sub0.items - assert sub0.items[1].unique_id == orig_id - assert test.unique_id == orig_id - - -def test_simple_query(): - test1 = mo.LcmsRun(name='First') - first_version = test1.unique_id - test1.description = "Hey there" - mo.store(test1) - assert test1.unique_id == first_version - items = mo.retrieve('lcmsrun', name='First') - assert items[-1].unique_id == test1.unique_id - assert all([i.unique_id != first_version for i in items[:-1]]) - - -def test_glob_query(): - test1 = mo.LcmsRun(name='First') - test2 = mo.LcmsRun(name='Second') - test3 = mo.LcmsRun(name='Third') - mo.store([test1, test2, test3]) - items = mo.retrieve('lcmsrun', name='Fir%') - assert items[-1].unique_id == test1.unique_id - items = mo.retrieve('lcmsrun', name='%econd') - assert items[-1].unique_id == test2.unique_id - items = mo.retrieve('LcmsRuns', name='T%ir%') - assert items[-1].unique_id == test3.unique_id - - -def test_escape_glob(): - test1 = mo.LcmsRun(description='Flow %') - mo.store(test1) - items = mo.retrieve('lcmsrun', description='Flow %%') - assert items[-1].unique_id == test1.unique_id - - -def test_load_lcms_files(): - paths = list(get_test_data().values()) - runs = mo.load_lcms_files(paths) - for run in runs: - assert run.mzml_file - assert run.hdf5_file - assert run.creation_time - assert run.description - assert run.name - assert run.last_modified - assert run.username - assert run.unique_id - assert mo.retrieve('lcmsrun', unique_id=run.unique_id) - - -def test_id_grade_trait(): - e = mo.IdentificationGrade(name='E') - mo.store(e) - cid = mo.CompoundIdentification(identification_grade='e') - assert cid.identification_grade.unique_id == e.unique_id - - -def test_list_item_changed(): - g = mo.Group() - g.items = [] - g._changed = False - g.items.append(mo.Group()) - assert g._changed - - -def test_preserve_provenance(): - test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - test2 = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - mo.store([test, test2]) - assert len(test.items) == 2 - test.items = [] - test2.items = [] - mo.store([test, test2]) - assert len(test.items) == 0 - previous = mo.retrieve('group', unique_id=test.prev_uid)[0] - assert 
len(previous.items) == 2, repr(previous) - - -def test_clone(): - test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - test2 = test.clone() - assert test2.unique_id != test.unique_id - assert test2.prev_uid == test.unique_id - assert test2.items[0].unique_id == test.items[0].unique_id - - test3 = test.clone(True) - assert test3.unique_id != test.unique_id - assert test3.prev_uid == test.unique_id - assert test3.items[0].unique_id != test.items[0].unique_id - assert test3.items[0].prev_uid == test.items[0].unique_id - - -def test_store_stubs(): - test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) - mo.store(test) - test = mo.retrieve('group', unique_id=test.unique_id)[0] - assert isinstance(test.items[0], mo.Group) - mo.store(test) +from metatlas.datastructures import metatlas_objects as mo def test_get_latest(): @@ -180,89 +11,6 @@ def test_get_latest(): assert test[0].name == 'goodbye' -def test_user_preserve(): - run = mo.LcmsRun(username='foo') - test = mo.Reference(name='hello', username='foo', lcms_run=run) - orig_id = test.unique_id - mo.store(test, _override=True) - assert test.unique_id == orig_id - mo.store(test) - assert test.unique_id != orig_id - items = mo.retrieve('references', username='*', name='hello') - username = getpass.getuser() - assert items[-2].username == 'foo' - assert items[-1].username == username - assert items[-2].lcms_run.username == 'foo' - assert items[-1].lcms_run.username == 'foo' - run.name = 'hello' - mo.store(test) - items = mo.retrieve('references', username='*', - creation_time=test.creation_time) - return - assert items[0].lcms_run.username == 'foo' - assert items[1].lcms_run.username == username - - -def test_store_all(): - items = [] - for klass in mo.workspace.subclass_lut.values(): - items.append(klass()) - mo.store(items) - for klass in mo.workspace.subclass_lut.values(): - name = klass.__name__ - assert len(mo.retrieve(name)) - - -def test_stub_instance(): - run = mo.LcmsRun(username='foo') - test = mo.Reference(name='hello', lcms_run=run) - mo.store(test, _override=True) - item = mo.retrieve('reference', name='hello')[0] - assert isinstance(item.lcms_run, mo.LcmsRun) - - -def test_floating_point(): - compound = mo.Compound(name='foo', mono_isotopic_molecular_weight=1.0) - mo.store(compound) - compound.mono_isotopic_molecular_weight = 1.000007 - mo.store(compound) - test = mo.retrieve('compound', name='foo')[-1] - assert test.mono_isotopic_molecular_weight == 1.000007, test.mono_isotopic_molecular_weight - - -def test_remove(): - group = mo.Group(name='foo', items=[mo.Group(name='baz', description='hello')]) - sub_id = group.items[0].unique_id - mo.store(group) - db = mo.retrieve('groups', unique_id=sub_id)[0] - assert db.unique_id == sub_id - mo.remove('groups', name='foo', _override=True) - test = mo.retrieve('groups', name='foo') - assert not test - test_sub = mo.retrieve('groups_items', target_id=sub_id) - assert not test_sub - - -def test_remove_objects(): - group = mo.Group(name='foo', items=[mo.Group(name='baz', description='hello')]) - sub_id = group.items[0].unique_id - mo.store(group) - db = mo.retrieve('groups', unique_id=sub_id)[0] - assert db.unique_id == sub_id - mo.remove_objects(group, _override=True) - test = mo.retrieve('groups', name='foo') - assert not test - test_sub = mo.retrieve('groups_items', target_id=sub_id) - assert not test_sub - - -def test_dill(): - test = mo.Group(items=[mo.Group(description='hello')]) - blob = dill.dumps(test) - new = dill.loads(blob) - assert 
new.items[0].description == 'hello'
-
-
 def test_retrieve_head():
     test = mo.LcmsRun(name='foo')
     mo.store(test)
diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py
index 5546c9e7..810d6aa4 100644
--- a/metatlas/tools/notebook.py
+++ b/metatlas/tools/notebook.py
@@ -21,8 +21,8 @@ def configure_environment(log_level):
         log_level: one of 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
     """
     activate_logging(console_level=log_level)
-    logger.debug('Running import and environment setup block of notebook.')
-    logger.debug('Configuring notebook environment with console log level of %s.', log_level)
+    logger.debug("Running import and environment setup block of notebook.")
+    logger.debug("Configuring notebook environment with console log level of %s.", log_level)
     os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
@@ -48,12 +48,12 @@ def validate_kernel():
             'Could not find dataset module. Please check that the kernel is set to "Metatlas Targeted".'
         )
         raise ModuleNotFoundError from module_error
-    logger.debug('Kernel validation passed. Using python from %s.', sys.executable)
+    logger.debug("Kernel validation passed. Using python from %s.", sys.executable)


 def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100):
     """Set pandas display options"""
-    logger.debug('Setting pandas display options')
+    logger.debug("Setting pandas display options")
     pd.set_option("display.max_rows", max_rows)
     pd.set_option("display.max_columns", max_columns)
     pd.set_option("display.max_colwidth", max_colwidth)
@@ -84,5 +84,5 @@ def activate_sql_logging(console_level="INFO", console_format=None, file_level="
         filename: logging destination
     """
-    logger.debug('Activating SQL logging with console_level=%s and file_level=%s.', console_level, file_level)
+    logger.debug("Activating SQL logging with console_level=%s and file_level=%s.", console_level, file_level)
     activate_module_logging("sqlalchemy.engine", console_level, console_format, file_level, filename)
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index d06b668b..c979e30d 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -62,6 +62,7 @@ def fixture_sqlite(username, change_test_dir, atlas):
     metob.store(metob.Compound())
     metob.store(metob.MzReference())
     metob.store(metob.RtReference())
+    metob.store(metob.Reference())
     metob.store(metob.LcmsRun())
     logger.debug("Done storing empty objects to create tables")
     yield
diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py
index 20fb1822..930e0a51 100644
--- a/tests/unit/test_metatlas_dataset.py
+++ b/tests/unit/test_metatlas_dataset.py
@@ -10,6 +10,7 @@

 from metatlas.datastructures import metatlas_dataset as mads
 from metatlas.datastructures import metatlas_objects as metob
+from metatlas.datastructures import object_helpers as metoh
 from metatlas.io import metatlas_get_data_helper_fun as ma_data


@@ -345,8 +346,10 @@ def test_store_atlas01(atlas, sqlite, username):


 def test_store_atlas02(metatlas_dataset, username):
-    atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username)
+    atlas_list = metob.retrieve("Atlas", name=metatlas_dataset.ids.source_atlas, username=username)
     assert len(atlas_list) == 1
+    second = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username)
+    assert len(second) == 1
     metatlas_dataset.store_atlas(even_if_exists=True)
     second = metob.retrieve("Atlas", name=metatlas_dataset.atlas.name, username=username)
     assert len(second) == 1
@@ -387,6 +390,15 @@ def 
test_store_atlas06(atlas, sqlite_with_atlas, username): assert len(second) == 1 +def test_store_atlas07(atlas, sqlite, username): + atlas.name = "test_store_atlas07" + metob.store(atlas) + metoh.Workspace.get_instance().close_connection() + metoh.Workspace.instance = None + atlases = metob.retrieve("Atlas", name=atlas.name, username=username) + assert len(atlases) == 1 + + def test_analysis_identifiers01(sqlite): with pytest.raises(ValueError, match=r"Database does not contain an atlas.*"): mads.AnalysisIdentifiers( diff --git a/tests/unit/test_metatlas_objects.py b/tests/unit/test_metatlas_objects.py new file mode 100644 index 00000000..927df141 --- /dev/null +++ b/tests/unit/test_metatlas_objects.py @@ -0,0 +1,234 @@ +"""Test of metatlas objects""" +# pylint: disable=missing-function-docstring,protected-access,unused-argument,too-many-arguments + +import getpass + +import dill + +from metatlas.datastructures import metatlas_objects as mo +from metatlas.datastructures import object_helpers as metoh + + +def test_clone01(): + test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + test2 = test.clone() + assert test2.unique_id != test.unique_id + assert test2.prev_uid == test.unique_id + assert test2.items[0].unique_id == test.items[0].unique_id + + test3 = test.clone(True) + assert test3.unique_id != test.unique_id + assert test3.prev_uid == test.unique_id + assert test3.items[0].unique_id != test.items[0].unique_id + assert test3.items[0].prev_uid == test.items[0].unique_id + + +def test_simple(sqlite): + test = mo.Group() + uid = test.unique_id + mo.store(test) + assert test.unique_id == uid + assert test.prev_uid != "" + test.name = "hello" + mo.store(test) + assert test.unique_id == uid + assert test.prev_uid != "" + + +def test_nested(sqlite): + test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + assert len(test.items) == 2 + test.items[1].name = "hello" + orig_sub_version = test.items[1].unique_id + assert len(test.items) == 2 + mo.store(test) + assert test.items[1].unique_id == orig_sub_version + + +def test_recover(sqlite): + test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + test.name = "howdy" + top_version = test.unique_id + sub_version = test.items[1].unique_id + + mo.store(test) + mo.store(test) # should have no effect + assert len(test.items) == 2 + assert test.unique_id == top_version + + # make sure we can recover the previous version + test.items = [] + assert test.unique_id == top_version + test = mo.retrieve("group", unique_id=top_version)[0] + assert test.unique_id == top_version + assert len(test.items) == 2, len(test.items) + assert test.unique_id == top_version + assert test.items[1].unique_id == sub_version + + +def test_unique_links(sqlite): + test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + sub_version = test.items[1].unique_id + test.items = [test.items[1]] + mo.store(test) + + test.items = [] + test = mo.retrieve("group", unique_id=test.unique_id)[0] + assert len(test.items) == 1, len(test.items) + assert test.items[0].unique_id == sub_version + + +def test_circular_reference(sqlite): + test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + orig_id = test.unique_id + test.items[0].items.append(test) + mo.store(test) + test.items = [] + test = mo.retrieve("group", unique_id=test.unique_id)[0] + sub0 = test.items[0] + assert len(sub0.items) == 2, sub0.items + assert sub0.items[1].unique_id == orig_id + assert test.unique_id == orig_id + + +def 
test_simple_query(sqlite): + test1 = mo.LcmsRun(name="First") + first_version = test1.unique_id + test1.description = "Hey there" + mo.store(test1) + assert test1.unique_id == first_version + items = mo.retrieve("lcmsrun", name="First") + assert items[-1].unique_id == test1.unique_id + assert all((i.unique_id != first_version for i in items[:-1])) + + +def test_glob_query(sqlite): + test1 = mo.LcmsRun(name="First") + test2 = mo.LcmsRun(name="Second") + test3 = mo.LcmsRun(name="Third") + mo.store([test1, test2, test3]) + items = mo.retrieve("lcmsrun", name="Fir%") + assert items[-1].unique_id == test1.unique_id + items = mo.retrieve("lcmsrun", name="%econd") + assert items[-1].unique_id == test2.unique_id + items = mo.retrieve("LcmsRuns", name="T%ir%") + assert items[-1].unique_id == test3.unique_id + + +def test_escape_glob(sqlite): + test1 = mo.LcmsRun(description="Flow %") + mo.store(test1) + items = mo.retrieve("lcmsrun", description="Flow %%") + assert items[-1].unique_id == test1.unique_id + + +def test_id_grade_trait(sqlite): + id_grade = mo.IdentificationGrade(name="E") + mo.store(id_grade) + cid = mo.CompoundIdentification(identification_grade="e") + assert cid.identification_grade.unique_id == id_grade.unique_id + + +def test_list_item_changed(): + grp = mo.Group() + grp.items = [] + grp._changed = False + grp.items.append(mo.Group()) + assert grp._changed + + +def test_preserve_provenance(sqlite): + test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + test2 = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + mo.store([test, test2]) + assert len(test.items) == 2 + test.items = [] + test2.items = [] + mo.store([test, test2]) + assert len(test.items) == 0 + previous = mo.retrieve("group", unique_id=test.prev_uid)[0] + assert len(previous.items) == 2, repr(previous) + + +def test_store_stubs(sqlite): + test = mo.Group(items=[mo.Group(items=[mo.LcmsRun()]), mo.LcmsRun()]) + mo.store(test) + test = mo.retrieve("group", unique_id=test.unique_id)[0] + assert isinstance(test.items[0], mo.Group) + mo.store(test) + + +def test_user_preserve(sqlite): + run = mo.LcmsRun(username="foo") + test = mo.Reference(name="hello", username="foo", lcms_run=run) + orig_id = test.unique_id + mo.store(test, _override=True) + assert test.unique_id == orig_id + mo.store(test) + assert test.unique_id != orig_id + items = mo.retrieve("reference", username="*", name="hello") + username = getpass.getuser() + assert items[-2].username == "foo" + assert items[-1].username == username + assert items[-2].lcms_run.username == "foo" + assert items[-1].lcms_run.username == "foo" + + +def test_store_all(sqlite): + items = [] + for klass in metoh.Workspace.get_instance().subclass_lut.values(): + items.append(klass()) + mo.store(items) + for klass in metoh.Workspace.get_instance().subclass_lut.values(): + name = klass.__name__ + assert len(mo.retrieve(name)) > 0 + + +def test_stub_instance(sqlite): + run = mo.LcmsRun(username="foo") + test = mo.Reference(name="hello", lcms_run=run) + mo.store(test, _override=True) + item = mo.retrieve("reference", name="hello")[0] + assert isinstance(item.lcms_run, mo.LcmsRun) + + +def test_floating_point(sqlite): + compound = mo.Compound(name="foo", mono_isotopic_molecular_weight=1.0) + mo.store(compound) + compound.mono_isotopic_molecular_weight = 1.000007 + mo.store(compound) + test = mo.retrieve("compound", name="foo")[-1] + assert test.mono_isotopic_molecular_weight == 1.000007, test.mono_isotopic_molecular_weight + + +def test_remove(sqlite): + 
group = mo.Group(name="foo", items=[mo.Group(name="baz", description="hello")]) + sub_id = group.items[0].unique_id + mo.store(group) + first = mo.retrieve("groups", unique_id=sub_id)[0] + assert first.unique_id == sub_id + mo.remove("groups", name="foo", _override=True) + test = mo.retrieve("groups", name="foo") + assert not test + test_sub = mo.retrieve("groups_items", target_id=sub_id) + assert not test_sub + + +def test_remove_objects(sqlite): + group = mo.Group(name="foo", items=[mo.Group(name="baz", description="hello")]) + sub_id = group.items[0].unique_id + mo.store(group) + first = mo.retrieve("groups", unique_id=sub_id)[0] + assert first.unique_id == sub_id + mo.remove_objects(group, _override=True) + test = mo.retrieve("groups", name="foo") + assert not test + test_sub = mo.retrieve("groups_items", target_id=sub_id) + assert not test_sub + + +def test_dill(): + test = mo.Group(items=[mo.Group(description="hello")]) + blob = dill.dumps(test) + new = dill.loads(blob) + assert new.items[0].description == "hello" From 9f572d630c5d54f28929b6a1e13b3252328609a4 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 12 Jul 2021 12:31:02 -0700 Subject: [PATCH 046/177] WIP - refactor get_msms_hits, add related tests --- metatlas/plots/dill2plots.py | 226 ++++++++++++----------------------- tests/unit/test_dill2plot.py | 64 +++++++++- 2 files changed, 141 insertions(+), 149 deletions(-) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 099cc196..b3b47b07 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -21,7 +21,6 @@ from textwrap import fill, TextWrapper # import qgrid import pandas as pd -import os import dill import numpy as np import json @@ -2055,170 +2054,101 @@ def get_refs(file_name, **kwargs): return ref_df -def get_msms_hits(metatlas_dataset, use_labels=False, extra_time=False, keep_nonmatches=False, - pre_query='database == "metatlas"', - # pre_query = 'index == index or index == @pd.NaT', - query='(@inchi_key == inchi_key) and (@polarity == polarity) and ((@precursor_mz - .5*(((.5*(@pre_mz_ppm**-decimal)/(decimal+1)) + .005 + ((.5*(@pre_mz_ppm**-decimal)/(decimal+1)) - .005)**2)**.5)) <= precursor_mz <= (@precursor_mz + .5*(((.5*(@pre_mz_ppm**-decimal)/(decimal+1)) + .005 + ((.5*(@pre_mz_ppm**-decimal)/(decimal+1)) - .005)**2)**.5)))', - # query='(@inchi_key == inchi_key) and (@polarity == polarity) and ((@precursor_mz - (.5*(@pre_mz_ppm**-decimal)/(decimal+1)) - @pre_mz_ppm*(@precursor_mz*1e-6)) <= precursor_mz <= (@precursor_mz + (.5*(@pre_mz_ppm**-decimal)/(decimal+1)) + @pre_mz_ppm*(@precursor_mz*1e-6)))', - # query='(@inchi_key == inchi_key) and (@polarity == polarity) and (@rt-.1 < rt < @rt+.1) and ((@precursor_mz - (.5*(@pre_mz_ppm**-decimal)/(decimal+1)) - @pre_mz_ppm*(@precursor_mz*1e-6)) <= precursor_mz <= (@precursor_mz + (.5*(@pre_mz_ppm**-decimal)/(decimal+1)) + @pre_mz_ppm*(@precursor_mz*1e-6)))', - **kwargs): - kwargs = dict(locals(), **kwargs) +def convert_to_centroid(sample_df): + max_peaks, _ = sp.peakdet(sample_df[1], 1000.0) + if max_peaks.shape[0] > 0: + idx = max_peaks[:, 0].astype(int).flatten() + return sample_df[:, idx] + return np.zeros((0, 0)) - resolve_by = kwargs.pop('resolve_by', 'shape') - frag_mz_tolerance = kwargs.pop('frag_mz_tolerance', .005) - # Reference parameters - ref_loc = kwargs.pop('ref_loc', '/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v2.tab') +def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False, + pre_query='database == 
"metatlas"', query=None, **kwargs): + if query is None: + pre_mz_decimal = ".5*(@pre_mz_ppm**-decimal)/(decimal+1)" + offset = f".5*(({pre_mz_decimal} + .005 + ({pre_mz_decimal} - .005)**2)**.5)" + query = ("(@inchi_key == inchi_key) and " + "(@polarity == polarity) and " + f"( (@precursor_mz - {offset}) <= precursor_mz <= (@precursor_mz + {offset}) )") + kwargs = dict(locals(), **kwargs) + ref_loc = kwargs.pop( + 'ref_loc', + '/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v2.tab') ref_df = get_refs(ref_loc, **kwargs) - if 'do_centroid' in kwargs: - do_centroid = kwargs.pop('do_centroid') - else: - do_centroid = False - ref_df = ref_df.query(pre_query, local_dict=dict(locals(), **kwargs)).copy() - - if ref_df['spectrum'].apply(type).eq(str).all(): - ref_df.loc[:, 'spectrum'] = ref_df['spectrum'].apply(lambda s: eval(s)).apply(np.array) - + do_centroid = kwargs.pop('do_centroid', False) + ref_df = ref_df.query(pre_query).copy() + ref_df.loc[:, 'spectrum'] = ref_df['spectrum'].apply(lambda s: np.array(json.loads(s))) file_names = ma_data.get_file_names(metatlas_dataset) compound_names = ma_data.get_compound_names(metatlas_dataset)[0] - - msms_hits = [] - - for compound_idx,compound_name in enumerate(compound_names): - sys.stdout.write('\r'+'Processing: {} / {} compounds.'.format(compound_idx+1,len(compound_names))) + index_cols = ref_df.index.names + ['file_name', 'msms_scan'] + all_cols = index_cols + ['score', 'num_matches', 'msv_query_aligned', 'msv_ref_aligned', 'name', 'adduct', + 'inchi_key', 'precursor_mz', 'measured_precursor_mz', + 'measured_precursor_intensity'] + msms_hits = pd.DataFrame(columns=all_cols).set_index(index_cols) + for compound_idx, compound_name in enumerate(compound_names): + sys.stdout.write('\r'+'Processing: {} / {} compounds.'.format(compound_idx+1, len(compound_names))) sys.stdout.flush() - - #Code below is commented out to make get_msms_hits work when there isn't a compound in identification - VS, Nov 2019 - #if len(metatlas_dataset[0][compound_idx]['identification'].compound) == 0: - # exit here if there isn't a compound in the identification - # continue - - if metatlas_dataset[0][compound_idx]['identification'].name: - name = metatlas_dataset[0][compound_idx]['identification'].name.split('///')[0] - elif metatlas_dataset[0][compound_idx]['identification'].compound[-1].name: - name = metatlas_dataset[0][compound_idx]['identification'].compound[-1].name - else: - name = None - - try: - adduct = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].adduct - except (KeyError, AttributeError): - adduct = None - - if len(metatlas_dataset[0][compound_idx]['identification'].compound) > 0: - inchi_key = metatlas_dataset[0][compound_idx]['identification'].compound[0].inchi_key - else: - inchi_key = '' - pre_mz_ppm = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz_tolerance - precursor_mz = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz - - rt_min = metatlas_dataset[0][compound_idx]['identification'].rt_references[0].rt_min - rt_max = metatlas_dataset[0][compound_idx]['identification'].rt_references[0].rt_max - + cid = metatlas_dataset[0][compound_idx]['identification'] + name = cid.name.split('///')[0] if cid.name else getattr(cid.compound[-1], 'name', None) + adduct = ma_data.extract(cid, ['mz_references', 0, 'adduct'], None) + inchi_key = ma_data.extract(cid, ['compound', 0, 'inchi_key'], '') + pre_mz_ppm = cid.mz_references[0].mz_tolerance + precursor_mz = 
cid.mz_references[0].mz + rt_min = cid.rt_references[0].rt_min + rt_max = cid.rt_references[0].rt_max compound_hits = [] - - for file_idx,file_name in enumerate(file_names): - - polarity = metatlas_dataset[file_idx][compound_idx]['identification'].mz_references[0].detected_polarity - + for file_idx, file_name in enumerate(file_names): + mfc = metatlas_dataset[file_idx][compound_idx] + polarity = mfc['identification'].mz_references[0].detected_polarity try: - assert set(['rt', 'i', 'precursor_MZ', 'mz']).issubset(set(metatlas_dataset[file_idx][compound_idx]['data']['msms']['data'].keys())) + assert {'rt', 'i', 'precursor_MZ', 'mz'}.issubset(set(mfc['data']['msms']['data'].keys())) except (KeyError, AssertionError, AttributeError): continue - - rt_mz_i_df = pd.DataFrame({k:metatlas_dataset[file_idx][compound_idx]['data']['msms']['data'][k] + rt_mz_i_df = pd.DataFrame({k: mfc['data']['msms']['data'][k] for k in ['rt', 'mz', 'i', 'precursor_MZ', 'precursor_intensity']} ).sort_values(['rt', 'mz']) - - for rt in rt_mz_i_df.rt.unique(): - if not extra_time: - if not rt_min <= rt <= rt_max: - continue - - msv_sample = rt_mz_i_df.loc[rt_mz_i_df['rt'] == rt,['mz', 'i','rt','precursor_MZ','precursor_intensity']] + for msms_scan in rt_mz_i_df.rt.unique(): + if not extra_time and not rt_min <= msms_scan <= rt_max: + continue + msv_sample = rt_mz_i_df.loc[rt_mz_i_df['rt'] == msms_scan, + ['mz', 'i', 'rt', 'precursor_MZ', 'precursor_intensity']] precursor_mz_sample = msv_sample['precursor_MZ'].values[0] precursor_intensity_sample = msv_sample['precursor_intensity'].values[0] - msv_sample = msv_sample[['mz','i']].values.T - - if do_centroid: - max_peaks, min_peaks = sp.peakdet(msv_sample[1], 1000.0) - if max_peaks.shape[0]>0: - idx = max_peaks[:,0].astype(int).flatten() - msv_sample = msv_sample[:,idx] - else: - msv_sample = np.zeros((0,0)) -# msv_sample.sort_values('mz',inplace=True) -# msv_sample = msv_sample - - #Filter ions greater than 2.5 + precursor M/Z - msv_sample = msv_sample[:,msv_sample[0] < precursor_mz_sample + 2.5] - if msv_sample.size > 0: - scan_df = sp.search_ms_refs(msv_sample, **dict(locals(), **kwargs)) - else: - scan_df = {} - - if len(scan_df) > 0: - scan_df['file_name'] = file_name - scan_df['msms_scan'] = rt - scan_df['name'] = name - scan_df['adduct'] = adduct - scan_df['inchi_key'] = inchi_key - scan_df['precursor_mz'] = precursor_mz - scan_df['measured_precursor_mz'] = precursor_mz_sample - scan_df['measured_precursor_intensity'] = precursor_intensity_sample - - scan_df.set_index('file_name', append=True, inplace=True) - scan_df.set_index('msms_scan', append=True, inplace=True) - - msms_hits.append(scan_df) - - elif keep_nonmatches: - scan_df = {} - - scan_df['file_name'] = file_name - scan_df['msms_scan'] = rt - scan_df['name'] = name - scan_df['adduct'] = adduct - scan_df['inchi_key'] = inchi_key - scan_df['precursor_mz'] = precursor_mz + msv_sample.sort_values('mz', inplace=True) + msv_sample = msv_sample[['mz', 'i']].values.T + msv_sample = convert_to_centroid(msv_sample) if do_centroid else msv_sample + # Filter ions greater than 2.5 + precursor M/Z + msv_sample = msv_sample[:, msv_sample[0] < precursor_mz_sample + 2.5] + kwargs = dict(locals(), **kwargs) + scan_df = sp.search_ms_refs(msv_sample, **kwargs) if msv_sample.size > 0 else pd.DataFrame() + hits = len(scan_df) > 0 + if not hits and not keep_nonmatches: + continue + if not hits and keep_nonmatches: + scan_df = pd.DataFrame( + data={'database': [np.nan], 'id': [np.nan]}, + index=pd.MultiIndex.from_tuples( + 
[(np.nan, np.nan)], + names=['database', 'id']), + columns=all_cols[2:] # leave out the cols that are used in the index + ) + scan_df['file_name'] = file_name + scan_df['msms_scan'] = msms_scan + scan_df['name'] = name + scan_df['adduct'] = adduct + scan_df['inchi_key'] = inchi_key + scan_df['precursor_mz'] = precursor_mz + scan_df['measured_precursor_mz'] = precursor_mz_sample + scan_df['measured_precursor_intensity'] = precursor_intensity_sample + scan_df.set_index(['file_name', 'msms_scan'], append=True, inplace=True) + if not hits and keep_nonmatches: scan_df['num_matches'] = 0 - scan_df['measured_precursor_mz'] = precursor_mz_sample - scan_df['measured_precursor_intensity'] = precursor_intensity_sample - scan_df['score'] = precursor_intensity_sample - scan_df['msv_query_aligned'] = msv_sample - scan_df['msv_ref_aligned'] = np.full_like(msv_sample, np.nan) - - scan_df = pd.DataFrame([scan_df]) - - for idx in ref_df.index.names: - scan_df[idx] = None - scan_df.set_index('database', append=False, inplace=True) - scan_df.set_index('id', append=True, inplace=True) - scan_df.set_index('file_name', append=True, inplace=True) - scan_df.set_index('msms_scan', append=True, inplace=True) - - msms_hits.append(scan_df) - + scan_df['msv_query_aligned'] = [msv_sample] + scan_df['msv_ref_aligned'] = [np.full_like(msv_sample, np.nan)] + msms_hits = msms_hits.append(scan_df) sys.stdout.write('\n'+'Done!!!\n') - if len(msms_hits)>0: - hits = pd.concat(msms_hits) - return hits - #Check if number of matches for a compound across all files is 1 or less and set the score to its maximum intensity. - #This will identify MSMS with single ion / no fragmentation -# print(hits.groupby(['inchi_key', 'adduct'])['num_matches']) -# if keep_nonmatches==True: -# idxs = hits.groupby(['inchi_key', 'adduct'])['num_matches'].transform(max) <= 1 -# hits['score'][idxs] = hits['measured_precursor_intensity'][idxs] - g = hits.groupby(['inchi_key', 'adduct'])['num_matches'].transform(max) - idxs = g<= 1 - proc_idx = g[idxs].index - for i in proc_idx: - hits.loc[i,'score'] = hits.loc[i,'measured_precursor_intensity'] - return hits - else: - return pd.DataFrame(columns=ref_df.index.names+['file_name', 'msms_scan', 'score', 'num_matches','inchi_key','precursor_mz','adduct','score'] - ).set_index(ref_df.index.names+['file_name', 'msms_scan']) + return msms_hits def make_chromatograms(input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], include_groups=[], exclude_groups=[], group='index', share_y=True, save=True, output_loc=[], short_names_df=pd.DataFrame(), short_names_header=None, polarity='', overwrite=False): diff --git a/tests/unit/test_dill2plot.py b/tests/unit/test_dill2plot.py index 3989ff6b..5d56ebeb 100644 --- a/tests/unit/test_dill2plot.py +++ b/tests/unit/test_dill2plot.py @@ -103,5 +103,67 @@ def test_strong_signal_compound_idxs(metatlas_dataset): def test_get_msms_hits01(metatlas_dataset, msms_refs, mocker): mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs) hits = dill2plots.get_msms_hits(metatlas_dataset) - expected = """{"score":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":0.0,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":0.0},"num_matches":{"(\'metatlas\', 
\'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":1,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":1},"msv_query_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1087036133,null,252.1572875977],[null,null,null,null,null,null,null,null,null,null,null,null,null,93112.0859375,null,7624.11328125]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1090698242,null,252.1557617188],[null,null,null,null,null,null,null,null,null,null,null,null,null,76976.7265625,null,6090.6440429688]]},"msv_ref_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]]},"name":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"2\'-deoxyadenosine","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"2\'-deoxyadenosine"},"adduct":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"[M+H]+","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"[M+H]+"},"inchi_key":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', 
\'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.1091393,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1091393},"measured_precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.10887146,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1089477539},"measured_precursor_intensity":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":3046732.75}}""" + expected = ( + """{"score":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":0.0,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":0.0},"num_matches":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":1,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":1},"msv_query_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1087036133,null,252.1572875977],[null,null,null,null,null,null,null,null,null,null,null,null,null,93112.0859375,null,7624.11328125]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1090698242,null,252.1557617188],[null,null,null,null,null,null,null,null,null,null,null,null,null,76976.7265625,null,6090.6440429688]]},"msv_ref_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 
2.2203779221)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]]},"name":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', """ + """2.2203779221)":"2\'-deoxyadenosine","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"2\'-deoxyadenosine"},"adduct":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"[M+H]+","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"[M+H]+"},"inchi_key":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.1091393,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1091393},"measured_precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.10887146,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1089477539},"measured_precursor_intensity":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', 
\'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":3046732.75}}""" + ) + assert len(hits) == 2 assert expected == hits.to_json() + + +def test_get_msms_hits02(metatlas_dataset, msms_refs, mocker): + mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs) + metatlas_dataset.set_rt(0, "rt_max", 2.3) # reduce the RT bounds to so that only one hit falls within + hits_small = dill2plots.get_msms_hits(metatlas_dataset) + expected_small = """{"score":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":0.0},"num_matches":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":1},"msv_query_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1087036133,null,252.1572875977],[null,null,null,null,null,null,null,null,null,null,null,null,null,93112.0859375,null,7624.11328125]]},"msv_ref_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]]},"name":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"2\'-deoxyadenosine"},"adduct":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"[M+H]+"},"inchi_key":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.1091393},"measured_precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.10887146},"measured_precursor_intensity":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5}}""" + assert len(hits_small) == 1 + assert hits_small.to_json() == expected_small + + +def test_get_msms_hits03(metatlas_dataset, 
msms_refs, mocker): + mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs) + metatlas_dataset.set_rt(0, "rt_max", 2.3) # reduce the RT bounds to so that only one hit falls within + hits_large = dill2plots.get_msms_hits( + metatlas_dataset, extra_time=0.75 + ) # with extra_time the second hits is also included + expected_large = ( + """{"score":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":0.0,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":0.0},"num_matches":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":1,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":1},"msv_query_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1087036133,null,252.1572875977],[null,null,null,null,null,null,null,null,null,null,null,null,null,93112.0859375,null,7624.11328125]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1090698242,null,252.1557617188],[null,null,null,null,null,null,null,null,null,null,null,null,null,76976.7265625,null,6090.6440429688]]},"msv_ref_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]]},"name":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', """ + """2.2203779221)":"2\'-deoxyadenosine","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 
2.3452186584)":"2\'-deoxyadenosine"},"adduct":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"[M+H]+","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"[M+H]+"},"inchi_key":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.1091393,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1091393},"measured_precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.10887146,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1089477539},"measured_precursor_intensity":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":3046732.75}}""" + ) + assert len(hits_large) == 2 + assert hits_large.to_json() == expected_large + + +def test_get_msms_hits04(metatlas_dataset, msms_refs, mocker): + mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs) + metatlas_dataset.set_rt(0, "rt_max", 2.20) # reduce the RT bounds to so that no hits fall within + hits = dill2plots.get_msms_hits(metatlas_dataset) + assert len(hits) == 0 + + +def test_get_msms_hits05(metatlas_dataset, msms_refs, mocker): + mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs) + expected = ( + """{"score":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":0.0,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":0.0},"num_matches":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', 
\'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":1,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":1},"msv_query_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1087036133,null,252.1572875977],[null,null,null,null,null,null,null,null,null,null,null,null,null,93112.0859375,null,7624.11328125]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[null,null,null,null,null,null,null,null,null,null,null,null,null,252.1090698242,null,252.1557617188],[null,null,null,null,null,null,null,null,null,null,null,null,null,76976.7265625,null,6090.6440429688]]},"msv_ref_aligned":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]],"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[57.0345,63.3177,63.3205,69.0344,71.0499,73.0292,84.9778,99.0447,117.055,118.059,136.062,137.066,236.709,252.109,253.112,null],[176328.0,328818.0,274432.0,197637.0,896360.0,1192020.0,378547.0,3921880.0,15737700.0,266131.0,144220000.0,3455270.0,185227.0,20960800.0,1284450.0,null]]},"name":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', """ + """2.2203779221)":"2\'-deoxyadenosine","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"2\'-deoxyadenosine"},"adduct":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"[M+H]+","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"[M+H]+"},"inchi_key":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', 
\'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.1091393,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1091393},"measured_precursor_mz":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.10887146,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1089477539},"measured_precursor_intensity":{"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5,"(\'metatlas\', \'c7dddd297e104ca79caea72a90150532\', \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":3046732.75}}""" + ) + hits = dill2plots.get_msms_hits(metatlas_dataset, keep_nonmatches=True) + assert len(hits) == 2 + assert hits.to_json() == expected + + +def test_get_msms_hits06(metatlas_dataset, msms_refs, mocker): + mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs.iloc[0:0]) + expected = ( + """{"score":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5,"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":3046732.75},"num_matches":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":0,"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":0},"msv_query_aligned":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[252.1087036133,252.1572875977],[93112.0859375,7624.11328125]],"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[252.1090698242,252.1557617188],[76976.7265625,6090.6440429688]]},"msv_ref_aligned":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":[[null,null],[null,null]],"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":[[null,null],[null,null]]},"name":{"(nan, nan, 
\'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"2\'-deoxyadenosine","(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"2\'-deoxyadenosine"},"adduct":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"[M+H]+","(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"[M+H]+"},"inchi_key":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"precursor_mz":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221""" + """)":252.1091393,"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1091393},"measured_precursor_mz":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":252.10887146,"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":252.1089477539},"measured_precursor_intensity":{"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.2203779221)":2872807.5,"(nan, nan, \'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\', 2.3452186584)":3046732.75}}""" + ) + hits = dill2plots.get_msms_hits(metatlas_dataset, keep_nonmatches=True) + assert len(hits) == 2 + assert hits.to_json() == expected + + +def test_get_msms_hits07(metatlas_dataset, msms_refs, mocker): + mocker.patch("metatlas.plots.dill2plots.get_refs", return_value=msms_refs.iloc[0:0]) + hits = dill2plots.get_msms_hits(metatlas_dataset) + assert len(hits) == 0 From 2270e613da9c4a0dee20421b7a563c6f4b7ac98d Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 14 Jul 2021 11:58:03 -0700 Subject: [PATCH 047/177] refactoring of get_msms_hits() --- metatlas/plots/dill2plots.py | 109 ++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 48 deletions(-) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index b3b47b07..5e4d7a52 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -2033,25 +2033,12 @@ def plot_score_and_ref_file(ax, score, rt, ref): transform=ax.transAxes) -def get_refs(file_name, **kwargs): +def get_refs(file_name, ref_dtypes, ref_index): """Load msms refs from file_name, returns pandas Dataframe""" - # Reference parameters - ref_dtypes = kwargs.pop('ref_dtypes', {'database': str, 'id': str, 'name': str, - 'spectrum': object, 'decimal': int, 
'precursor_mz': float, - 'polarity': str, 'adduct': str, 'fragmentation_method': str, - 'collision_energy': str, 'instrument': str, 'instrument_type': str, - 'formula': str, 'exact_mass': float, - 'inchi_key': str, 'inchi': str, 'smiles': str}) - - ref_index = kwargs.pop('ref_index', ['database', 'id']) - if 'ref_df' in kwargs: - ref_df = kwargs.pop('ref_df') - else: - ref_df = pd.read_csv(file_name, - sep='\t', - dtype=ref_dtypes - ).set_index(ref_index) - return ref_df + return pd.read_csv(file_name, + sep='\t', + dtype=ref_dtypes + ).set_index(ref_index) def convert_to_centroid(sample_df): @@ -2062,20 +2049,57 @@ def convert_to_centroid(sample_df): return np.zeros((0, 0)) +def search_ms_refs(msv_sample, query, inchi_key, polarity, precursor_mz, pre_mz_ppm, frag_mz_tolerance, ref_loc, ref_dtypes, ref_index, ref_df): + return sp.search_ms_refs(msv_sample, **locals()) + + +def get_msms_hits_per_compound(rt_mz_i_df, msms_scan, do_centroid, query, inchi_key, polarity, + precursor_mz, pre_mz_ppm, frag_mz_tolerance, ref_loc, ref_dtypes, + ref_index, ref_df): + msv_sample = rt_mz_i_df.loc[rt_mz_i_df['rt'] == msms_scan, + ['mz', 'i', 'rt', 'precursor_MZ', 'precursor_intensity']] + precursor_mz_sample = msv_sample['precursor_MZ'].values[0] + msv_sample.sort_values('mz', inplace=True) + msv_sample = msv_sample[['mz', 'i']].values.T + msv_sample = convert_to_centroid(msv_sample) if do_centroid else msv_sample + # Filter ions greater than 2.5 + precursor M/Z + msv_sample = msv_sample[:, msv_sample[0] < precursor_mz_sample + 2.5] + if msv_sample.size > 0: + return search_ms_refs(msv_sample, query, inchi_key, polarity, precursor_mz, + pre_mz_ppm, frag_mz_tolerance, ref_loc, ref_dtypes, + ref_index, ref_df), msv_sample + return pd.DataFrame(), msv_sample + + +def get_empty_scan_df(columns): + return pd.DataFrame(data={'database': [np.nan], 'id': [np.nan]}, + index=pd.MultiIndex.from_tuples([(np.nan, np.nan)], names=['database', 'id']), + columns=columns) + + def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False, - pre_query='database == "metatlas"', query=None, **kwargs): + pre_query='database == "metatlas"', query=None, ref_dtypes=None, + ref_loc=None, ref_df=None, frag_mz_tolerance=.005, ref_index=None, + do_centroid=False): if query is None: pre_mz_decimal = ".5*(@pre_mz_ppm**-decimal)/(decimal+1)" offset = f".5*(({pre_mz_decimal} + .005 + ({pre_mz_decimal} - .005)**2)**.5)" query = ("(@inchi_key == inchi_key) and " "(@polarity == polarity) and " f"( (@precursor_mz - {offset}) <= precursor_mz <= (@precursor_mz + {offset}) )") - kwargs = dict(locals(), **kwargs) - ref_loc = kwargs.pop( - 'ref_loc', - '/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v2.tab') - ref_df = get_refs(ref_loc, **kwargs) - do_centroid = kwargs.pop('do_centroid', False) + if ref_dtypes is None: + ref_dtypes = {'database': str, 'id': str, 'name': str, + 'spectrum': object, 'decimal': int, 'precursor_mz': float, + 'polarity': str, 'adduct': str, 'fragmentation_method': str, + 'collision_energy': str, 'instrument': str, 'instrument_type': str, + 'formula': str, 'exact_mass': float, + 'inchi_key': str, 'inchi': str, 'smiles': str} + if ref_index is None: + ref_index = ['database', 'id'] + if ref_loc is None: + ref_loc = '/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v2.tab' + if ref_df is None: + ref_df = get_refs(ref_loc, ref_dtypes, ref_index) ref_df = ref_df.query(pre_query).copy() ref_df.loc[:, 'spectrum'] = ref_df['spectrum'].apply(lambda s: 
np.array(json.loads(s)))
     file_names = ma_data.get_file_names(metatlas_dataset)
@@ -2085,7 +2109,7 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False,
                 'inchi_key', 'precursor_mz', 'measured_precursor_mz',
                 'measured_precursor_intensity']
     msms_hits = pd.DataFrame(columns=all_cols).set_index(index_cols)
-    for compound_idx, compound_name in enumerate(compound_names):
+    for compound_idx in range(len(compound_names)):
         sys.stdout.write('\r'+'Processing: {} / {} compounds.'.format(compound_idx+1, len(compound_names)))
         sys.stdout.flush()
         cid = metatlas_dataset[0][compound_idx]['identification']
@@ -2096,7 +2120,6 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False,
         precursor_mz = cid.mz_references[0].mz
         rt_min = cid.rt_references[0].rt_min
         rt_max = cid.rt_references[0].rt_max
-        compound_hits = []
         for file_idx, file_name in enumerate(file_names):
             mfc = metatlas_dataset[file_idx][compound_idx]
             polarity = mfc['identification'].mz_references[0].detected_polarity
@@ -2110,40 +2133,30 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False,
             for msms_scan in rt_mz_i_df.rt.unique():
                 if not extra_time and not rt_min <= msms_scan <= rt_max:
                     continue
-                msv_sample = rt_mz_i_df.loc[rt_mz_i_df['rt'] == msms_scan,
-                                            ['mz', 'i', 'rt', 'precursor_MZ', 'precursor_intensity']]
-                precursor_mz_sample = msv_sample['precursor_MZ'].values[0]
-                precursor_intensity_sample = msv_sample['precursor_intensity'].values[0]
-                msv_sample.sort_values('mz', inplace=True)
-                msv_sample = msv_sample[['mz', 'i']].values.T
-                msv_sample = convert_to_centroid(msv_sample) if do_centroid else msv_sample
-                # Filter ions greater than 2.5 + precursor M/Z
-                msv_sample = msv_sample[:, msv_sample[0] < precursor_mz_sample + 2.5]
-                kwargs = dict(locals(), **kwargs)
-                scan_df = sp.search_ms_refs(msv_sample, **kwargs) if msv_sample.size > 0 else pd.DataFrame()
+                scan_df, msv_sample = get_msms_hits_per_compound(rt_mz_i_df, msms_scan, do_centroid,
+                                                                 query, inchi_key, polarity,
+                                                                 precursor_mz, pre_mz_ppm,
+                                                                 frag_mz_tolerance, ref_loc, ref_dtypes,
+                                                                 ref_index, ref_df)
+                precursor = rt_mz_i_df.loc[rt_mz_i_df['rt'] == msms_scan, ['precursor_MZ', 'precursor_intensity']]
                 hits = len(scan_df) > 0
                 if not hits and not keep_nonmatches:
                     continue
                 if not hits and keep_nonmatches:
-                    scan_df = pd.DataFrame(
-                        data={'database': [np.nan], 'id': [np.nan]},
-                        index=pd.MultiIndex.from_tuples(
-                            [(np.nan, np.nan)],
-                            names=['database', 'id']),
-                        columns=all_cols[2:]  # leave out the cols that are used in the index
-                    )
+                    # leave out the cols that are used in the index
+                    scan_df = get_empty_scan_df(all_cols[2:])
                 scan_df['file_name'] = file_name
                 scan_df['msms_scan'] = msms_scan
                 scan_df['name'] = name
                 scan_df['adduct'] = adduct
                 scan_df['inchi_key'] = inchi_key
                 scan_df['precursor_mz'] = precursor_mz
-                scan_df['measured_precursor_mz'] = precursor_mz_sample
-                scan_df['measured_precursor_intensity'] = precursor_intensity_sample
+                scan_df['measured_precursor_mz'] = precursor['precursor_MZ'].values[0]
+                scan_df['measured_precursor_intensity'] = precursor['precursor_intensity'].values[0]
                 scan_df.set_index(['file_name', 'msms_scan'], append=True, inplace=True)
                 if not hits and keep_nonmatches:
                     scan_df['num_matches'] = 0
-                    scan_df['score'] = precursor_intensity_sample
+                    scan_df['score'] = precursor['precursor_intensity'].values[0]
                     scan_df['msv_query_aligned'] = [msv_sample]
                     scan_df['msv_ref_aligned'] = [np.full_like(msv_sample, np.nan)]
                 msms_hits = msms_hits.append(scan_df)
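
For orientation, here is a minimal, self-contained sketch of the per-scan slicing and filtering that get_msms_hits_per_compound() performs above; the scan values are illustrative only, not data from the repository:

    import pandas as pd

    # One MSMS scan's points, shaped like a single-rt slice of rt_mz_i_df.
    scan = pd.DataFrame({'mz': [136.06, 252.11, 260.00],
                         'i': [1.0e6, 2.0e7, 5.0e3],
                         'precursor_MZ': [252.109] * 3,
                         'precursor_intensity': [2.8e6] * 3})
    precursor_mz_sample = scan['precursor_MZ'].values[0]
    msv_sample = scan.sort_values('mz')[['mz', 'i']].values.T  # shape (2, n_ions)
    # Keep only ions below the measured precursor m/z + 2.5, as in the patch above.
    msv_sample = msv_sample[:, msv_sample[0] < precursor_mz_sample + 2.5]
    print(msv_sample.shape)  # (2, 2) -- the 260.00 m/z ion is dropped

From 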
028d37a0fa8c368ff690b8f07b7c56a1f32e5483 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 15 Jul 2021 10:01:43 -0700
Subject: [PATCH 048/177] logging improvement

---
 metatlas/datastructures/object_helpers.py | 68 ++++++++++++++---------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/metatlas/datastructures/object_helpers.py b/metatlas/datastructures/object_helpers.py
index 6515cadf..88f1479c 100644
--- a/metatlas/datastructures/object_helpers.py
+++ b/metatlas/datastructures/object_helpers.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import
 from __future__ import print_function
 
+import inspect
 import logging
 import sys
 import os
@@ -168,6 +169,7 @@ def __init__(self):
 
     @classmethod
     def get_instance(cls):
+        """Returns an existing instance of Workspace or creates a new one"""
        if Workspace.instance is None:
            return Workspace()
        return Workspace.instance
@@ -177,15 +179,20 @@ def get_connection(self):
         Get a re-useable connection to the database.
         Each activity that queries the database needs to have this function
         preceeding it.
         """
-        try:
-            self.db.begin()
-            self.db.query('SELECT 1')
-            self.db.commit()
-        except Exception:
+        if self.db is None:
             self.db = dataset.connect(self.path)
+        else:
+            self.db.begin()
+            try:
+                self.db.query('SELECT 1')
+                self.db.commit()
+            except Exception:
+                self.db.rollback()
+                self.db = dataset.connect(self.path)
         assert self.db is not None
 
     def close_connection(self):
+        """Close the database connection."""
         if self.db is not None:
             self.db.close()
             self.db = None
@@ -197,10 +204,8 @@ def convert_to_double(self, table, entry):
         try:
             self.db.query('alter table `%s` modify `%s` double' % (table, entry))
             self.db.commit()
-        except Exception as e:
-            self.db.rollback()
-            print(e)
-            logger.error('Transaction rollback within convert_to_double()')
+        except Exception as err:
+            rollback_and_log(self.db, err)
 
     def save_objects(self, objects, _override=False):
         """Save objects to the database"""
@@ -213,7 +218,12 @@ def save_objects(self, objects, _override=False):
         self._inserts = defaultdict(list)
         for obj in objects:
             self._get_save_data(obj, _override)
-        logger.debug('Workspace._inserts=%s', self._inserts)
+        if self._inserts:
+            logger.debug('Workspace._inserts=%s', self._inserts)
+        if self._updates:
+            logger.debug('Workspace._updates=%s', self._updates)
+        if self._link_updates:
+            logger.debug('Workspace._link_updates=%s', self._link_updates)
         self.get_connection()
         self.db.begin()
         try:
@@ -241,9 +251,8 @@ def save_objects(self, objects, _override=False):
                     self.db[table_name].insert_many(inserts)
                     logger.debug('inserting %s', inserts)
             self.db.commit()
-        except Exception:
-            self.db.rollback()
-            logger.error('Transaction rollback within save_objects()')
+        except Exception as err:
+            rollback_and_log(self.db, err)
 
     def create_link_tables(self, klass):
         """
@@ -264,9 +273,8 @@ def create_link_tables(self, klass):
                             target_table=uuid.uuid4().hex)
                 self.db[table_name].insert(link)
             self.db.commit()
-        except Exception:
-            self.db.rollback()
-            logger.error('Transaction rollback within create_link_tables()')
+        except Exception as err:
+            rollback_and_log(self.db, err)
 
     def _get_save_data(self, obj, override=False):
         """Get the data that will be used to save an object to the database"""
@@ -418,10 +426,7 @@ def retrieve(self, object_type, **kwargs):
             items.sort(key=lambda x: x.last_modified)
             self.db.commit()
         except Exception as err:
-            logger.exception(err)
-            self.db.rollback()
-            raise err
-        logger.error('Transaction rollback within retrieve()')
+            rollback_and_log(self.db, err)
         return items
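 
+    # The database-accessing methods in this class now share a single
+    # error-handling pattern: begin a transaction, run the queries, commit,
+    # and on any exception delegate to rollback_and_log() (added at the bottom
+    # of this module), which rolls the transaction back, logs the name of the
+    # calling function, and re-raises the original error.
     def 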
remove(self, object_type, **kwargs):
@@ -488,9 +493,8 @@ def remove(self, object_type, **kwargs):
                        raise e
            print('Removed')
            self.db.commit()
-        except Exception:
-            self.db.rollback()
-            logger.error('Transaction rollback within retrieve()')
+        except Exception as err:
+            rollback_and_log(self.db, err)
 
     def remove_objects(self, objects, all_versions=True, **kwargs):
         """Remove a list of objects from the database."""
@@ -533,9 +537,8 @@ def remove_objects(self, objects, all_versions=True, **kwargs):
             self.db.query(query)
             print(('Removed %s object(s)' % len(objects)))
             self.db.commit()
-        except Exception:
-            self.db.rollback()
-            logger.error('Transaction rollback within remove_objects()')
+        except Exception as err:
+            rollback_and_log(self.db, err)
 
 
 def format_timestamp(tstamp):
@@ -641,3 +644,16 @@ def get_from_nersc(user, relative_path):
     proc.expect('Download Complete')
     proc.close()
     return os.path.abspath(os.path.basename(relative_path))
+
+
+def rollback_and_log(db_connection, err):
+    """
+    inputs:
+        db_connection: a dataset instance in a transaction that needs to be rolled back
+        err: exception instance that ended the transaction
+    """
+    caller_name = inspect.stack()[1][3]
+    db_connection.rollback()
+    logger.error("Transaction rollback within %s()", caller_name)
+    logger.exception(err)
+    raise err

From e7d478b9fa53fb9a579df55d8ce8411a70a8ff14 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 15 Jul 2021 10:02:25 -0700
Subject: [PATCH 049/177] filter_compounds now updates atlas in db

---
 metatlas/datastructures/metatlas_dataset.py | 58 ++++++++++++---------
 tests/unit/test_metatlas_dataset.py         | 11 ++++
 2 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index c7ed7238..543fc44b 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -15,6 +15,7 @@
 import tqdm
 
 from metatlas.datastructures import metatlas_objects as metob
+from metatlas.datastructures import object_helpers as metoh
 from metatlas.io import metatlas_get_data_helper_fun as ma_data
 from metatlas.io import targeted_output
 from metatlas.io import write_utils
@@ -300,21 +301,41 @@ def _build(self):
             _duration_since(start_time),
         )
 
-    def filter_compounds(self, keep_idxs=None, remove_idxs=None, name=None):
+    def _remove_compound_id(self, idx):
+        """
+        Remove compound identification at index idx from both the db and self._atlas
+        Does not invalidate _data or _hits or _atlas_df
+        This bypasses several ORM layers and therefore is a hack, but I couldn't get it to work with the ORM.
+ """ + cid_id = self._atlas.compound_identifications[idx].unique_id + atlas_id = self._atlas.unique_id + link_table = "atlases_compound_identifications" + target = f"target_id='{cid_id}'" + workspace = metob.Workspace.get_instance() + workspace.get_connection() + workspace.db.begin() + try: + workspace.db.query(f"delete from {link_table} where ({target} and source_id='{atlas_id}')") + links = workspace.db.query(f"select source_id from {link_table} where {target}") + if len(list(links)) == 0: # other atlases are not linked to this CompoundIdentification + workspace.db.query(f"delete from compoundidentifications where unique_id='{cid_id}'") + workspace.db.commit() + del self._atlas.compound_identifications[idx] + except Exception as err: + metoh.rollback_and_log(workspace.db, err) + workspace.close_connection() + + def filter_compounds(self, keep_idxs=None, remove_idxs=None): """ inputs: keep_idxs: the indexes of compounds to keep remove_idxs: the indexes of compounds to remove Exactly one of keep_idxs or remove_idxs must be None - name: the name for the new atlas, defaults to current name output: If keep_idxs is not None then update self.atlas to contain only the compound_identifications at keep_idxs. If remove_idxs is not None then update self.atlas to contain only the compound identifications not at remove_idxs. Raises ValueError if both keep_idxs and remove_idxs are None. - There is an additional side effect that all mz_tolerances in the returned atlas - get their value from self.atlas.compound_identifications[0].mz_references[0].mz_tolerance - Does not invalidate _data or _hits """ if (keep_idxs is None) == (remove_idxs is None): @@ -325,20 +346,14 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None, name=None): keep_idxs = self.atlas_df.index.difference(remove_idxs) self._atlas_df = self.atlas_df.iloc[keep_idxs].copy().reset_index(drop=True) self._atlas_df_valid = True - name = self.atlas.name if name is None else name - mz_tolerance = self.atlas.compound_identifications[0].mz_references[0].mz_tolerance if self._data_valid: self._data = [ [compound for idx, compound in enumerate(sample) if idx in keep_idxs] for sample in self._data ] - self._atlas = dp.make_atlas_from_spreadsheet( - self.atlas_df, - name, - filetype="dataframe", - polarity=self.polarity, - store=False, - mz_tolerance=mz_tolerance, - ) + if remove_idxs is None: + remove_idxs = [idx for idx, _ in enumerate(self._atlas.compound_identifications) + if idx not in keep_idxs] + _ = [self._remove_compound_id(idx) for idx in sorted(remove_idxs, reverse=True)] logger.info( "Filtering reduced atlas from %d to %d compounds (%d removed).", start_len, @@ -367,10 +382,8 @@ def filter_hits_by_atlas(self): start_len - len(self.hits), ) - def filter_compounds_ms1_notes_remove(self, name=None): + def filter_compounds_ms1_notes_remove(self): """ - inputs: - name: the name for the new atlas, defaults to current name output: updates self.atlas to contain only the compound_identifications that do not have ms1_notes starting with 'remove' (case insensitive) @@ -378,22 +391,19 @@ def filter_compounds_ms1_notes_remove(self, name=None): get their value from self.atlas.compound_identifications[0].mz_references[0].mz_tolerance """ logger.debug("Filtering atlas to exclude ms1_notes=='remove'.") - name = self.atlas.name if name is None else name - self.filter_compounds(remove_idxs=self.compound_indices_marked_remove(), name=name) + self.filter_compounds(remove_idxs=self.compound_indices_marked_remove()) - def 
filter_compounds_by_signal(self, num_points, peak_height, name=None): + def filter_compounds_by_signal(self, num_points, peak_height): """ inputs: num_points: number of points in EIC that must be exceeded in one or more samples in order for the compound to remain in the atlas peak_height: max intensity in the EIC that must be exceeded in one or more samples in order for the compound to remain in the atlas - name: the name for the new atlas, defaults to current name """ logger.debug("Filtering atlas on num_points=%d, peak_height=%d.", num_points, peak_height) - name = self.atlas.name if name is None else name keep_idxs = dp.strong_signal_compound_idxs(self, num_points, peak_height) - self.filter_compounds(keep_idxs=keep_idxs, name=name) + self.filter_compounds(keep_idxs=keep_idxs) def store_atlas(self, even_if_exists=False): """ diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 930e0a51..7a044ad6 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -78,6 +78,17 @@ def test_filter_compounds04(mocker, metatlas_dataset, compound): metatlas_dataset.filter_compounds(remove_idxs=[999]) +def test_filter_compounds05(mocker, metatlas_dataset_with_2_cids, username): + original_rt_min = metatlas_dataset_with_2_cids.rts[1].rt_min + print([r.rt_min for r in metatlas_dataset_with_2_cids.rts]) + updated_rt_min = 9.99 + metatlas_dataset_with_2_cids.set_rt(1, "rt_min", updated_rt_min) + metatlas_dataset_with_2_cids.filter_compounds(remove_idxs=[0]) + atlas = metob.retrieve("Atlas", name=metatlas_dataset_with_2_cids.atlas.name, username=username)[0] + assert atlas.compound_identifications[0].rt_references[0].rt_min != original_rt_min + assert atlas.compound_identifications[0].rt_references[0].rt_min == updated_rt_min + + def test_filter_hits_by_atlas01(mocker, metatlas_dataset_with_2_cids, hits, compound): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) From 16d8a39733f9b422554eb356c42502cb08a9ba5a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 15 Jul 2021 16:20:38 -0700 Subject: [PATCH 050/177] Fix default for include_groups in RT_Prediction --- notebooks/reference/RT_Prediction.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb index 0952c263..27c8f93e 100644 --- a/notebooks/reference/RT_Prediction.ipynb +++ b/notebooks/reference/RT_Prediction.ipynb @@ -30,7 +30,7 @@ "experiment = \"REPLACE ME\"\n", "\n", "# group will only be used in RT prediction if their name has a substring match to this list of strings\n", - "include_groups = [\"S1\"]\n", + "include_groups = [\"QC\"]\n", "\n", "# Exclude files with names containing any of the substrings in this list. 
Eg., ['peas', 'beans']\n", "exclude_files = []\n", From 1894c3d6c1823bb5488682ab34397aec8c1e2647 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 20 Jul 2021 14:28:27 -0700 Subject: [PATCH 051/177] Add scripts to make conda env --- conda/create_conda_env.sh | 37 +++++++++++++++++++ conda/requirements.txt | 1 + docker/metatlas_env.yaml | 30 --------------- .../kernels/metatlas-targeted.kernel.json | 2 +- 4 files changed, 39 insertions(+), 31 deletions(-) create mode 100755 conda/create_conda_env.sh create mode 120000 conda/requirements.txt delete mode 100644 docker/metatlas_env.yaml diff --git a/conda/create_conda_env.sh b/conda/create_conda_env.sh new file mode 100755 index 00000000..944884eb --- /dev/null +++ b/conda/create_conda_env.sh @@ -0,0 +1,37 @@ +#!/bin/bash --login +set -ef -o pipefail + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +REPO_DIR="$(dirname "$SCRIPT_DIR")" +NAME="metatlas-targeted-$(date --iso-8601)" +BASE_DIR="/global/common/software/m2650" +ENV_DIR="${BASE_DIR}/${NAME}" +ENV_FILE="${SCRIPT_DIR}/env.yaml" + +echo "name: $NAME +channels: +- conda-forge +dependencies: +- python=3.8 +- pip +- pip:" > "$ENV_FILE" +awk '{ print " - " $0 }' requirements.txt >> "$ENV_FILE" + +conda env create \ + --prefix "$ENV_DIR" \ + --file "$ENV_FILE" +rm "$ENV_FILE" + +cat >"${REPO_DIR}/notebooks/kernels/metatlas-targeted.kernel.json" < Date: Tue, 20 Jul 2021 14:30:50 -0700 Subject: [PATCH 052/177] fixes to RT predict and better env validation --- metatlas/datastructures/metatlas_dataset.py | 31 ++- metatlas/plots/dill2plots.py | 2 - metatlas/tools/environment.py | 38 +++- metatlas/tools/notebook.py | 28 +-- metatlas/tools/predict_rt.py | 205 ++++++++++++-------- notebooks/reference/Targeted.ipynb | 22 ++- tests/unit/test_metatlas_objects.py | 2 +- 7 files changed, 198 insertions(+), 130 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 543fc44b..82416c13 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -25,7 +25,7 @@ POLARITIES = ["positive", "negative", "fast-polarity-switching"] SHORT_POLARITIES = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"} -OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC"] +OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"] logger = logging.getLogger(__name__) @@ -102,11 +102,20 @@ def analysis_number(self): """Returns analysis number""" return self._analysis_number + @property + def _exp_tokens(self): + """Returns list of strings from the experiment name""" + return self.experiment.split("_") + + @property + def project(self): + """Returns project number (proposal id)""" + return self._exp_tokens[3] + @property def atlas(self): """Atlas identifier (name)""" - exp_tokens = self.experiment.split("_") - return f"{'_'.join(exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}" + return f"{'_'.join(self._exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}" @property def username(self): @@ -121,8 +130,7 @@ def analysis(self): @property def short_experiment_analysis(self): """Short experiment analysis identifier""" - exp_tokens = self.experiment.split("_") - return f"{exp_tokens[0]}_{exp_tokens[3]}_{self.output_type}_{self.analysis}" + return f"{self._exp_tokens[0]}_{self._exp_tokens[3]}_{self.output_type}_{self.analysis}" @property def short_polarity(self): @@ -321,7 +329,7 @@ def _remove_compound_id(self, idx): 
workspace.db.query(f"delete from compoundidentifications where unique_id='{cid_id}'") workspace.db.commit() del self._atlas.compound_identifications[idx] - except Exception as err: + except Exception as err: # pylint: disable=broad-except metoh.rollback_and_log(workspace.db, err) workspace.close_connection() @@ -351,8 +359,9 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None): [compound for idx, compound in enumerate(sample) if idx in keep_idxs] for sample in self._data ] if remove_idxs is None: - remove_idxs = [idx for idx, _ in enumerate(self._atlas.compound_identifications) - if idx not in keep_idxs] + remove_idxs = [ + idx for idx, _ in enumerate(self._atlas.compound_identifications) if idx not in keep_idxs + ] _ = [self._remove_compound_id(idx) for idx in sorted(remove_idxs, reverse=True)] logger.info( "Filtering reduced atlas from %d to %d compounds (%d removed).", @@ -661,7 +670,9 @@ def lcmsruns(self): @property def existing_groups(self): """Get your own groups that are prefixed by self.experiment""" - return metob.retrieve("Groups", name=f"{self.ids.experiment}%", username=self.ids.username) + return metob.retrieve( + "Groups", name=f"{self.ids.experiment}%{self.ids.analysis}_%", username=self.ids.username + ) @property def lcmsruns_dataframe(self): @@ -952,5 +963,5 @@ def parallel_process(function, data, max_cpus, unit=None): kwargs = {"file": sys.stdout, "unit": unit, "colour": "green"} if max_cpus > 1 and len(data) > 1: with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool: - return list(tqdm.tqdm(pool.imap(function, data), length=len(data), **kwargs)) + return list(tqdm.tqdm(pool.imap(function, data), total=len(data), **kwargs)) return [function(i) for i in data] diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 5e4d7a52..5be68e33 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -3148,8 +3148,6 @@ def select_groups_for_analysis(name = '%', description = [], username = '*', do_ if exclude_list: groups = remove_metatlas_objects_by_list(groups,'name',exclude_list) - print((len(groups))) - if remove_empty: groups = filter_empty_metatlas_objects(groups,'items') if do_print: diff --git a/metatlas/tools/environment.py b/metatlas/tools/environment.py index d6dd5fe7..d6a9363d 100644 --- a/metatlas/tools/environment.py +++ b/metatlas/tools/environment.py @@ -1,4 +1,10 @@ -"""Environment setup functions""" +""" +Environment setup functions + +Try to keep the imports in this file to the python standard libraries. +Otherwise some of the metatlas_repo/kernel validation errors will +not correctly report problems with the notebook configuration +""" import getpass import json @@ -6,6 +12,7 @@ import os import re import shutil +import sys from pathlib import Path @@ -38,6 +45,35 @@ def install_kernel(): logger.info('Kernel installation complete. Reload Jupyter notebook page to see new kernel". ') +def validate_kernel(): + """ + Raise error if problem with kernel + When on NERSC, this will install the correct kernel if needed + """ + allowed_exe = [ + "/global/common/software/m2650/metatlas-targeted-2021-07-16/bin/python", + ] + error_msg = "Invalid kernel setting in Jupyter Notebook." 
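A note on the parallel_process() hunk earlier in this patch: the old call passed length= to tqdm, which is not a tqdm keyword (tqdm rejects unknown keyword arguments with TqdmKeyError), and pool.imap() returns a generator whose size tqdm cannot infer, so total= must be supplied explicitly. A minimal standalone sketch of the corrected pattern, using an invented square() worker and the console tqdm variant:

# Sketch only: the worker function and data are invented for illustration.
import multiprocessing

from tqdm import tqdm


def square(num):
    return num * num


if __name__ == "__main__":
    data = list(range(1000))
    with multiprocessing.Pool(processes=4) as pool:
        # total= is required here; without it tqdm shows a bare counter, not a bar
        results = list(tqdm(pool.imap(square, data), total=len(data), unit="sample"))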
+ on_nersc = "METATLAS_LOCAL" not in os.environ + if on_nersc and sys.executable not in allowed_exe: + install_kernel() + if "/global/common/software/m2650/metatlas-targeted" in sys.executable: + logger.critical('Upgraded "Metatlas Targeted" kernel.') + logger.critical('Please reselect "Metatlas Targeted" kernel for upgrade to become active.') + else: + logger.critical('Please check that the kernel is set to "Metatlas Targeted".') + raise ValueError(error_msg) + try: + # pylint: disable=import-outside-toplevel,unused-import + import dataset # noqa: F401 + except ModuleNotFoundError as module_error: + logger.critical( + 'Could not find dataset module. Please check that the kernel is set to "Metatlas Targeted".' + ) + raise ModuleNotFoundError from module_error + logger.debug("Kernel validation passed. Using python from %s.", sys.executable) + + def repo_dir(): """Returns a string with the path to the root of the Metatlas git repo""" return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index 810d6aa4..19fc9543 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -2,13 +2,12 @@ import logging import os -import sys import pandas as pd from IPython.core.display import display, HTML from metatlas.tools.logging import activate_logging from metatlas.tools.logging import activate_module_logging -from metatlas.tools.environment import install_kernel +from metatlas.tools.environment import validate_kernel logger = logging.getLogger(__name__) @@ -26,31 +25,6 @@ def configure_environment(log_level): os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" -def validate_kernel(): - """ - Raise error if problem with kernel - When on NERSC, this will install the correct kernel if needed - """ - allowed_exe = [ - "/global/common/software/m2650/metatlas-targeted-20210521/bin/python", - ] - error_msg = "Invalid kernel setting in Jupyter Notebook." - on_nersc = "METATLAS_LOCAL" not in os.environ - if on_nersc and sys.executable not in allowed_exe: - install_kernel() - logger.critical('Please check that the kernel is set to "Metatlas Targeted".') - raise ValueError(error_msg) - try: - # pylint: disable=import-outside-toplevel,unused-import - import dataset # noqa: F401 - except ModuleNotFoundError as module_error: - logger.critical( - 'Could not find dataset module. Please check that the kernel is set to "Metatlas Targeted".' - ) - raise ModuleNotFoundError from module_error - logger.debug("Kernel validation passed. 
Using python from %s.", sys.executable) - - def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100): """Set pandas display options""" logger.debug("Settings pandas display options") diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index e88c80ce..ee83132d 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -2,13 +2,16 @@ # pylint: disable=too-many-arguments import itertools +import logging import math import os +import sys import matplotlib.pyplot as plt import matplotlib.ticker as mticker import numpy as np import pandas as pd +import tqdm from matplotlib import gridspec from sklearn.linear_model import LinearRegression, RANSACRegressor @@ -17,11 +20,13 @@ from metatlas.datastructures import metatlas_dataset as mads from metatlas.datastructures import metatlas_objects as metob from metatlas.io import metatlas_get_data_helper_fun as ma_data +from metatlas.io import write_utils from metatlas.plots import dill2plots as dp +logger = logging.getLogger(__name__) TEMPLATES = { - "postive": [ + "positive": [ "HILICz150_ANT20190824_TPL_EMA_Unlab_POS", "HILICz150_ANT20190824_TPL_QCv3_Unlab_POS", "HILICz150_ANT20190824_TPL_ISv5_Unlab_POS", @@ -37,6 +42,8 @@ ], } +TQDM_CONFIG = {"file": sys.stdout, "colour": "green"} + class Model: """Encapsulate both linear and polynomial models in a consistent interface""" @@ -50,7 +57,7 @@ def __init__(self, sk_model, intercept, coefficents): """ self.sk_model = sk_model self.intercept = intercept - if isinstance(coefficents, list): + if isinstance(coefficents, (list, np.ndarray)): self.coefficents = coefficents else: self.coefficents = [coefficents] @@ -65,7 +72,7 @@ def __repr__(self): @property def order(self): """Polynomial order of the model""" - return len(self.coefficents) + return 1 if len(self.coefficents) == 1 else len(self.coefficents) - 1 @property def name(self): @@ -74,7 +81,10 @@ def name(self): def predict(self, x_values): """Returns y values for input x""" - return self.sk_model.predict(x_values) + x_transformed = x_values.reshape(-1, 1) + if self.order > 1: + x_transformed = np.array([[i[0] ** n for n in range(self.order + 1)] for i in x_transformed]) + return self.sk_model.predict(x_transformed) def generate_rt_correction_models( @@ -86,7 +96,7 @@ def generate_rt_correction_models( ids: an AnalysisIds object matching the one used in the main notebook groups_controlled_vocab: list of strings that will group together when creating groups application of groups_controlled_vocab is case insensitive - exclude_files: list of strings that will exclude files if they are substrings of the filename + exclude_files: list of strings that will exclude files if they are substrings of the file name include_groups: group will only be used in correction if their name has a substring match to this list of strings cpus: max number of cpus to use @@ -94,25 +104,26 @@ def generate_rt_correction_models( """ # pylint: disable=too-many-locals metatlas_dataset = mads.MetatlasDataset(ids, groups_controlled_vocab, exclude_files, save_metadata=False) - qc_dir = os.path.join(ids.output_dir, "data_QC") groups = get_groups(metatlas_dataset, include_groups) files_df = get_files_df(groups) qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids) metatlas_dataset = load_runs(files_df, qc_atlas_df, qc_atlas, cpus) - save_measured_rts(metatlas_dataset, os.path.join(qc_dir, "QC_Measured_RTs.csv")) + save_measured_rts(metatlas_dataset, os.path.join(ids.output_dir, "QC_Measured_RTs.csv")) rts_df = 
get_rts(metatlas_dataset) - plot_compound_atlas_rts(metatlas_dataset, rts_df, os.path.join(qc_dir, "Compound_Atlas_RTs.pdf")) - selected_column = 9 # need to deal with this parameter, index from rts_df.columns + compound_atlas_rts_file_name = os.path.join(ids.output_dir, "Compound_Atlas_RTs.pdf") + plot_compound_atlas_rts(len(metatlas_dataset), rts_df, compound_atlas_rts_file_name) + selected_column = "median" actual_df, pred_df = actual_and_predicted_df(selected_column, rts_df, qc_atlas_df) linear, poly = generate_models(actual_df, pred_df) actual_rts, pred_rts = actual_and_predicted_rts(rts_df, qc_atlas_df, actual_df, pred_df) - actual_vs_pred_file_name = os.path.join(qc_dir, "Actual_vs_Predicted_RTs.pdf") + actual_vs_pred_file_name = os.path.join(ids.output_dir, "Actual_vs_Predicted_RTs.pdf") plot_actual_vs_pred_rts(pred_rts, actual_rts, rts_df, actual_vs_pred_file_name, linear, poly) - rt_comparison_file_name = os.path.join(qc_dir, "RT_Predicted_Model_Comparison.csv") + rt_comparison_file_name = os.path.join(ids.output_dir, "RT_Predicted_Model_Comparison.csv") save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, rt_comparison_file_name) - models_file_name = os.path.join(qc_dir, "rt_model.txt") + models_file_name = os.path.join(ids.output_dir, "rt_model.txt") write_models(models_file_name, linear, poly, groups, qc_atlas) - create_adjusted_atlases(linear, poly, qc_dir, save_to_db=save_to_db) + create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) + logger.info("RT correction notebook complete. Switch to Targeted notebook to continue.") def get_groups(metatlas_dataset, include_groups): @@ -126,13 +137,16 @@ def get_groups(metatlas_dataset, include_groups): metatlas_dataset.store_groups(exist_ok=True) ids = metatlas_dataset.ids groups = dp.select_groups_for_analysis( - name=f"{ids.experiment}_{ids.short_polarity}_%", + name=f"{ids.experiment}_{ids.short_polarity}_%{ids.analysis}_%", most_recent=True, remove_empty=True, include_list=include_groups, exclude_list=ids.short_polarity_inverse, + do_print=False, ) - return sorted(groups, key=lambda x: x.name) + ordered_groups = sorted(groups, key=lambda x: x.name) + _ = [logger.info("Selected group: %s, %s", grp.name, grp.last_modified) for grp in groups] + return ordered_groups def get_files_df(groups): @@ -151,6 +165,7 @@ def get_files_df(groups): def get_qc_atlas(ids): """Retreives template QC atlas and return tuple (atlas, atlas_df)""" qc_atlas_name = f"HILICz150_ANT20190824_TPL_QCv3_Unlab_{ids.short_polarity}" + logger.info("Loading QC Atlas %s", qc_atlas_name) atlas = metob.retrieve("Atlas", name=qc_atlas_name, username="vrsingan")[0] atlas_df = ma_data.make_atlas_df(atlas) atlas_df["label"] = [cid.name for cid in atlas.compound_identifications] @@ -167,21 +182,17 @@ def load_runs(files_df, qc_atlas_df, qc_atlas, cpus): cpus: number of cpus to use """ files = [(i[1].file, i[1].group, qc_atlas_df, qc_atlas) for i in files_df.iterrows()] + logger.info("Loading LCMS data files") return mads.parallel_process(ma_data.get_data_for_atlas_df_and_file, files, cpus, unit="sample") -def save_measured_rts(metatlas_dataset, filename): +def save_measured_rts(metatlas_dataset, file_name): """Save RT values in csv format file""" - rts_df = dp.make_output_dataframe( - input_dataset=metatlas_dataset, - fieldname="rt_peak", - use_labels=True, - summarize=True, - ) - rts_df.to_csv(filename) + rts_df = get_rts(metatlas_dataset, include_atlas_rt_peak=False) + write_utils.export_dataframe_die_on_diff(rts_df, file_name, "measured 
RT values") -def get_rts(metatlas_dataset): +def get_rts(metatlas_dataset, include_atlas_rt_peak=True): """Returns RT values in DataFrame format""" rts_df = dp.make_output_dataframe( input_dataset=metatlas_dataset, @@ -189,44 +200,46 @@ def get_rts(metatlas_dataset): use_labels=True, summarize=True, ) - rts_df["atlas RT peak"] = [ - compound["identification"].rt_references[0].rt_peak for compound in metatlas_dataset[0] - ] + if include_atlas_rt_peak: + rts_df["atlas RT peak"] = [ + compound["identification"].rt_references[0].rt_peak for compound in metatlas_dataset[0] + ] return rts_df -def plot_compound_atlas_rts(num_files, rts_df, file_name): +def plot_compound_atlas_rts(num_files, rts_df, file_name, fontsize=2, pad=0.1, cols=8): """ Writes plot of RT peak for vs file for each compound inputs: num_files: number of files in data set, ie len(metatlas_dataset) rts_df: Dataframe with RTs values - filename: where to save plot + file_name: where to save plot + fontsize: size of text + pad: padding size + cols: number of columns in plot grid """ # pylint: disable=too-many-locals - # number of columns in rts_df that are not values from a specific input file - num_not_files = len(rts_df.columns) - num_files + logger.info("Plotting RT Peak vs file for each compound") rts_df_plot = ( rts_df.sort_values(by="standard deviation", ascending=False, na_position="last") .drop(["#NaNs"], axis=1) - .dropna(axis=0, how="all", subset=rts_df.columns[:-num_not_files]) + .dropna(axis=0, how="all", subset=rts_df.columns[:num_files]) ) - fontsize = 2 - pad = 0.1 - cols = 8 rows = int(math.ceil((rts_df.shape[0] + 1) / 8)) fig = plt.figure() grid = gridspec.GridSpec(rows, cols, figure=fig, wspace=0.2, hspace=0.4) - for i, (_, row) in enumerate(rts_df_plot.iterrows()): + for i, (_, row) in tqdm.tqdm( + enumerate(rts_df_plot.iterrows()), total=len(rts_df_plot), unit="plot", **TQDM_CONFIG + ): a_x = fig.add_subplot(grid[i]) a_x.tick_params(direction="in", length=1, pad=pad, width=0.1, labelsize=fontsize) - a_x.scatter(range(rts_df_plot.shape[1] - num_not_files), row[:-num_not_files], s=0.2) - ticks_loc = np.arange(0, len(rts_df_plot.columns) - num_not_files, 1.0) - a_x.a_xhline(y=row["atlas RT peak"], color="r", linestyle="-", linewidth=0.2) - a_x.set_xlim(-0.5, len(rts_df_plot.columns) - num_not_files + 0.5) - a_x.xa_xis.set_major_locator(mticker.FixedLocator(ticks_loc)) - range_columns = list(rts_df_plot.columns[:-num_not_files]) + ["atlas RT peak"] - a_x.set_ylim(np.nanmin(row.loc[range_columns]) - 0.12, np.nanma_x(row.loc[range_columns]) + 0.12) + a_x.scatter(range(num_files), row[:num_files], s=0.2) + ticks_loc = np.arange(0, num_files, 1.0) + a_x.axhline(y=row["atlas RT peak"], color="r", linestyle="-", linewidth=0.2) + a_x.set_xlim(-0.5, num_files + 0.5) + a_x.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) + range_columns = list(rts_df_plot.columns[:num_files]) + ["atlas RT peak"] + a_x.set_ylim(np.nanmin(row.loc[range_columns]) - 0.12, np.nanmax(row.loc[range_columns]) + 0.12) _ = [s.set_linewidth(0.1) for s in a_x.spines.values()] # truncate name so it fits above a single subplot a_x.set_title(row.name[:33], pad=pad, fontsize=fontsize) @@ -263,7 +276,7 @@ def actual_and_predicted_df(selected_column, rts_df, atlas_df): atlas_df: QC atlas in dataframe format return a tuple of (actual_df, pred_df) """ - actual_df = rts_df.loc[:, rts_df.columns[selected_column]] + actual_df = rts_df.loc[:, selected_column] bad_qc_compounds = np.where(~np.isnan(actual_df)) actual_df = actual_df.iloc[bad_qc_compounds] 
pred_df = atlas_df.iloc[bad_qc_compounds][["rt_peak"]]
@@ -330,22 +343,29 @@ def save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, fi
         rts_df: dataframe with RT values
         linear: instance of class Model with first order model
         poly: instance of class Model with second order model
-        filename: where to save the plot
+        file_name: where to save the plot
     """
-    qc_df = rts_df[[rts_df.columns[selected_column]]].copy()
+    qc_df = rts_df[[selected_column]].copy()
     qc_df.columns = ["RT Measured"]
-    qc_df["RT Reference"] = qc_atlas_df["rt_peak"]
-    qc_df["RT Linear Pred"] = linear.predict(qc_df["RT Reference"])
-    qc_df["RT Polynomial Pred"] = poly.predict(qc_df["RT Reference"])
+    # qc_df["RT Reference"] = qc_atlas_df["rt_peak"]
+    qc_df.loc[:, "RT Reference"] = qc_atlas_df["rt_peak"].to_numpy()
+    qc_df.loc[:, "RT Linear Pred"] = pd.Series(
+        linear.predict(qc_df["RT Reference"].to_numpy()), index=qc_df.index
+    )
+    qc_df.loc[:, "RT Polynomial Pred"] = pd.Series(
+        poly.predict(qc_df["RT Reference"].to_numpy()), index=qc_df.index
+    )
+    # qc_df["RT Linear Pred"] = linear.predict(qc_df["RT Reference"].to_numpy())
+    # qc_df["RT Polynomial Pred"] = poly.predict(qc_df["RT Reference"].to_numpy())
     qc_df["RT Diff Linear"] = qc_df["RT Measured"] - qc_df["RT Linear Pred"]
     qc_df["RT Diff Polynomial"] = qc_df["RT Measured"] - qc_df["RT Polynomial Pred"]
-    qc_df.to_csv(file_name)
+    write_utils.export_dataframe_die_on_diff(qc_df, file_name, "model comparison")
 
 
 def write_models(file_name, linear_model, poly_model, groups, atlas):
     """
     inputs:
-        filename: text file to save model information
+        file_name: text file to save model information
         linear_model: instance of class Model with first order model
         poly_model: instance of class Model with second order model
         groups: list of groups used in model generation
@@ -360,12 +380,28 @@ def write_models(file_name, linear_model, poly_model, groups, atlas):
         out_fh.write(f"atlas = {atlas.name}\n\n")
 
 
-def create_adjusted_atlases(linear, poly, qc_dir, atlas_indices=None, free_text="", save_to_db=True):
+def get_atlas_name(template_name, ids, model, free_text):
+    """
+    input:
+        template_name: name of template atlas
+        ids: an AnalysisIds object matching the one used in the main notebook
+        model: an instance of Model
+        free_text: arbitrary string to append to atlas name
+    returns the name of the production atlas
+    """
+    prod_name = template_name.replace("TPL", "PRD")
+    prod_atlas_name = f"{prod_name}_{model.name}_{ids.project}_{ids.analysis}"
+    if free_text != "":
+        prod_atlas_name += f"_{free_text}"
+    return prod_atlas_name
+
+
+def create_adjusted_atlases(linear, poly, ids, atlas_indices=None, free_text="", save_to_db=True):
     """
     input:
         linear_model: instance of class Model with first order model
         poly_model: instance of class Model with second order model
-        qc_dir: directory to write csv files to
+        ids: an AnalysisIds object matching the one used in the main notebook
         atlas_indices: list of integers for which adjusted atlases to create
             0: EMA_Unlab
             1: QCv3_Unlab
@@ -375,34 +411,39 @@ def create_adjusted_atlases(linear, poly, ids, atlas_indices=None, free_text=
        free_text: arbitrary string to append to atlas name
        save_to_db: if True, save the atlases to the database
     """
-    if atlas_indices is None:
-        atlas_indices = [0, 4]
-    for polarity in ["positive", "negative"]:
-        for idx in atlas_indices:
-            for model in [linear, poly]:
-                template_name = TEMPLATES[polarity][idx]
-                atlas = metob.retrieve("Atlas", name=template_name, username="vrsingan")[-1]
-                prd_atlas_name = 
template_name.replace("TPL", "PRD") + f"_{model.name}" - if free_text != "": - prd_atlas_name = prd_atlas_name + "_" + free_text - prd_atlas_filename = prd_atlas_name + ".csv" - prd_atlas_df = ma_data.make_atlas_df(atlas) - prd_atlas_df["label"] = [cid.name for cid in atlas.compound_identifications] - prd_atlas_df["rt_peak"] = model.predict(prd_atlas_df["rt_peak"]) - prd_atlas_df["rt_min"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5) - prd_atlas_df["rt_max"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5) - prd_atlas_df.to_csv(os.path.join(qc_dir, prd_atlas_filename), index=False) - if save_to_db: - dp.make_atlas_from_spreadsheet( - prd_atlas_df, - prd_atlas_name, - filetype="dataframe", - sheetname="", - polarity=polarity, - store=True, - mz_tolerance=12, - ) - print(prd_atlas_name + " Created!") + atlas_indices = [0, 4] if atlas_indices is None else atlas_indices + plot_vars = [ + (polarity, idx, model) + for polarity in ["positive", "negative"] + for idx in atlas_indices + for model in [linear, poly] + ] + for polarity, idx, model in tqdm.tqdm(plot_vars, unit="atlas", **TQDM_CONFIG): + template_name = TEMPLATES[polarity][idx] + atlas = metob.retrieve("Atlas", name=template_name, username="vrsingan")[-1] + prd_atlas_name = get_atlas_name(template_name, ids, model, free_text) + logger.info("Creating atlas %s", prd_atlas_name) + prd_atlas_file_name = os.path.join(ids.output_dir, f"{prd_atlas_name}.csv") + prd_atlas_df = ma_data.make_atlas_df(atlas) + prd_atlas_df["label"] = [cid.name for cid in atlas.compound_identifications] + prd_atlas_df["rt_peak"] = model.predict(prd_atlas_df["rt_peak"].to_numpy()) + prd_atlas_df["rt_min"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5) + prd_atlas_df["rt_max"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5) + write_utils.export_dataframe_die_on_diff( + prd_atlas_df, prd_atlas_file_name, "predicted atlas", index=False + ) + logger.info("Atlas exported as %s", prd_atlas_file_name) + if save_to_db: + dp.make_atlas_from_spreadsheet( + prd_atlas_df, + prd_atlas_name, + filetype="dataframe", + sheetname="", + polarity=polarity, + store=True, + mz_tolerance=12, + ) + logger.info("Atlas %s stored in database", prd_atlas_name) def get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_number=0, polarity="positive"): @@ -415,5 +456,5 @@ def get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_n polarity: defaults to 'positive', set to 'negative' if you only have neg mode data Returns an AnalysisIds instance """ - ids = mads.AnalysisIdentifiers(None, experiment, "ISTDsEtc", polarity, analysis_number, project_directory) + ids = mads.AnalysisIdentifiers(None, experiment, "data_QC", polarity, analysis_number, project_directory) return ids diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index 087049bb..8eb9a677 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -127,19 +127,27 @@ " )\n", " raise ValueError(\"Invalid metatlas_repo_path parameter in Jupyter Notebook.\")\n", "try:\n", - " from metatlas.datastructures import metatlas_dataset as mads # noqa: E402\n", - " from metatlas.tools import notebook # noqa: E402\n", + " from metatlas.tools import environment # noqa: E402\n", + "\n", + " environment.validate_kernel()\n", "except ModuleNotFoundError as err:\n", " logging.critical(\n", " (\n", - " \"Could not find metatlas module at %s. \"\n", + " \"Could not find metatlas repo at %s. 
\"\n", " \"In the Parameters block, please check the value of metatlas_repo_path.\"\n", " ),\n", " metatlas_repo_path,\n", " )\n", " raise ModuleNotFoundError from err\n", - "%matplotlib widget\n", - "notebook.setup(log_level)" + "except ImportError as err:\n", + " logging.critical(\"A newer version of metatlas_repo is required to use this notebook.\")\n", + " raise ImportError from err\n", + "from metatlas.tools import notebook # noqa: E402\n", + "\n", + "notebook.setup(log_level)\n", + "from metatlas.datastructures import metatlas_dataset as mads # noqa: E402\n", + "\n", + "%matplotlib widget" ] }, { @@ -214,7 +222,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -228,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.11" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/tests/unit/test_metatlas_objects.py b/tests/unit/test_metatlas_objects.py index 927df141..5b2a6def 100644 --- a/tests/unit/test_metatlas_objects.py +++ b/tests/unit/test_metatlas_objects.py @@ -231,4 +231,4 @@ def test_dill(): test = mo.Group(items=[mo.Group(description="hello")]) blob = dill.dumps(test) new = dill.loads(blob) - assert new.items[0].description == "hello" + assert new.items[0].description == "hello" # pylint: disable=no-member From 6566a11c22b9226494a6e8240507b2976ad96fab Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 21 Jul 2021 17:16:22 -0700 Subject: [PATCH 053/177] Add auto notebook generation, use TQDM widget --- metatlas/datastructures/metatlas_dataset.py | 6 +- metatlas/plots/dill2plots.py | 10 ++- metatlas/tools/notebook.py | 33 ++++++++++ metatlas/tools/predict_rt.py | 72 ++++++++++++++++----- notebooks/reference/RT_Prediction.ipynb | 6 +- papermill/slurm_template.sh | 21 ++++++ 6 files changed, 120 insertions(+), 28 deletions(-) create mode 100755 papermill/slurm_template.sh diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 82416c13..a1b477a4 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -7,12 +7,11 @@ import numbers import os import shutil -import sys import tarfile import humanize import pandas as pd -import tqdm +from tqdm.notebook import tqdm from metatlas.datastructures import metatlas_objects as metob from metatlas.datastructures import object_helpers as metoh @@ -960,8 +959,7 @@ def parallel_process(function, data, max_cpus, unit=None): max_cpus: number of cpus to use unit: string label for what is processed in one iteration, default 'it' """ - kwargs = {"file": sys.stdout, "unit": unit, "colour": "green"} if max_cpus > 1 and len(data) > 1: with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool: - return list(tqdm.tqdm(pool.imap(function, data), total=len(data), **kwargs)) + return list(tqdm(pool.imap(function, data), total=len(data), unit=unit)) return [function(i) for i in data] diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 5be68e33..1ab37554 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -18,6 +18,7 @@ from metatlas.io.metatlas_get_data_helper_fun import extract # from metatlas import gui +from tqdm.notebook import tqdm from textwrap import fill, TextWrapper # import qgrid import pandas as pd @@ -2109,9 +2110,7 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False, 
'inchi_key', 'precursor_mz', 'measured_precursor_mz', 'measured_precursor_intensity'] msms_hits = pd.DataFrame(columns=all_cols).set_index(index_cols) - for compound_idx, _ in enumerate(compound_names): - sys.stdout.write('\r'+'Processing: {} / {} compounds.'.format(compound_idx+1, len(compound_names))) - sys.stdout.flush() + for compound_idx, _ in tqdm(enumerate(compound_names), unit='compound'): cid = metatlas_dataset[0][compound_idx]['identification'] name = cid.name.split('///')[0] if cid.name else getattr(cid.compound[-1], 'name', None) adduct = ma_data.extract(cid, ['mz_references', 0, 'adduct'], None) @@ -2120,7 +2119,7 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False, precursor_mz = cid.mz_references[0].mz rt_min = cid.rt_references[0].rt_min rt_max = cid.rt_references[0].rt_max - for file_idx, file_name in enumerate(file_names): + for file_idx, file_name in tqdm(enumerate(file_names), unit='file'): mfc = metatlas_dataset[file_idx][compound_idx] polarity = mfc['identification'].mz_references[0].detected_polarity try: @@ -2160,7 +2159,6 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False, scan_df['msv_query_aligned'] = [msv_sample] scan_df['msv_ref_aligned'] = [np.full_like(msv_sample, np.nan)] msms_hits = msms_hits.append(scan_df) - sys.stdout.write('\n'+'Done!!!\n') return msms_hits @@ -3009,7 +3007,7 @@ def make_atlas_from_spreadsheet(filename, atlas_name, filetype, sheetname=None, check_filenames(atlas_df, 'file_msms') atlas = get_atlas(atlas_name, atlas_df, polarity, mz_tolerance) if store: - logger.debug('Saving atlas named %s to DB.', atlas_name) + logger.info('Saving atlas named %s to DB.', atlas_name) metob.store(atlas) return atlas diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index 19fc9543..2cf87fcb 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -1,5 +1,6 @@ """Jupyter notebook helper functions""" +import json import logging import os @@ -60,3 +61,35 @@ def activate_sql_logging(console_level="INFO", console_format=None, file_level=" """ logger.debug("Activaing SQL logging with console_level=%s and file_level=%s.", console_level, file_level) activate_module_logging("sqlalchemy.engine", console_level, console_format, file_level, filename) + + +def create_notebook(input_file_name, output_file_name, parameters, injection_cell=2): + """ + Copies from input_file_name to output_file_name and then places the parameters into a + cell of the output notebook. 
+ inputs: + input_file_name: source notebook + output_file_name: destination notebook + parameters: dict where keys are LHS of assignment and values are RHS of assignment + injection_cell: zero-indexed number of cell to overwrite with the parameters + """ + with open(input_file_name, "r") as in_fh: + notebook = json.load(in_fh) + notebook["cells"][injection_cell]["source"] = [assignment_string(k, v) for k, v in parameters.items()] + with open(output_file_name, "w", encoding="utf-8") as out_fh: + json.dump(notebook, out_fh, ensure_ascii=False, indent=4) + logger.info("Created jupyter notebook %s", output_file_name) + + +def assignment_string(lhs, rhs): + """ + inputs: + lhs: name of variable to be assigned value + rhs: python object that will be assigned + returns a string + """ + if isinstance(rhs, bool): + rhs_str = "True" if rhs else "False" + else: + rhs_str = json.dumps(rhs) + return f"{lhs} = {rhs_str}\n" diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index ee83132d..9929c35c 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -5,23 +5,25 @@ import logging import math import os -import sys + +from pathlib import Path import matplotlib.pyplot as plt import matplotlib.ticker as mticker import numpy as np import pandas as pd -import tqdm from matplotlib import gridspec from sklearn.linear_model import LinearRegression, RANSACRegressor from sklearn.preprocessing import PolynomialFeatures +from tqdm.notebook import tqdm from metatlas.datastructures import metatlas_dataset as mads from metatlas.datastructures import metatlas_objects as metob from metatlas.io import metatlas_get_data_helper_fun as ma_data from metatlas.io import write_utils from metatlas.plots import dill2plots as dp +from metatlas.tools import notebook logger = logging.getLogger(__name__) @@ -42,8 +44,6 @@ ], } -TQDM_CONFIG = {"file": sys.stdout, "colour": "green"} - class Model: """Encapsulate both linear and polynomial models in a consistent interface""" @@ -88,7 +88,14 @@ def predict(self, x_values): def generate_rt_correction_models( - ids, groups_controlled_vocab, exclude_files, include_groups, cpus, save_to_db=True + ids, + groups_controlled_vocab, + exclude_files, + include_groups, + cpus, + repo_dir, + save_to_db=True, + use_poly_model=True, ): """ Generate the RT correction models and associated atlases with adjusted RT values @@ -100,7 +107,11 @@ def generate_rt_correction_models( include_groups: group will only be used in correction if their name has a substring match to this list of strings cpus: max number of cpus to use + repo_dir: location of metatlas git repo on local filesystem save_to_db: If True, save the new atlases to the database + use_poly_model: If True, use the polynomial model, else use linear model + Both types of models are always generated, this only determines which ones + are pre-populated into the generated notebooks """ # pylint: disable=too-many-locals metatlas_dataset = mads.MetatlasDataset(ids, groups_controlled_vocab, exclude_files, save_metadata=False) @@ -122,7 +133,8 @@ def generate_rt_correction_models( save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, rt_comparison_file_name) models_file_name = os.path.join(ids.output_dir, "rt_model.txt") write_models(models_file_name, linear, poly, groups, qc_atlas) - create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) + atlases = create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) + write_notebooks(ids, atlases, repo_dir, use_poly_model) 
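To make the two notebook helpers above concrete: assignment_string() JSON-encodes the right-hand side, so strings come out quoted and booleans need the special case (json.dumps(True) would render lowercase "true", which is not valid Python), and create_notebook() rewrites one cell of a copied notebook with those assignment lines. A hypothetical usage sketch; the file names and parameter values here are invented:

from metatlas.tools import notebook

# Each parameter becomes one "name = value" source line:
assert notebook.assignment_string("polarity", "positive") == 'polarity = "positive"\n'
assert notebook.assignment_string("save_to_db", True) == "save_to_db = True\n"

# Copy a reference notebook, overwriting its parameters cell
# (cell index 2 is the injection_cell default):
params = {"experiment": "REPLACE ME", "analysis_number": 0, "max_cpus": 4}
notebook.create_notebook("reference/Targeted.ipynb", "out/Targeted.ipynb", params)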
logger.info("RT correction notebook complete. Switch to Targeted notebook to continue.") @@ -218,7 +230,6 @@ def plot_compound_atlas_rts(num_files, rts_df, file_name, fontsize=2, pad=0.1, c pad: padding size cols: number of columns in plot grid """ - # pylint: disable=too-many-locals logger.info("Plotting RT Peak vs file for each compound") rts_df_plot = ( rts_df.sort_values(by="standard deviation", ascending=False, na_position="last") @@ -228,16 +239,13 @@ def plot_compound_atlas_rts(num_files, rts_df, file_name, fontsize=2, pad=0.1, c rows = int(math.ceil((rts_df.shape[0] + 1) / 8)) fig = plt.figure() grid = gridspec.GridSpec(rows, cols, figure=fig, wspace=0.2, hspace=0.4) - for i, (_, row) in tqdm.tqdm( - enumerate(rts_df_plot.iterrows()), total=len(rts_df_plot), unit="plot", **TQDM_CONFIG - ): + for i, (_, row) in tqdm(enumerate(rts_df_plot.iterrows()), total=len(rts_df_plot), unit="plot"): a_x = fig.add_subplot(grid[i]) a_x.tick_params(direction="in", length=1, pad=pad, width=0.1, labelsize=fontsize) a_x.scatter(range(num_files), row[:num_files], s=0.2) - ticks_loc = np.arange(0, num_files, 1.0) a_x.axhline(y=row["atlas RT peak"], color="r", linestyle="-", linewidth=0.2) a_x.set_xlim(-0.5, num_files + 0.5) - a_x.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) + a_x.xaxis.set_major_locator(mticker.FixedLocator(np.arange(0, num_files, 1.0))) range_columns = list(rts_df_plot.columns[:num_files]) + ["atlas RT peak"] a_x.set_ylim(np.nanmin(row.loc[range_columns]) - 0.12, np.nanmax(row.loc[range_columns]) + 0.12) _ = [s.set_linewidth(0.1) for s in a_x.spines.values()] @@ -410,7 +418,9 @@ def create_adjusted_atlases(linear, poly, ids, atlas_indices=None, free_text="", 4: IS_LabUnlab2 free_text: arbitrary string to append to atlas name save_to_db: if True, save the atlases to the database + returns a list of the names of atlases """ + # pylint: disable=too-many-locals atlas_indices = [0, 4] if atlas_indices is None else atlas_indices plot_vars = [ (polarity, idx, model) @@ -418,10 +428,12 @@ def create_adjusted_atlases(linear, poly, ids, atlas_indices=None, free_text="", for idx in atlas_indices for model in [linear, poly] ] - for polarity, idx, model in tqdm.tqdm(plot_vars, unit="atlas", **TQDM_CONFIG): + out = [] + for polarity, idx, model in tqdm(plot_vars, unit="atlas"): template_name = TEMPLATES[polarity][idx] atlas = metob.retrieve("Atlas", name=template_name, username="vrsingan")[-1] prd_atlas_name = get_atlas_name(template_name, ids, model, free_text) + out.append(prd_atlas_name) logger.info("Creating atlas %s", prd_atlas_name) prd_atlas_file_name = os.path.join(ids.output_dir, f"{prd_atlas_name}.csv") prd_atlas_df = ma_data.make_atlas_df(atlas) @@ -432,7 +444,6 @@ def create_adjusted_atlases(linear, poly, ids, atlas_indices=None, free_text="", write_utils.export_dataframe_die_on_diff( prd_atlas_df, prd_atlas_file_name, "predicted atlas", index=False ) - logger.info("Atlas exported as %s", prd_atlas_file_name) if save_to_db: dp.make_atlas_from_spreadsheet( prd_atlas_df, @@ -443,7 +454,38 @@ def create_adjusted_atlases(linear, poly, ids, atlas_indices=None, free_text="", store=True, mz_tolerance=12, ) - logger.info("Atlas %s stored in database", prd_atlas_name) + return out + + +def write_notebooks(ids, atlases, repo_dir, use_poly_model): + """ + Creates Targeted analysis jupyter notebooks with pre-populated parameter sets + Inputs: + ids: an AnalysisIds object matching the one used in the main notebook + atlases: list of atlas names to use as source atlases + repo_dir: 
location of metatlas git repo on local filesystem + use_poly_model: if True use polynomial RT prediction model, else use linear model + this value is used to filter atlases from the input atlases list + """ + for atlas_name in atlases: + if (use_poly_model and "linear" in atlas_name) or (not use_poly_model and "polynomial" in atlas_name): + continue + polarity = "positive" if "_POS_" in atlas_name else "negative" + short_polarity = "POS" if polarity == "positive" else "NEG" + output_type = "FinalEMA-HILIC" if "EMA_Unlab" in atlas_name else "ISTDsEtc" + repo_path = Path(__file__).resolve().parent.parent.parent + source = repo_path / "notebooks" / "reference" / "Targeted.ipynb" + dest = Path(ids.output_dir).resolve().parent / f"{ids.project}_{output_type}_{short_polarity}.ipynb" + parameters = { + "experiment": ids.experiment, + "output_type": output_type, + "polarity": polarity, + "analysis_number": 0, + "metatlas_repo_path": repo_dir, + "project_directory": ids.project_directory, + "source_atlas": atlas_name, + } + notebook.create_notebook(source, dest, parameters) def get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_number=0, polarity="positive"): diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb index 27c8f93e..4e9d4191 100644 --- a/notebooks/reference/RT_Prediction.ipynb +++ b/notebooks/reference/RT_Prediction.ipynb @@ -123,7 +123,7 @@ "outputs": [], "source": [ "predict_rt.generate_rt_correction_models(\n", - " ids, groups_controlled_vocab, exclude_files, include_groups, max_cpus\n", + " ids, groups_controlled_vocab, exclude_files, include_groups, max_cpus, metatlas_repo_path\n", ")" ] } @@ -131,7 +131,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -145,7 +145,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.11" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/papermill/slurm_template.sh b/papermill/slurm_template.sh new file mode 100755 index 00000000..d76bee15 --- /dev/null +++ b/papermill/slurm_template.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -C haswell +#SBATCH --account=gtrnd +#SBATCH --qos=genepool +#SBATCH --mail-type=ALL +#SBATCH -t 02:00:00 + +#OpenMP settings: +export OMP_NUM_THREADS=1 +export OMP_PLACES=threads +export OMP_PROC_BIND=spread + +date +echo "input file: $IN_FILE" +echo "output file: $OUT_FILE" +eval "$(conda shell.bash hook)" +conda activate /global/common/software/m2650/metatlas-targeted-2021-07-16 + +srun -n 1 -c 64 --cpu_bind=cores papermill "$IN_FILE" "$OUT_FILE" $PARAMETERS + From c29c1b690498e9e5c8206ee60ce0ffdc344ed770 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 21 Jul 2021 19:39:13 -0700 Subject: [PATCH 054/177] Remove repeated polarity for atlas export filename --- metatlas/io/targeted_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 4a350134..88d287a4 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -20,7 +20,7 @@ def write_atlas_to_spreadsheet(metatlas_dataset, overwrite=False): """Save atlas as csv file. 
Will not overwrite existing file unless overwrite is True""" export_atlas_filename = os.path.join( metatlas_dataset.ids.output_dir, - f"{metatlas_dataset.ids.short_polarity}_{metatlas_dataset.atlas.name}_export.csv", + f"{metatlas_dataset.atlas.name}_export.csv", ) write_utils.check_existing_file(export_atlas_filename, overwrite) dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas, export_atlas_filename) From 7e66a278982c706980a673cbdb2c5e0bd1175dab Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 26 Jul 2021 09:43:13 -0700 Subject: [PATCH 055/177] Pass max_cpus to allow parallel processing --- notebooks/reference/Targeted.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index 8eb9a677..093e55ef 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -168,7 +168,7 @@ "outputs": [], "source": [ "metatlas_dataset = mads.MetatlasDataset(\n", - " ids, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files\n", + " ids, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files, max_cpus=max_cpus\n", ")" ] }, From 1edbe8d3208b2e1dd02b2b02deb2f2ac1ef0c8d5 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 27 Jul 2021 13:32:51 -0700 Subject: [PATCH 056/177] improved logging --- metatlas/datastructures/metatlas_dataset.py | 5 ++++- metatlas/io/targeted_output.py | 10 ++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index a1b477a4..27ee8739 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -283,7 +283,6 @@ def _get_atlas(self): def _build(self): """Populate self._data from database and h5 files.""" - logger.info("Loading data into MetatlasDataset") start_time = datetime.datetime.now() files = [] for group in self.groups: @@ -298,6 +297,7 @@ def _build(self): self.extra_mz, ) ) + logger.info('Reading MSMS data from h5 files') samples = parallel_process( ma_data.get_data_for_atlas_df_and_file, files, self.max_cpus, unit="sample" ) @@ -484,6 +484,7 @@ def atlas_df(self): """atlas_df getter, update ._atlas_df if necessary""" if not self._atlas_df_valid: start_time = datetime.datetime.now() + logger.info('Generating atlas_df') self._atlas_df = ma_data.make_atlas_df(self.atlas) self._atlas_df_valid = True logger.info( @@ -960,6 +961,8 @@ def parallel_process(function, data, max_cpus, unit=None): unit: string label for what is processed in one iteration, default 'it' """ if max_cpus > 1 and len(data) > 1: + logger.debug('Starting parallel processing of %s with %d cpus.', function.__name__, max_cpus) with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool: return list(tqdm(pool.imap(function, data), total=len(data), unit=unit)) + logger.debug('Processing of %s with 1 cpu.', function.__name__) return [function(i) for i in data] diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 88d287a4..8294ea82 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -57,6 +57,12 @@ def write_stats_table( if False filter out row if MSMS thresholds are not passing overwrite: if True, will write over existing files """ + prefix = f"{metatlas_dataset.ids.short_polarity}_" + scores_path = os.path.join( + metatlas_dataset.ids.output_dir, f"{prefix}stats_tables", f"{prefix}compound_scores.csv" + ) + _ = 
metatlas_dataset.hits # regenerate hits if needed before logging about scores + logger.info('Calculating scores and exporting them to %s.', scores_path) scores_df = fa.make_scores_df(metatlas_dataset, metatlas_dataset.hits) scores_df["passing"] = fa.test_scores_df( scores_df, @@ -68,10 +74,6 @@ def write_stats_table( min_num_frag_matches, min_relative_frag_intensity, ) - prefix = f"{metatlas_dataset.ids.short_polarity}_" - scores_path = os.path.join( - metatlas_dataset.ids.output_dir, f"{prefix}stats_tables", f"{prefix}compound_scores.csv" - ) write_utils.export_dataframe(scores_df, scores_path, "scores", overwrite) fa.make_stats_table( input_dataset=metatlas_dataset, From ff92f55a5a74386fda0bd033e7d94772efc53f95 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 27 Jul 2021 13:33:40 -0700 Subject: [PATCH 057/177] Add RT bounds matching for similar compounds --- metatlas/plots/dill2plots.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 1ab37554..c860367d 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -626,6 +626,7 @@ def press(self, event): self.data[0][self.compound_idx]['identification'].mz_references[0].adduct ) self.hit_ctr = 0 + self.match_idx = None self.update_plots() elif event.key in ['left', 'h']: if self.compound_idx > 0: @@ -636,6 +637,7 @@ def press(self, event): self.data[0][self.compound_idx]['identification'].mz_references[0].adduct ) self.hit_ctr = 0 + self.match_idx = None self.update_plots() elif event.key in ['up', 'k']: if self.hit_ctr > 0: @@ -665,6 +667,17 @@ def press(self, event): self.similar_compounds = self.get_similar_compounds() logger.debug("Enabling highlight of similar compounds on EIC plot.") self.highlight_similar_compounds() + elif event.key == 'm': + num_sim = len(self.similar_compounds) + if num_sim > 0: + self.match_idx = 0 if self.match_idx is None else (self.match_idx + 1) % num_sim + self.match_rts() + + def match_rts(self): + """Sets RT min and max to match similar compound referenced by match_idx""" + source = self.similar_compounds[self.match_idx]['rt'] + self.update_rt('rt_min', source.rt_min) + self.update_rt('rt_max', source.rt_max) def update_y_scale(self, val): if self.slider_y_min < 0: @@ -787,6 +800,7 @@ def __init__(self, self.y_max = y_max self.y_min = y_min self.data = filter_runs(data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) + self.match_idx = None # create figure and first axes self.fig,self.ax = plt.subplots(figsize=(width, height)) @@ -2082,6 +2096,17 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False, pre_query='database == "metatlas"', query=None, ref_dtypes=None, ref_loc=None, ref_df=None, frag_mz_tolerance=.005, ref_index=None, do_centroid=False): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="Mean of empty slice") + return get_msms_hits_with_warnings(metatlas_dataset, extra_time, keep_nonmatches, pre_query, query, + ref_dtypes, ref_loc, ref_df, frag_mz_tolerance, ref_index, + do_centroid) + + +def get_msms_hits_with_warnings(metatlas_dataset, extra_time=False, keep_nonmatches=False, + pre_query='database == "metatlas"', query=None, ref_dtypes=None, + ref_loc=None, ref_df=None, frag_mz_tolerance=.005, ref_index=None, + do_centroid=False): if query is None: pre_mz_decimal = ".5*(@pre_mz_ppm**-decimal)/(decimal+1)" offset = f".5*(({pre_mz_decimal} + .005 + ({pre_mz_decimal} - .005)**2)**.5)" @@ 
@@ -2110,7 +2135,7 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False,
                 'inchi_key', 'precursor_mz', 'measured_precursor_mz',
                 'measured_precursor_intensity']
     msms_hits = pd.DataFrame(columns=all_cols).set_index(index_cols)
-    for compound_idx, _ in tqdm(enumerate(compound_names), unit='compound'):
+    for compound_idx, _ in enumerate(tqdm(compound_names, unit='compound')):
         cid = metatlas_dataset[0][compound_idx]['identification']
         name = cid.name.split('///')[0] if cid.name else getattr(cid.compound[-1], 'name', None)
         adduct = ma_data.extract(cid, ['mz_references', 0, 'adduct'], None)
@@ -2119,7 +2144,7 @@ def get_msms_hits(metatlas_dataset, extra_time=False, keep_nonmatches=False,
         precursor_mz = cid.mz_references[0].mz
         rt_min = cid.rt_references[0].rt_min
         rt_max = cid.rt_references[0].rt_max
-        for file_idx, file_name in tqdm(enumerate(file_names), unit='file'):
+        for file_idx, file_name in enumerate(file_names):
             mfc = metatlas_dataset[file_idx][compound_idx]
             polarity = mfc['identification'].mz_references[0].detected_polarity
             try:

From a36153c349444fc9d53a795dea16283907f62bab Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Tue, 27 Jul 2021 18:11:24 -0700
Subject: [PATCH 058/177] Add slurm submission of RT_predict via papermill

---
 papermill/launch_rt_prediction.sh | 25 +++++++++++++++++++++++++
 papermill/slurm_template.sh       |  4 ++--
 2 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 papermill/launch_rt_prediction.sh

diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh
new file mode 100644
index 00000000..0eb1dc60
--- /dev/null
+++ b/papermill/launch_rt_prediction.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -euf -o pipefail
+
+if [ "$#" -ne 3 ]; then
+    echo "Usage $0: experiment_name analysis_number project_directory" >&2; exit 1
+fi
+
+EXP="$1"
+ANALYSIS_NUM="$2"
+PROJECT_DIR="$3"
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+REPO_DIR="$(dirname "$SCRIPT_DIR")"
+EXP_DIR="${PROJECT_DIR}/$EXP"
+
+IFS='_' read -ra TOKENS <<< "$EXP"
+PROPOSAL="${TOKENS[0]}"
+
+mkdir -p "$EXP_DIR"
+
+export IN_FILE="${REPO_DIR}/notebooks/reference/RT_Prediction.ipynb"
+export OUT_FILE="${EXP_DIR}/${PROPOSAL}_RT_Prediction_papermill_${ANALYSIS_NUM}.ipynb"
+export PARAMETERS="-p experiment $EXP -p metatlas_repo_path $REPO_DIR -p project_directory $PROJECT_DIR -p max_cpus 32 -p analysis_number $ANALYSIS_NUM"
+
+sbatch -J "${PROPOSAL}_RT_Pred" "${REPO_DIR}/papermill/slurm_template.sh"
diff --git a/papermill/slurm_template.sh b/papermill/slurm_template.sh
index d76bee15..388d183e 100755
--- a/papermill/slurm_template.sh
+++ b/papermill/slurm_template.sh
@@ -11,11 +11,11 @@ export OMP_NUM_THREADS=1
 export OMP_PLACES=threads
 export OMP_PROC_BIND=spread

+CONDA_DIR="$(dirname "$(dirname "$(grep 'metatlas-targeted' ../notebooks/kernels/metatlas-targeted.kernel.json | cut -d\" -f 2)")")"
 date
 echo "input file: $IN_FILE"
 echo "output file: $OUT_FILE"
 eval "$(conda shell.bash hook)"
-conda activate /global/common/software/m2650/metatlas-targeted-2021-07-16
+conda activate "$CONDA_DIR"
 srun -n 1 -c 64 --cpu_bind=cores papermill "$IN_FILE" "$OUT_FILE" $PARAMETERS
-

From dfcedf3fae9f7d551924617a3e67ce1a4ea4284c Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 29 Jul 2021 14:26:25 -0700
Subject: [PATCH 059/177] Fix file comparison bug

---
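The one-line change below makes filecmp.cmp read file contents instead of trusting os.stat() metadata. With the default shallow=True, two files with equal size and mtime can be declared identical without any bytes being compared, which defeats the export-versus-existing-file check here. A short sketch of the distinction (file names illustrative):

    import filecmp

    # shallow=True (the default) may decide from the os.stat() signature alone;
    # shallow=False always performs a byte-by-byte comparison of the contents
    identical = filecmp.cmp("existing.csv", "exported.csv", shallow=False)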
 metatlas/io/write_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py
index ae8aecaa..7154f073 100644
--- a/metatlas/io/write_utils.py
+++ b/metatlas/io/write_utils.py
@@ -55,7 +55,7 @@ def raise_on_diff(dataframe, file_path, description, **kwargs):
         return
     with tempfile.NamedTemporaryFile(delete=False) as temp_path:
         dataframe.to_csv(temp_path, **kwargs)
-    same = filecmp.cmp(file_path, temp_path.name)
+    same = filecmp.cmp(file_path, temp_path.name, shallow=False)
     os.remove(temp_path.name)
     if same:
         logger.info("Data in %s is the same as %s.", description, file_path)

From e05ea90cbeaf085d88dcc36d47b177035de128f7 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 29 Jul 2021 14:31:01 -0700
Subject: [PATCH 060/177] Improve parallelization and rename group methods

---
 metatlas/datastructures/metatlas_dataset.py | 74 +++++++---------
 metatlas/io/targeted_output.py              |  9 +-
 metatlas/plots/dill2plots.py                | 93 ++++++++++-----------
 metatlas/tools/parallel.py                  | 29 +++++++
 metatlas/tools/predict_rt.py                |  5 +-
 noxfile.py                                  |  1 +
 tests/unit/test_metatlas_dataset.py         |  4 +-
 7 files changed, 116 insertions(+), 99 deletions(-)
 create mode 100644 metatlas/tools/parallel.py

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 27ee8739..5aa5e671 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -3,7 +3,6 @@
 import getpass
 import glob
 import logging
-import multiprocessing
 import numbers
 import os
 import shutil
@@ -11,7 +10,6 @@

 import humanize
 import pandas as pd
-from tqdm.notebook import tqdm

 from metatlas.datastructures import metatlas_objects as metob
 from metatlas.datastructures import object_helpers as metoh
@@ -19,6 +17,7 @@
 from metatlas.io import targeted_output
 from metatlas.io import write_utils
 from metatlas.plots import dill2plots as dp
+from metatlas.tools import parallel

 MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab"
 POLARITIES = ["positive", "negative", "fast-polarity-switching"]
@@ -164,6 +163,10 @@ class MetatlasDataset:
     automatically is re-calculated the next time you access the second property. For example:
         metatlas_dataset.extra_time = 0.5  # this invalidates the current hits property
         metatlas_dataset.hits  # this re-generates the hits before returning them
+
+    MetatlasDataset also has methods for updating RT values and identification notes while keeping
+    the atlas, atlas_df, metatlas_dataset, and database in sync. This removes the need to do kernel
+    restarts between steps in the workflow.
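
    An illustrative sketch of such an update (set_rt is a hypothetical name used
    here only to show the intent; it is not an API defined by this patch):
        metatlas_dataset.set_rt(0, "rt_min", 1.9)  # hypothetical synchronized updater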
""" # pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-public-methods @@ -201,8 +204,8 @@ def __init__( self._hits = None self._hits_valid = False # based on all hits dependencies except RT min/max values self._hits_valid_for_rt_bounds = False # based only on RT min/max changes - self._groups = None - self._groups_valid = False + self._all_groups = None + self._all_groups_valid = False self._groups_controlled_vocab = [] if groups_controlled_vocab is None else groups_controlled_vocab self._exclude_files = [] if exclude_files is None else exclude_files self._extra_time = extra_time @@ -217,7 +220,7 @@ def __init__( logger.debug("Writing MetatlasDataset metadata files") self.write_data_source_files() self.write_lcmsruns_short_names() - self.store_groups(exist_ok=True) + self.store_all_groups(exist_ok=True) def write_data_source_files(self): """Write the data source files if they don't already exist""" @@ -297,9 +300,9 @@ def _build(self): self.extra_mz, ) ) - logger.info('Reading MSMS data from h5 files') - samples = parallel_process( - ma_data.get_data_for_atlas_df_and_file, files, self.max_cpus, unit="sample" + logger.info("Reading MSMS data from h5 files") + samples = parallel.parallel_process( + ma_data.get_data_for_atlas_df_and_file, files, self.max_cpus, unit="sample", spread_args=False ) self._data = tuple(MetatlasSample(x) for x in samples) logger.info( @@ -484,7 +487,7 @@ def atlas_df(self): """atlas_df getter, update ._atlas_df if necessary""" if not self._atlas_df_valid: start_time = datetime.datetime.now() - logger.info('Generating atlas_df') + logger.info("Generating atlas_df") self._atlas_df = ma_data.make_atlas_df(self.atlas) self._atlas_df_valid = True logger.info( @@ -737,7 +740,12 @@ def _files_dict(self): return file_dict @property - def groups_dataframe(self): + def groups(self): + """This needs to be updated to only return the currently selected groups""" + return self.all_groups + + @property + def all_groups_dataframe(self): """Returns pandas Dataframe with one row per file""" out = pd.DataFrame(self._files_dict).T if out.empty: @@ -747,15 +755,15 @@ def groups_dataframe(self): return out.reset_index() @property - def groups(self): + def all_groups(self): """Returns a list of Group objects""" - if self._groups_valid: - return self._groups + if self._all_groups_valid: + return self._all_groups file_dict = self._files_dict - self._groups = [] - unique_groups = self.groups_dataframe[["group", "short_name"]].drop_duplicates() + self._all_groups = [] + unique_groups = self.all_groups_dataframe[["group", "short_name"]].drop_duplicates() for values in unique_groups.to_dict("index").values(): - self._groups.append( + self._all_groups.append( metob.Group( name=values["group"], short_name=values["short_name"], @@ -766,10 +774,10 @@ def groups(self): ], ) ) - self._groups_valid = True - return self._groups + self._all_groups_valid = True + return self._all_groups - def store_groups(self, exist_ok=False): + def store_all_groups(self, exist_ok=False): """ Save self.object_list to DB inputs: @@ -778,7 +786,7 @@ def store_groups(self, exist_ok=False): """ if not exist_ok: db_names = {group.name for group in self.existing_groups} - new_names = set(self.groups_dataframe["group"].to_list()) + new_names = set(self.all_groups_dataframe["group"].to_list()) overlap = db_names.intersection(new_names) try: if overlap: @@ -789,8 +797,8 @@ def store_groups(self, exist_ok=False): except ValueError as err: logger.exception(err) raise err - logger.debug("Storing %d groups 
in the database", len(self.groups)) - metob.store(self.groups) + logger.debug("Storing %d groups in the database", len(self.all_groups)) + metob.store(self.all_groups) def compound_idxs_not_evaluated(self): """NOT YET IMPLEMENTED""" @@ -832,9 +840,9 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): logger.info("extra_time set to 0.5 minutes for output generation.") targeted_output.write_atlas_to_spreadsheet(self, overwrite) targeted_output.write_stats_table(self, overwrite) - targeted_output.write_chromatograms(self, overwrite) + targeted_output.write_chromatograms(self, overwrite, max_cpus=self.max_cpus) targeted_output.write_identification_figure(self, overwrite) - targeted_output.write_metrics_and_boxplots(self, overwrite) + targeted_output.write_metrics_and_boxplots(self, overwrite, max_cpus=self.max_cpus) if msms_fragment_ions: targeted_output.write_msms_fragment_ions(self, overwrite) logger.info("Generation of output files completed sucessfully.") @@ -948,21 +956,3 @@ def get_atlas(name, username): def quoted_string_list(strings): """Adds double quotes around each string and seperates with ', '.""" return ", ".join([f'"{x}"' for x in strings]) - - -def parallel_process(function, data, max_cpus, unit=None): - """ - performs map(function, data) using multiprocessing module but - adds a progress bar and bypasses multiprocessing in the 1 cpu case as this makes debugging easier - inputs: - function: the function to apply - data: iterater containing the inputs to function - max_cpus: number of cpus to use - unit: string label for what is processed in one iteration, default 'it' - """ - if max_cpus > 1 and len(data) > 1: - logger.debug('Starting parallel processing of %s with %d cpus.', function.__name__, max_cpus) - with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool: - return list(tqdm(pool.imap(function, data), total=len(data), unit=unit)) - logger.debug('Processing of %s with 1 cpu.', function.__name__) - return [function(i) for i in data] diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 8294ea82..dd7f2e55 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -62,7 +62,7 @@ def write_stats_table( metatlas_dataset.ids.output_dir, f"{prefix}stats_tables", f"{prefix}compound_scores.csv" ) _ = metatlas_dataset.hits # regenerate hits if needed before logging about scores - logger.info('Calculating scores and exporting them to %s.', scores_path) + logger.info("Calculating scores and exporting them to %s.", scores_path) scores_df = fa.make_scores_df(metatlas_dataset, metatlas_dataset.hits) scores_df["passing"] = fa.test_scores_df( scores_df, @@ -91,7 +91,7 @@ def write_stats_table( ) -def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwrite=False): +def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwrite=False, max_cpus=1): """ inputs: metatlas_dataset: a MetatlasDataset instance @@ -112,6 +112,7 @@ def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwr short_names_header="short_samplename", polarity=metatlas_dataset.ids.short_polarity, overwrite=overwrite, + max_cpus=max_cpus, ) @@ -131,7 +132,7 @@ def write_identification_figure(metatlas_dataset, overwrite=False): ) -def write_metrics_and_boxplots(metatlas_dataset, overwrite=False): +def write_metrics_and_boxplots(metatlas_dataset, overwrite=False, max_cpus=1): """ Save metrics dataframes as csv and boxplots as PDF. 
Will not overwrite existing file unless overwrite is True @@ -161,7 +162,7 @@ def write_metrics_and_boxplots(metatlas_dataset, overwrite=False): metatlas_dataset.ids.output_dir, f"{prefix}boxplot_{fields['name']}", ) - dp.make_boxplot_plots(dataframe, output_loc=plot_dir, ylabel=fields["label"], overwrite=overwrite) + dp.make_boxplot_plots(dataframe, plot_dir, fields["label"], overwrite, max_cpus) Max = namedtuple("Max", ["file_idx", "pre_intensity_idx", "pre_intensity", "precursor_mz"]) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index c860367d..4d779592 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -9,14 +9,13 @@ # os.environ['R_LIBS_USER'] = '/project/projectdirs/metatlas/r_pkgs/' # curr_ld_lib_path = '' - from metatlas.datastructures import metatlas_objects as metob from metatlas.io import metatlas_get_data_helper_fun as ma_data from metatlas.io import write_utils -from metatlas.tools import spectralprocessing as sp -from metatlas.plots import chromplotplus as cpp from metatlas.io.metatlas_get_data_helper_fun import extract -# from metatlas import gui +from metatlas.plots import chromplotplus as cpp +from metatlas.tools import parallel +from metatlas.tools import spectralprocessing as sp from tqdm.notebook import tqdm from textwrap import fill, TextWrapper @@ -1601,33 +1600,35 @@ def plot_errorbar_plots(df,output_loc='', use_shortnames=True, ylabel=""): plt.close(f)#f.clear() -def make_boxplot_plots(df, output_loc='', use_shortnames=True, ylabel="", overwrite=True): +def make_boxplot_plots(df, output_loc='', use_shortnames=True, ylabel="", overwrite=True, max_cpus=1): output_loc = os.path.expandvars(output_loc) plt.ioff() - for compound in df.index: - f, ax = plt.subplots(1, 1,figsize=(12,12)) - if use_shortnames and 'short groupname' in df.columns.names: - g = df.loc[compound].groupby(level='short groupname') - g.apply(pd.DataFrame).plot(kind='box',ax=ax) - else: - g = df.loc[compound].groupby(level='group') - g.apply(pd.DataFrame).plot(kind='box',ax=ax) + args = [(compound, df, output_loc, use_shortnames, ylabel, overwrite) for compound in df.index] + parallel.parallel_process(make_boxplot, args, max_cpus, unit='plot') - for i, (n, grp) in enumerate(g): - x = [i+1] *len(grp) - x = np.random.normal(x, 0.04, size=len(x)) - plt.scatter(x, grp) - ax.set_title(compound,fontsize=12,weight='bold') - plt.xticks(rotation=90) - if ylabel != "": - plt.ylabel(ylabel) - plt.tight_layout() - fig_path = os.path.join(output_loc, compound + '_boxplot.pdf') - write_utils.check_existing_file(fig_path, overwrite) - f.savefig(fig_path) - #f.clear() - plt.close(f)#f.clear() - logger.info('Exported box plot of %s for %s at %s.', ylabel, compound, fig_path) + +def make_boxplot(compound, df, output_loc, use_shortnames, ylabel, overwrite): + f, ax = plt.subplots(1, 1,figsize=(12,12)) + if use_shortnames and 'short groupname' in df.columns.names: + g = df.loc[compound].groupby(level='short groupname') + g.apply(pd.DataFrame).plot(kind='box',ax=ax) + else: + g = df.loc[compound].groupby(level='group') + g.apply(pd.DataFrame).plot(kind='box',ax=ax) + for i, (n, grp) in enumerate(g): + x = [i+1] *len(grp) + x = np.random.normal(x, 0.04, size=len(x)) + plt.scatter(x, grp) + ax.set_title(compound,fontsize=12,weight='bold') + plt.xticks(rotation=90) + if ylabel != "": + plt.ylabel(ylabel) + plt.tight_layout() + fig_path = os.path.join(output_loc, compound + '_boxplot.pdf') + write_utils.check_existing_file(fig_path, overwrite) + f.savefig(fig_path) + 
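    # closing each figure right after saving keeps matplotlib memory bounded;
    # make_boxplot may run across many compounds in parallel worker processes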
plt.close(f) + logger.info('Exported box plot of %s for %s at %s.', ylabel, compound, fig_path) def frag_refs_to_json(json_dir = '/project/projectdirs/metatlas/projects/sharepoint/', name = 'frag_refs', save = True): @@ -2187,7 +2188,7 @@ def get_msms_hits_with_warnings(metatlas_dataset, extra_time=False, keep_nonmatc return msms_hits -def make_chromatograms(input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], include_groups=[], exclude_groups=[], group='index', share_y=True, save=True, output_loc=[], short_names_df=pd.DataFrame(), short_names_header=None, polarity='', overwrite=False): +def make_chromatograms(input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], include_groups=[], exclude_groups=[], group='index', share_y=True, save=True, output_loc=[], short_names_df=pd.DataFrame(), short_names_header=None, polarity='', overwrite=False, max_cpus=1): input_dataset = filter_runs(input_dataset, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) file_names = ma_data.get_file_names(input_dataset) @@ -2223,9 +2224,7 @@ def make_chromatograms(input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[ 'shortname': short_names_df, 'overwrite': overwrite} args_list.append(kwargs) - max_processes = 4 - with mp.Pool(processes=min(max_processes, len(input_dataset[0]))) as pool: - pool.map(cpp.chromplotplus, args_list) + parallel.parallel_process(cpp.chromplotplus, args_list, max_cpus, unit='plot', spread_args=False) def make_identification_figure_v2(input_fname='', input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], @@ -2664,7 +2663,6 @@ def strong_signal_compound_idxs(data, num_points_passing, peak_height_passing): def filter_metatlas_objects_to_most_recent(object_list,field): - #from datetime import datetime, date #remove from list if another copy exists that is newer unique_values = [] for i,a in enumerate(object_list): @@ -2681,17 +2679,15 @@ def filter_metatlas_objects_to_most_recent(object_list,field): old_last_modified = last_modified keep_object_list.append(keep_object) return keep_object_list -# print i, a.name, datetime.utcfromtimestamp(a.last_modified) + def get_metatlas_atlas(name = '%%',username = '*', most_recent = True,do_print = True): - from datetime import datetime, date atlas = metob.retrieve('Atlas',name = name,username=username) if most_recent: atlas = filter_metatlas_objects_to_most_recent(atlas,'name') if do_print: for i,a in enumerate(atlas): print((i, len(a.compound_identifications),a.name, datetime.utcfromtimestamp(a.last_modified))) - return atlas class interact_get_metatlas_files(): @@ -3157,27 +3153,24 @@ def filter_compounds_in_dataset_by_include_list(metatlas_dataset,include_list): filtered_dataset.append(filtered_row) return filtered_dataset -def select_groups_for_analysis(name = '%', description = [], username = '*', do_print = True, most_recent = True, remove_empty = True, include_list = [], exclude_list = []): + +def select_groups_for_analysis(name='%', description=[], username='*', do_print=True, most_recent=True, + remove_empty=True, include_list=[], exclude_list=[]): if description: - groups = metob.retrieve('Groups', name = name, description = description, username=username) + groups = metob.retrieve('Groups', name=name, description=description, username=username) else: - groups = metob.retrieve('Groups', name = name, username=username) + groups = metob.retrieve('Groups', name=name, username=username) if most_recent: - groups = filter_metatlas_objects_to_most_recent(groups,'name') - + groups = 
filter_metatlas_objects_to_most_recent(groups, 'name')
     if include_list:
         groups = filter_metatlas_objects_by_list(groups, 'name', include_list)
     if exclude_list:
         groups = remove_metatlas_objects_by_list(groups, 'name', exclude_list)
     if remove_empty:
         groups = filter_empty_metatlas_objects(groups, 'items')
     if do_print:
         for i, group in enumerate(groups):
             print((i, group.name, datetime.utcfromtimestamp(group.last_modified)))
     return groups

diff --git a/metatlas/tools/parallel.py b/metatlas/tools/parallel.py
new file mode 100644
index 00000000..d9d09c23
--- /dev/null
+++ b/metatlas/tools/parallel.py
@@ -0,0 +1,29 @@
+"""Functions for multi-CPU execution"""
+
+import itertools
+import logging
+import multiprocessing
+from tqdm.notebook import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+def parallel_process(function, data, max_cpus, unit=None, spread_args=True):
+    """
+    performs map(function, data) using the multiprocessing module, but
+    adds a progress bar and bypasses multiprocessing in the 1 cpu case, as this makes debugging easier
+    inputs:
+        function: the function to apply
+        data: iterator containing the inputs to function as a list of tuples, one tuple per call
+        max_cpus: number of cpus to use
+        unit: string label for what is processed in one iteration, default 'it'
+        spread_args: if True, function takes more than one argument and the tuples in data need to be spread
+    """
+    if max_cpus > 1 and len(data) > 1:
+        logger.debug("Starting parallel processing of %s with %d cpus.", function.__name__, max_cpus)
+        with multiprocessing.Pool(processes=min(max_cpus, len(data))) as pool:
+            map_fun = pool.starmap if spread_args else pool.imap
+            return list(tqdm(map_fun(function, data), total=len(data), unit=unit))
+    logger.debug("Processing of %s with 1 cpu.", function.__name__)
+    map_fun = itertools.starmap if spread_args else map
+    return list(map_fun(function, data))
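A usage sketch for the new parallel_process() helper (toy functions and values, not from the codebase):

    from metatlas.tools import parallel

    def add(num_a, num_b):
        return num_a + num_b

    # spread_args=True (the default): each tuple is unpacked into the function's arguments
    sums = parallel.parallel_process(add, [(1, 2), (3, 4)], max_cpus=2, unit="pair")  # [3, 7]

    # spread_args=False: each item is passed to the function as a single argument
    sizes = parallel.parallel_process(len, [[1, 2], [3]], max_cpus=1, unit="list", spread_args=False)  # [2, 1]

With max_cpus > 1, the function and its inputs must be picklable (e.g. defined at module top level), since calls are dispatched to multiprocessing pool workers.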
diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index 9929c35c..2a66c30c 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -24,6 +24,7 @@
 from metatlas.io import write_utils
 from metatlas.plots import dill2plots as dp
 from metatlas.tools import notebook
+from metatlas.tools import parallel

 logger = logging.getLogger(__name__)

@@ -195,7 +196,9 @@ def load_runs(files_df, qc_atlas_df, qc_atlas, cpus):
     """
     files = [(i[1].file, i[1].group, qc_atlas_df, qc_atlas) for i in files_df.iterrows()]
     logger.info("Loading LCMS data files")
-    return mads.parallel_process(ma_data.get_data_for_atlas_df_and_file, files, cpus, unit="sample")
+    return parallel.parallel_process(
+        ma_data.get_data_for_atlas_df_and_file, files, cpus, unit="sample", spread_args=False
+    )


 def save_measured_rts(metatlas_dataset, file_name):
diff --git a/noxfile.py b/noxfile.py
index 6a934218..9eef89ce 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -27,6 +27,7 @@
     "metatlas/tools/logging.py",
     "metatlas/tools/notebook.py",
     "metatlas/tools/predict_rt.py",
+    "metatlas/tools/parallel.py",
     "tests",
 ]
diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py
index 7a044ad6..7c79f0ed 100644
--- a/tests/unit/test_metatlas_dataset.py
+++ b/tests/unit/test_metatlas_dataset.py
@@ -569,7 +569,7 @@ def test_lcmsruns_dataframe(metatlas_dataset):

 def test_store_groups01(metatlas_dataset, mocker):
     mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[])
     mocker.patch("metatlas.datastructures.metatlas_objects.store")
-    metatlas_dataset.store_groups()
+    metatlas_dataset.store_all_groups()
     assert metob.store.called  # pylint: disable=no-member

@@ -582,7 +582,7 @@ def group():
     )
     mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[group])
     with pytest.raises(ValueError):
-        metatlas_dataset.store_groups()
+        metatlas_dataset.store_all_groups()


 def test_annotation_gui01(metatlas_dataset, hits, mocker):

From e1af8115b1f341020e69deed0675cb4725537cc6 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Fri, 30 Jul 2021 11:47:17 -0700
Subject: [PATCH 061/177] WIP - moving groups into AnalysisIds

---
 .gitignore                                  |   3 +
 metatlas/datastructures/metatlas_dataset.py | 358 +++++++++++---------
 metatlas/io/targeted_output.py              |   6 +-
 metatlas/tools/predict_rt.py                |   7 +-
 notebooks/reference/RT_Prediction.ipynb     |   4 +-
 notebooks/reference/Targeted.ipynb          |   6 +-
 tests/unit/conftest.py                      |  19 +-
 tests/unit/test_metatlas_dataset.py         |  10 +-
 8 files changed, 218 insertions(+), 195 deletions(-)

diff --git a/.gitignore b/.gitignore
index e1365ba5..18e38d3a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ scratch/
 # editor swap files
 .*.swp
 .vscode/
+
+# pyenv
+.python-version
diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 5aa5e671..fac1f835 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -41,6 +41,10 @@ def __init__(
         analysis_number,
         project_directory,
         username=None,
+        groups_controlled_vocab=None,
+        include_groups=None,
+        exclude_groups=None,
+        exclude_files=None,
     ):
         self._source_atlas = source_atlas
         self._experiment = experiment
@@ -49,6 +53,23 @@
         self._analysis_number = analysis_number
         self._username = getpass.getuser() if username is None else username
         self.project_directory = project_directory
+        self._runs = None
+        self._runs_valid = False
+        self._all_groups = None
+        self._all_groups_valid = False
+        self._groups_controlled_vocab = [] if groups_controlled_vocab is None else groups_controlled_vocab
+        if include_groups is None and output_type == "data_QC":
+            self._include_groups = ["QC"]
+        else:
+            self._include_groups = [] if include_groups is None else include_groups
+        self._exclude_groups = [] if exclude_groups is None else exclude_groups
+        if exclude_groups is None:
+            self._exclude_groups = ["InjBl", "InjBL"]
+            if polarity == "positive":
+                self._exclude_groups.append("NEG")
+            elif polarity == "negative":
+                self._exclude_groups.append("POS")
+        self._exclude_files = [] if exclude_files is None else exclude_files
         self.validate()
         logger.info(
             "IDs: source_atlas=%s, atlas=%s, short_experiment_analysis=%s, output_dir=%s",
             self.source_atlas,
@@ -57,6 +77,7 @@ def __init__(
             self.short_experiment_analysis,
             self.output_dir,
         )
+        self.store_all_groups(exist_ok=True)

     def validate(self):
         """Valid class inputs"""
@@ -69,8 +90,6 @@ def validate(self):
             raise ValueError(f"Parameter output_type is not one of: {quoted_string_list(OUTPUT_TYPES)}.")
         if self.polarity not in POLARITIES:
             raise ValueError(f"Parameter polarity is not one of: {quoted_string_list(POLARITIES)}.")
-        if not isinstance(self.analysis_number, numbers.Integral):
-            raise TypeError("Parameter analysis_number is not an integer.")
        if self.analysis_number < 0:
            raise ValueError("Parameter analysis_number cannot be negative.")
        logger.debug("Inputs to AnalysisIdentifiers passed validation.")
@@ -147,6 +166,173 @@ def output_dir(self):
         os.makedirs(out, exist_ok=True)
         return out

+    @property
+    def exclude_files(self):
+        return self._exclude_files
+
+    @property
+    def lcmsruns(self):
+        """Get LCMS runs from DB matching experiment"""
+        if self._runs_valid:
+            return self._runs
+        self._runs = dp.get_metatlas_files(experiment=self.experiment, name="%")
+        self._runs_valid = True
+        for run in self._runs:
+            logger.info("Run: %s", run.name)
+        logger.info("Number of LCMS output files matching '%s' is: %d.", self.experiment, len(self._runs))
+        return self._runs
+
+    @property
+    def lcmsruns_dataframe(self):
+        """Returns a pandas DataFrame with lcmsrun matching self.experiment"""
+        return metob.to_dataframe(self.lcmsruns)
+
+    def get_lcmsruns_short_names(self, fields=None):
+        """
+        Queries DB for lcms filenames from self.experiment and returns
+        a pandas DataFrame containing identifiers for each file
+        inputs:
+            fields: optional dict with column names as key
+                    and list of lcms filename metadata fields positions as value
+        """
+        if fields is None:
+            fields = {
+                "full_filename": range(16),
+                "sample_treatment": [12],
+                "short_filename": [0, 2, 4, 5, 7, 9, 14],
+                "short_samplename": [9, 12, 13, 14],
+            }
+        out = pd.DataFrame(columns=fields.keys())
+        for i, lcms_file in enumerate(self.lcmsruns):
+            tokens = lcms_file.name.split(".")[0].split("_")
+            for name, idxs in fields.items():
+                out.loc[i, name] = "_".join([tokens[n] for n in idxs])
+            out.loc[i, "last_modified"] = pd.to_datetime(lcms_file.last_modified, unit="s")
+        if out.empty:
+            return out
+        out.sort_values(by="last_modified", inplace=True)
+        out.drop(columns=["last_modified"], inplace=True)
+        out.drop_duplicates(subset=["full_filename"], keep="last", inplace=True)
+        out.set_index("full_filename", inplace=True)
+        return out.sort_values(by="full_filename")
+
+    lcmsruns_short_names = property(get_lcmsruns_short_names)
+
+    def write_lcmsruns_short_names(self):
+        """Write short names and raise error if exists and differs from current data"""
+        short_names = self.lcmsruns_short_names
+        short_names["full_filename"] = short_names.index
+        write_utils.export_dataframe_die_on_diff(
+            short_names,
+            os.path.join(self.output_dir, "short_names.csv"),
+            "LCMS runs short names",
+            index=False,
+        )
+
+    @property
+    def _files_dict(self):
+        """
+        Queries DB for all lcmsruns matching the class properties.
+ Returns a dict of dicts where keys are filenames minus extensions and values are + dicts with keys: object, group, and short_name + """ + file_dict = {} + for lcms_file in self.lcmsruns: + if not any(map(lcms_file.name.__contains__, self._exclude_files)): + base_name = lcms_file.name.split(".")[0] + file_dict[base_name] = {"object": lcms_file, **self.group_name(base_name)} + return file_dict + + @property + def groups(self): + """This needs to be updated to only return the currently selected groups""" + return self.all_groups + + @property + def existing_groups(self): + """Get your own groups that are prefixed by self.experiment""" + return metob.retrieve("Groups", name=f"{self.experiment}%{self.analysis}_%", username=self.username) + + def group_name(self, base_filename): + """Returns dict with keys group and short_name corresponding to base_filename""" + indices = [ + i for i, s in enumerate(self._groups_controlled_vocab) if s.lower() in base_filename.lower() + ] + tokens = base_filename.split("_") + prefix = "_".join(tokens[:11]) + suffix = self._groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12] + group_name = f"{prefix}_{self.analysis}_{suffix}" + short_name = f"{tokens[9]}_{suffix}" # Prepending POL to short_name + return {"group": group_name, "short_name": short_name} + + @property + def groups_controlled_vocab(self): + return self._groups_controlled_vocab + + @property + def include_groups(self): + return self._include_groups + + @property + def exclude_groups(self): + return self._exclude_groups + + @property + def all_groups_dataframe(self): + """Returns pandas Dataframe with one row per file""" + out = pd.DataFrame(self._files_dict).T + if out.empty: + return out + out.drop(columns=["object"], inplace=True) + out.index.name = "filename" + return out.reset_index() + + @property + def all_groups(self): + """Returns a list of Group objects""" + if self._all_groups_valid: + return self._all_groups + file_dict = self._files_dict + self._all_groups = [] + unique_groups = self.all_groups_dataframe[["group", "short_name"]].drop_duplicates() + for values in unique_groups.to_dict("index").values(): + self._all_groups.append( + metob.Group( + name=values["group"], + short_name=values["short_name"], + items=[ + file_value["object"] + for file_value in file_dict.values() + if file_value["group"] == values["group"] + ], + ) + ) + self._all_groups_valid = True + return self._all_groups + + def store_all_groups(self, exist_ok=False): + """ + Save self.object_list to DB + inputs: + exist_ok: if False, store nothing and raise ValueError if any of the group names + have already been saved to the DB by you. + """ + if not exist_ok: + db_names = {group.name for group in self.existing_groups} + new_names = set(self.all_groups_dataframe["group"].to_list()) + overlap = db_names.intersection(new_names) + try: + if overlap: + raise ValueError( + "Not saving groups as you have already saved groups with these names: %s." 
+ % ", ".join(overlap), + ) + except ValueError as err: + logger.exception(err) + raise err + logger.debug("Storing %d groups in the database", len(self.all_groups)) + metob.store(self.all_groups) + class MetatlasDataset: """ @@ -173,8 +359,6 @@ class MetatlasDataset: def __init__( self, ids, - groups_controlled_vocab=None, - exclude_files=None, extra_time=0.75, extra_mz=0, keep_nonmatches=True, @@ -197,17 +381,11 @@ def __init__( self._atlas_valid = False self._atlas_df = None self._atlas_df_valid = False - self._runs = None - self._runs_valid = False self._data = None self._data_valid = False self._hits = None self._hits_valid = False # based on all hits dependencies except RT min/max values self._hits_valid_for_rt_bounds = False # based only on RT min/max changes - self._all_groups = None - self._all_groups_valid = False - self._groups_controlled_vocab = [] if groups_controlled_vocab is None else groups_controlled_vocab - self._exclude_files = [] if exclude_files is None else exclude_files self._extra_time = extra_time self._extra_mz = extra_mz self._keep_nonmatches = keep_nonmatches @@ -219,8 +397,7 @@ def __init__( if save_metadata: logger.debug("Writing MetatlasDataset metadata files") self.write_data_source_files() - self.write_lcmsruns_short_names() - self.store_all_groups(exist_ok=True) + self.ids.write_lcmsruns_short_names() def write_data_source_files(self): """Write the data source files if they don't already exist""" @@ -236,20 +413,9 @@ def write_data_source_files(self): shutil.rmtree(data_sources_dir, ignore_errors=True) logger.info("Writing data source files to %s.", data_sources_dir) ma_data.make_data_sources_tables( - self.groups, self.atlas, self.ids.output_dir, self.ids.short_polarity + self.ids.groups, self.atlas, self.ids.output_dir, self.ids.short_polarity ) - def write_lcmsruns_short_names(self): - """Write short names and raise error if exists and differs from current data""" - short_names = self.lcmsruns_short_names - short_names["full_filename"] = short_names.index - write_utils.export_dataframe_die_on_diff( - short_names, - os.path.join(self.ids.output_dir, "short_names.csv"), - "LCMS runs short names", - index=False, - ) - def _get_atlas(self): """Copy source atlas from database into current analysis atlas""" atlases = metob.retrieve("Atlas", name=self.ids.atlas, username=self.ids.username) @@ -288,7 +454,7 @@ def _build(self): """Populate self._data from database and h5 files.""" start_time = datetime.datetime.now() files = [] - for group in self.groups: + for group in self.ids.groups: for h5_file in group.items: files.append( ( @@ -658,148 +824,6 @@ def compound_indices_marked_remove(self): ids = ["identification", "ms1_notes"] return [i for i, j in enumerate(self.data[0]) if _is_remove(ma_data.extract(j, ids))] - @property - def lcmsruns(self): - """Get LCMS runs from DB matching experiment""" - if self._runs_valid: - return self._runs - self._runs = dp.get_metatlas_files(experiment=self.ids.experiment, name="%") - self._runs_valid = True - for run in self._runs: - logger.info("Run: %s", run.name) - logger.info("Number of LCMS output files matching '%s' is: %d.", self.ids.experiment, len(self._runs)) - return self._runs - - @property - def existing_groups(self): - """Get your own groups that are prefixed by self.experiment""" - return metob.retrieve( - "Groups", name=f"{self.ids.experiment}%{self.ids.analysis}_%", username=self.ids.username - ) - - @property - def lcmsruns_dataframe(self): - """Returns a pandas DataFrame with lcmsrun matching 
self.experiment""" - return metob.to_dataframe(self.lcmsruns) - - def get_lcmsruns_short_names(self, fields=None): - """ - Querys DB for lcms filenames from self.experiment and returns - a pandas DataFrame containing identifiers for each file - inputs: - fields: optional dict with column names as key - and list of lcms filename metadata fields positions as value - """ - if fields is None: - fields = { - "full_filename": range(16), - "sample_treatment": [12], - "short_filename": [0, 2, 4, 5, 7, 9, 14], - "short_samplename": [9, 12, 13, 14], - } - out = pd.DataFrame(columns=fields.keys()) - for i, lcms_file in enumerate(self.lcmsruns): - tokens = lcms_file.name.split(".")[0].split("_") - for name, idxs in fields.items(): - out.loc[i, name] = "_".join([tokens[n] for n in idxs]) - out.loc[i, "last_modified"] = pd.to_datetime(lcms_file.last_modified, unit="s") - if out.empty: - return out - out.sort_values(by="last_modified", inplace=True) - out.drop(columns=["last_modified"], inplace=True) - out.drop_duplicates(subset=["full_filename"], keep="last", inplace=True) - out.set_index("full_filename", inplace=True) - return out.sort_values(by="full_filename") - - lcmsruns_short_names = property(get_lcmsruns_short_names) - - def group_name(self, base_filename): - """Returns dict with keys group and short_name corresponding to base_filename""" - indices = [ - i for i, s in enumerate(self._groups_controlled_vocab) if s.lower() in base_filename.lower() - ] - tokens = base_filename.split("_") - prefix = "_".join(tokens[:11]) - suffix = self._groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12] - group_name = f"{prefix}_{self.ids.analysis}_{suffix}" - short_name = f"{tokens[9]}_{suffix}" # Prepending POL to short_name - return {"group": group_name, "short_name": short_name} - - @property - def _files_dict(self): - """ - Queries DB for all lcmsruns matching the class properties. 
- Returns a dict of dicts where keys are filenames minus extensions and values are - dicts with keys: object, group, and short_name - """ - file_dict = {} - for lcms_file in self.lcmsruns: - if not any(map(lcms_file.name.__contains__, self._exclude_files)): - base_name = lcms_file.name.split(".")[0] - file_dict[base_name] = {"object": lcms_file, **self.group_name(base_name)} - return file_dict - - @property - def groups(self): - """This needs to be updated to only return the currently selected groups""" - return self.all_groups - - @property - def all_groups_dataframe(self): - """Returns pandas Dataframe with one row per file""" - out = pd.DataFrame(self._files_dict).T - if out.empty: - return out - out.drop(columns=["object"], inplace=True) - out.index.name = "filename" - return out.reset_index() - - @property - def all_groups(self): - """Returns a list of Group objects""" - if self._all_groups_valid: - return self._all_groups - file_dict = self._files_dict - self._all_groups = [] - unique_groups = self.all_groups_dataframe[["group", "short_name"]].drop_duplicates() - for values in unique_groups.to_dict("index").values(): - self._all_groups.append( - metob.Group( - name=values["group"], - short_name=values["short_name"], - items=[ - file_value["object"] - for file_value in file_dict.values() - if file_value["group"] == values["group"] - ], - ) - ) - self._all_groups_valid = True - return self._all_groups - - def store_all_groups(self, exist_ok=False): - """ - Save self.object_list to DB - inputs: - exist_ok: if False, store nothing and raise ValueError if any of the group names - have already been saved to the DB by you. - """ - if not exist_ok: - db_names = {group.name for group in self.existing_groups} - new_names = set(self.all_groups_dataframe["group"].to_list()) - overlap = db_names.intersection(new_names) - try: - if overlap: - raise ValueError( - "Not saving groups as you have already saved groups with these names: %s." 
- % ", ".join(overlap), - ) - except ValueError as err: - logger.exception(err) - raise err - logger.debug("Storing %d groups in the database", len(self.all_groups)) - metob.store(self.all_groups) - def compound_idxs_not_evaluated(self): """NOT YET IMPLEMENTED""" for compound_idx, _ in enumerate(self.data[0]): diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index dd7f2e55..99fea613 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -108,7 +108,7 @@ def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwr share_y=share_y, save=True, output_loc=metatlas_dataset.ids.output_dir, - short_names_df=metatlas_dataset.lcmsruns_short_names, + short_names_df=metatlas_dataset.ids.lcmsruns_short_names, short_names_header="short_samplename", polarity=metatlas_dataset.ids.short_polarity, overwrite=overwrite, @@ -126,7 +126,7 @@ def write_identification_figure(metatlas_dataset, overwrite=False): include_lcmsruns=[], exclude_lcmsruns=["InjBl", "QC", "Blank", "blank"], output_loc=metatlas_dataset.ids.output_dir, - short_names_df=metatlas_dataset.lcmsruns_short_names, + short_names_df=metatlas_dataset.ids.lcmsruns_short_names, polarity=metatlas_dataset.ids.short_polarity, overwrite=overwrite, ) @@ -152,7 +152,7 @@ def write_metrics_and_boxplots(metatlas_dataset, overwrite=False, max_cpus=1): fieldname=fields["name"], input_dataset=metatlas_dataset, output_loc=df_dir, - short_names_df=metatlas_dataset.lcmsruns_short_names, + short_names_df=metatlas_dataset.ids.lcmsruns_short_names, polarity=metatlas_dataset.ids.short_polarity, use_labels=True, overwrite=overwrite, diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index 2a66c30c..9272bcc8 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -90,9 +90,6 @@ def predict(self, x_values): def generate_rt_correction_models( ids, - groups_controlled_vocab, - exclude_files, - include_groups, cpus, repo_dir, save_to_db=True, @@ -115,8 +112,8 @@ def generate_rt_correction_models( are pre-populated into the generated notebooks """ # pylint: disable=too-many-locals - metatlas_dataset = mads.MetatlasDataset(ids, groups_controlled_vocab, exclude_files, save_metadata=False) - groups = get_groups(metatlas_dataset, include_groups) + metatlas_dataset = mads.MetatlasDataset(ids, save_metadata=False) + groups = get_groups(metatlas_dataset, ids.include_groups) files_df = get_files_df(groups) qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids) metatlas_dataset = load_runs(files_df, qc_atlas_df, qc_atlas, cpus) diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb index 4e9d4191..6560378a 100644 --- a/notebooks/reference/RT_Prediction.ipynb +++ b/notebooks/reference/RT_Prediction.ipynb @@ -122,9 +122,7 @@ "metadata": {}, "outputs": [], "source": [ - "predict_rt.generate_rt_correction_models(\n", - " ids, groups_controlled_vocab, exclude_files, include_groups, max_cpus, metatlas_repo_path\n", - ")" + "predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path)", ] } ], diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index 093e55ef..3baf8ea8 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -157,7 +157,7 @@ "outputs": [], "source": [ "ids = mads.AnalysisIdentifiers(\n", - " source_atlas, experiment, output_type, polarity, analysis_number, project_directory\n", + " source_atlas, experiment, output_type, polarity, 
analysis_number, project_directory, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files\n", ")" ] }, @@ -167,9 +167,7 @@ "metadata": {}, "outputs": [], "source": [ - "metatlas_dataset = mads.MetatlasDataset(\n", - " ids, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files, max_cpus=max_cpus\n", - ")" + "metatlas_dataset = mads.MetatlasDataset(ids, max_cpus=max_cpus)\n" ] }, { diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index c979e30d..938e2794 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -28,7 +28,8 @@ def fixture_username(): @pytest.fixture(name="analysis_ids") -def fixture_analysis_ids(sqlite_with_atlas, username): +def fixture_analysis_ids(sqlite_with_atlas, username, lcmsrun, mocker, groups_controlled_vocab): + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) return mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -36,11 +37,15 @@ def fixture_analysis_ids(sqlite_with_atlas, username): "positive", 0, str(os.getcwd()), + groups_controlled_vocab=groups_controlled_vocab, ) @pytest.fixture(name="analysis_ids_with_2_cids") -def fixture_analysis_ids_with_2_cids(sqlite_with_atlas_with_2_cids, username): +def fixture_analysis_ids_with_2_cids( + sqlite_with_atlas_with_2_cids, username, lcmsrun, mocker, groups_controlled_vocab +): + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) return mads.AnalysisIdentifiers( f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}1", "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", @@ -48,6 +53,7 @@ def fixture_analysis_ids_with_2_cids(sqlite_with_atlas_with_2_cids, username): "positive", 0, str(os.getcwd()), + groups_controlled_vocab=groups_controlled_vocab, ) @@ -534,14 +540,12 @@ def fixture_groups_controlled_vocab(): @pytest.fixture(name="metatlas_dataset") -def fixture_metatlas_dataset( - mocker, df_container, analysis_ids, groups_controlled_vocab, lcmsrun, sqlite_with_atlas -): +def fixture_metatlas_dataset(mocker, df_container, analysis_ids, lcmsrun, sqlite_with_atlas): mocker.patch( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - return mads.MetatlasDataset(analysis_ids, groups_controlled_vocab, save_metadata=False) + return mads.MetatlasDataset(analysis_ids, save_metadata=False) @pytest.fixture(name="metatlas_dataset_with_2_cids") @@ -549,7 +553,6 @@ def fixture_metatlas_dataset_with_2_cids( mocker, df_container, analysis_ids_with_2_cids, - groups_controlled_vocab, lcmsrun, sqlite_with_atlas_with_2_cids, ): @@ -557,7 +560,7 @@ def fixture_metatlas_dataset_with_2_cids( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - return mads.MetatlasDataset(analysis_ids_with_2_cids, groups_controlled_vocab, save_metadata=False) + return mads.MetatlasDataset(analysis_ids_with_2_cids, save_metadata=False) @pytest.fixture(name="eic") diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 7c79f0ed..7438158f 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -305,7 +305,7 @@ def 
test_atlas_setter02(metatlas_dataset): def test_groups01(metatlas_dataset): - assert metatlas_dataset.groups[0].short_name == "POS_Cone-S1" + assert metatlas_dataset.ids.groups[0].short_name == "POS_Cone-S1" def test_set_extra_mz_setter(metatlas_dataset, mocker, hits): @@ -559,17 +559,17 @@ def test_get_atlas04(metatlas_dataset, username): def test_existing_groups(mocker, metatlas_dataset): """This test has little value, but is needed for coverage""" mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[]) - assert metatlas_dataset.existing_groups == [] + assert metatlas_dataset.ids.existing_groups == [] def test_lcmsruns_dataframe(metatlas_dataset): - assert metatlas_dataset.lcmsruns_dataframe.shape == (1, 15) + assert metatlas_dataset.ids.lcmsruns_dataframe.shape == (1, 15) def test_store_groups01(metatlas_dataset, mocker): mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[]) mocker.patch("metatlas.datastructures.metatlas_objects.store") - metatlas_dataset.store_all_groups() + metatlas_dataset.ids.store_all_groups() assert metob.store.called # pylint: disable=no-member @@ -582,7 +582,7 @@ def group(): ) mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[group]) with pytest.raises(ValueError): - metatlas_dataset.store_all_groups() + metatlas_dataset.ids.store_all_groups() def test_annotation_gui01(metatlas_dataset, hits, mocker): From 704073739abf4dcccf954492b9511d16980733a7 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 30 Jul 2021 12:53:38 -0700 Subject: [PATCH 062/177] Log git commit when notebook starts --- metatlas/tools/environment.py | 12 ++++++++++++ metatlas/tools/notebook.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/metatlas/tools/environment.py b/metatlas/tools/environment.py index d6a9363d..4f11451f 100644 --- a/metatlas/tools/environment.py +++ b/metatlas/tools/environment.py @@ -12,6 +12,7 @@ import os import re import shutil +import subprocess import sys from pathlib import Path @@ -163,3 +164,14 @@ def validate_data_dir(base_data_dir, experiment_id): except FileNotFoundError as err: logger.exception(err) raise err + + +def get_repo_hash(): + """ + Returns the full hash for the current git commit or 'git not found, hash unknown' + """ + try: + result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=repo_dir(), capture_output=True, check=True) + except FileNotFoundError: + return "git not found, hash unknown" + return result.stdout.strip() diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index 2cf87fcb..03d45bff 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -9,6 +9,7 @@ from metatlas.tools.logging import activate_logging from metatlas.tools.logging import activate_module_logging from metatlas.tools.environment import validate_kernel +from metatlas.tools.environment import get_repo_hash logger = logging.getLogger(__name__) @@ -24,6 +25,7 @@ def configure_environment(log_level): logger.debug("Running import and environment setup block of notebook.") logger.debug("Configuring notebook environment with console log level of %s.", log_level) os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" + logger.info('Running on git commit: %s', get_repo_hash()) def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100): From 4629429687b26e9d921bfe028eb8d22e49a67946 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 2 Aug 2021 20:21:57 -0700 Subject: [PATCH 063/177] Minor clean up in dill2plots --- 
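A note on the get_repo_hash() helper added in PATCH 062 above: subprocess.run with capture_output=True returns stdout as bytes, so the commit hash is logged in b'...' form. A sketch of a text-mode variant (an assumed refinement, not part of this patch series):

    import subprocess

    def get_repo_hash_text(repo_dir):
        """Full hash for the current git commit as a str, or a fallback message."""
        try:
            # text=True decodes stdout, so .strip() yields a plain string
            result = subprocess.run(
                ["git", "rev-parse", "HEAD"],
                cwd=repo_dir, capture_output=True, check=True, text=True,
            )
        except (FileNotFoundError, subprocess.CalledProcessError):
            return "git not found, hash unknown"
        return result.stdout.strip()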
 metatlas/plots/dill2plots.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py
index 4d779592..0647f70d 100644
--- a/metatlas/plots/dill2plots.py
+++ b/metatlas/plots/dill2plots.py
@@ -2662,18 +2662,18 @@ def strong_signal_compound_idxs(data, num_points_passing, peak_height_passing):
     return np.flatnonzero(num_passing & peak_passing).tolist()


-def filter_metatlas_objects_to_most_recent(object_list,field):
-    #remove from list if another copy exists that is newer
+def filter_metatlas_objects_to_most_recent(object_list, field):
+    # remove from list if another copy exists that is newer
     unique_values = []
-    for i,a in enumerate(object_list):
-        unique_values.append( getattr(a,field) )
+    for a in object_list:
+        unique_values.append(getattr(a, field))
     unique_values = list(set(unique_values))
     keep_object_list = []
     for u in unique_values:
         old_last_modified = 0
-        for i,a in enumerate(object_list):
-            if getattr(a,field) == u:
-                last_modified = getattr(a,'last_modified')
+        for a in object_list:
+            if getattr(a, field) == u:
+                last_modified = getattr(a, 'last_modified')
                 if last_modified > old_last_modified:
                     keep_object = a
                     old_last_modified = last_modified
@@ -3074,7 +3074,7 @@ def filter_empty_metatlas_objects(object_list,field):
 def filter_metatlas_objects_by_list(object_list, field, filter_list):
     """
     inputs:
-        object_list: iterable to be filtered by its attribute values
+        object_list: list to be filtered by its attribute values
         field: name of attribute to filter on
         filter_list: strings that are tested to see if they are substrings of the attribute value
     returns filtered list of objects that have a match in filter_list

From 74be79430066ceb0510ef8327acf1db5af24d129 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 4 Aug 2021 15:20:47 -0700
Subject: [PATCH 064/177] Group handling/selection now in AnalysisIds

Moved to using traitlet observers to mark invalid data

---
 metatlas/datastructures/metatlas_dataset.py | 459 +++++++++-----------
 metatlas/tools/notebook.py                  |   2 +-
 notebooks/reference/RT_Prediction.ipynb     |   2 +-
 notebooks/reference/Targeted.ipynb          |  11 +-
 tests/unit/conftest.py                      |  28 +-
 tests/unit/test_metatlas_dataset.py         | 152 ++++---
 6 files changed, 307 insertions(+), 347 deletions(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index fac1f835..8b77821a 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -3,7 +3,6 @@
 import getpass
 import glob
 import logging
-import numbers
 import os
 import shutil
 import tarfile
@@ -11,6 +10,9 @@

 import humanize
 import pandas as pd
+from traitlets import HasTraits, TraitError, default, observe, validate
+from traitlets import Bool, Float, Instance, Int, List, Tuple, Unicode
+
 from metatlas.datastructures import metatlas_objects as metob
 from metatlas.datastructures import object_helpers as metoh
 from metatlas.io import metatlas_get_data_helper_fun as ma_data
 from metatlas.io import targeted_output
 from metatlas.io import write_utils
 from metatlas.plots import dill2plots as dp
 from metatlas.tools import parallel

 MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab"
+DEFAULT_GROUPS_CONTROLLED_VOCAB = ["QC", "InjBl", "InjBL", "ISTD"]
+OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"]
 POLARITIES = ["positive", "negative", "fast-polarity-switching"]
 SHORT_POLARITIES = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"}
-OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"]
-
 logger = 
logging.getLogger(__name__) -class AnalysisIdentifiers: +class AnalysisIdentifiers(HasTraits): """Names used in generating an analysis""" - # pylint: disable=too-many-arguments - def __init__( - self, - source_atlas, - experiment, - output_type, - polarity, - analysis_number, - project_directory, - username=None, - groups_controlled_vocab=None, - include_groups=None, - exclude_groups=None, - exclude_files=None, - ): - self._source_atlas = source_atlas - self._experiment = experiment - self._output_type = output_type - self._polarity = polarity - self._analysis_number = analysis_number - self._username = getpass.getuser() if username is None else username - self.project_directory = project_directory - self._runs = None - self._runs_valid = False - self._all_groups = None - self._all_groups_valid = False - self._groups_controlled_vocab = [] if groups_controlled_vocab is None else groups_controlled_vocab - if include_groups is None and output_type == "data_QC": - self._include_groups = ["QC"] - self._include_groups = [] if include_groups is None else include_groups - self._exclude_groups = [] if exclude_groups is None else exclude_groups - if exclude_groups is None: - self._exclude_groups = ["InjBl", "InjBL"] - if polarity == "positive": - self._exclude_groups.append("NEG") - elif polarity == "negative": - self._exclude_groups.append("POS") - self._exclude_files = [] if exclude_files is None else exclude_files - self.validate() + source_atlas = Unicode(allow_none=True) + experiment = Unicode() + output_type = Unicode() + polarity = Unicode(default_value="positive") + analysis_number = Int(default_value=0) + username = Unicode(default_value=getpass.getuser()) + project_directory = Unicode() + exclude_files = List(trait=Unicode(), allow_none=True, default_value=[]) + groups_controlled_vocab = List( + trait=Unicode(), allow_none=True, default_value=DEFAULT_GROUPS_CONTROLLED_VOCAB + ) + include_groups = List(allow_none=True, default_value=None) + exclude_groups = List(allow_none=True, default_value=["InjBl", "InjBL"]) + _lcmsruns = List(allow_none=True, default_value=None) + _all_groups = List(allow_none=True, default_value=None) + _groups = List(allow_none=True, default_value=None) + + # pylint: disable=no-self-use + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.polarity == "positive": + self.exclude_groups.append("NEG") + elif self.polarity == "negative": + self.exclude_groups.append("POS") logger.info( "IDs: source_atlas=%s, atlas=%s, short_experiment_analysis=%s, output_dir=%s", self.source_atlas, @@ -79,45 +66,44 @@ def __init__( ) self.store_all_groups(exist_ok=True) - def validate(self): - """Valid class inputs""" - logger.debug("Validating inputs to AnalysisIdentifiers") - if self._source_atlas is not None: - get_atlas(self.source_atlas, self.username) # will raise error if not found or matches multiple - if len(self.experiment.split("_")) != 9: - raise ValueError('Parameter experiment does contain 9 fields when split on "_".') - if self.output_type not in OUTPUT_TYPES: - raise ValueError(f"Parameter output_type is not one of: {quoted_string_list(OUTPUT_TYPES)}.") - if self.polarity not in POLARITIES: - raise ValueError(f"Parameter polarity is not one of: {quoted_string_list(POLARITIES)}.") - if self.analysis_number < 0: - raise ValueError("Parameter analysis_number cannot be negative.") - logger.debug("Inputs to AnalysisIdentifiers passed validation.") - - @property - def source_atlas(self): - """Returns source atlas identifier""" - return self._source_atlas - - 
@property - def experiment(self): - """Returns experiment identifier""" - return self._experiment + @default("include_groups") + def _default_include_groups(self): + if self.output_type == "data_QC": + return ["QC"] + return [] - @property - def output_type(self): - """Returns output type identifier""" - return self._output_type + @validate("polarity") + def _valid_polarity(self, proposal): + if proposal["value"] not in POLARITIES: + raise TraitError(f"Parameter polarity must be one of {', '.join(POLARITIES)}") + return proposal["value"] + + @validate("output_type") + def _valid_output_type(self, proposal): + if proposal["value"] not in OUTPUT_TYPES: + raise TraitError(f"Parameter output_type must be one of {', '.join(OUTPUT_TYPES)}") + return proposal["value"] + + @validate("source_atlas") + def _valid_source_atlas(self, proposal): + if proposal["value"] is not None: + try: + get_atlas(proposal["value"], self.username) # raises error if not found or matches multiple + except ValueError as err: + raise TraitError(str(err)) from err + return proposal["value"] - @property - def polarity(self): - """Returns polarity identifier""" - return self._polarity + @validate("analysis_number") + def _valid_analysis_number(self, proposal): + if proposal["value"] < 0: + raise TraitError("Parameter analysis_number cannot be negative.") + return proposal["value"] - @property - def analysis_number(self): - """Returns analysis number""" - return self._analysis_number + @validate("experiment") + def _valid_experiment(self, proposal): + if len(proposal["value"].split("_")) != 9: + raise TraitError('Parameter experiment does contain 9 fields when split on "_".') + return proposal["value"] @property def _exp_tokens(self): @@ -134,11 +120,6 @@ def atlas(self): """Atlas identifier (name)""" return f"{'_'.join(self._exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}" - @property - def username(self): - """Returns username identifier""" - return self._username - @property def analysis(self): """Analysis identifier""" @@ -166,21 +147,23 @@ def output_dir(self): os.makedirs(out, exist_ok=True) return out - @property - def exclude_files(self): - return self._exclude_files - @property def lcmsruns(self): """Get LCMS runs from DB matching experiment""" - if self._runs_valid: - return self._runs - self._runs = dp.get_metatlas_files(experiment=self.experiment, name="%") - self._runs_valid = True - for run in self._runs: + if self._lcmsruns is not None: + return self._lcmsruns + all_lcmsruns = dp.get_metatlas_files(experiment=self.experiment, name="%") + self._lcmsruns = [r for r in all_lcmsruns if not any(map(r.name.__contains__, self.exclude_files))] + if len(self.exclude_files) > 0: + logger.info( + "Excluding %d LCMS runs containing any of: %s", + len(all_lcmsruns) - len(self._lcmsruns), + self.exclude_files, + ) + for run in self._lcmsruns: logger.info("Run: %s", run.name) - logger.info("Number of LCMS output files matching '%s' is: %d.", self.experiment, len(self._runs)) - return self._runs + logger.info("Number of LCMS output files matching '%s' is: %d.", self.experiment, len(self._lcmsruns)) + return self._lcmsruns @property def lcmsruns_dataframe(self): @@ -238,15 +221,52 @@ def _files_dict(self): """ file_dict = {} for lcms_file in self.lcmsruns: - if not any(map(lcms_file.name.__contains__, self._exclude_files)): - base_name = lcms_file.name.split(".")[0] - file_dict[base_name] = {"object": lcms_file, **self.group_name(base_name)} + base_name = lcms_file.name.split(".")[0] + 
file_dict[base_name] = {"object": lcms_file, **self.group_name(base_name)} return file_dict @property def groups(self): - """This needs to be updated to only return the currently selected groups""" - return self.all_groups + """Return the currently selected groups""" + if self._groups is not None: + return self._groups + out = dp.filter_metatlas_objects_to_most_recent(self.all_groups, "name") + if len(self.include_groups) > 0: + out = dp.filter_metatlas_objects_by_list(out, "name", self.include_groups) + if len(self.exclude_groups) > 0: + out = dp.remove_metatlas_objects_by_list(out, "name", self.exclude_groups) + self._groups = dp.filter_empty_metatlas_objects(out, "items") + return self._groups + + @observe("all_groups") + def _observe_all_groups(self, signal): + if signal.type == "change": + self._groups = None + logger.debug("Change to all_groups invalidates groups") + + @observe("groups_controlled_vocab") + def _observe_groups_controlled_vocab(self, signal): + if signal.type == "change": + self._lcmsruns = None + logger.debug("Change to groups_controlled_vocab invalidates lcmsruns") + + @observe("include_groups") + def _observe_include_groups(self, signal): + if signal.type == "change": + self._groups = None + logger.debug("Change to include_groups invalidates groups") + + @observe("exclude_groups") + def _observe_exclude_groups(self, signal): + if signal.type == "change": + self._groups = None + logger.debug("Change to exclude_groups invalidates groups") + + @observe("exclude_files") + def _observe_exclude_files(self, signal): + if signal.type == "change": + self._lcmsruns = None + logger.debug("Change to exclude_files invalidates lcmsruns") @property def existing_groups(self): @@ -256,27 +276,15 @@ def existing_groups(self): def group_name(self, base_filename): """Returns dict with keys group and short_name corresponding to base_filename""" indices = [ - i for i, s in enumerate(self._groups_controlled_vocab) if s.lower() in base_filename.lower() + i for i, s in enumerate(self.groups_controlled_vocab) if s.lower() in base_filename.lower() ] tokens = base_filename.split("_") prefix = "_".join(tokens[:11]) - suffix = self._groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12] + suffix = self.groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12] group_name = f"{prefix}_{self.analysis}_{suffix}" short_name = f"{tokens[9]}_{suffix}" # Prepending POL to short_name return {"group": group_name, "short_name": short_name} - @property - def groups_controlled_vocab(self): - return self._groups_controlled_vocab - - @property - def include_groups(self): - return self._include_groups - - @property - def exclude_groups(self): - return self._exclude_groups - @property def all_groups_dataframe(self): """Returns pandas Dataframe with one row per file""" @@ -290,9 +298,8 @@ def all_groups_dataframe(self): @property def all_groups(self): """Returns a list of Group objects""" - if self._all_groups_valid: + if self._all_groups is not None: return self._all_groups - file_dict = self._files_dict self._all_groups = [] unique_groups = self.all_groups_dataframe[["group", "short_name"]].drop_duplicates() for values in unique_groups.to_dict("index").values(): @@ -302,12 +309,11 @@ def all_groups(self): short_name=values["short_name"], items=[ file_value["object"] - for file_value in file_dict.values() + for file_value in self._files_dict.values() if file_value["group"] == values["group"] ], ) ) - self._all_groups_valid = True return self._all_groups def 
store_all_groups(self, exist_ok=False): @@ -334,7 +340,7 @@ def store_all_groups(self, exist_ok=False): metob.store(self.all_groups) -class MetatlasDataset: +class MetatlasDataset(HasTraits): """ Like the non-object oriented metatlas_dataset, you can index into this class by file_idx and compound_idx: metatlas_dataset = MetatlasDataset(analysis_ids) @@ -353,48 +359,34 @@ class MetatlasDataset: MetatlasDataset also has methods for updating RT values and identification notes while keeping the atlas, atlas_df, metatlas_dataset, and database in sync. This removes the need to do kernel restarts between steps in the workflow. + + + ids: AnalysisIdentifiers instance defining the analysis + save_metadata: if True, write metadata files containing data sources and LCMS runs short name """ - # pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-public-methods - def __init__( - self, - ids, - extra_time=0.75, - extra_mz=0, - keep_nonmatches=True, - frag_mz_tolerance=0.01, - msms_refs_loc=MSMS_REFS_PATH, - max_cpus=1, - save_metadata=True, - ): - """ - inputs: - ids: AnalysisIdentifiers instance defining the analysis - groups_controlled_vocab: array of strings that will group together when creating groups - application of groups_controlled_vocab is case insensitive - exclude_files: array of strings that will exclude files if they are substrings of the filename - save_metadata: if True, write metadata files containing data sources and LCMS runs short name - """ + extra_time = Float(default_value=0.75) + extra_mz = Float(default_value=0) + frag_mz_tolerance = Float(default_value=0.01) + max_cpus = Int(default_value=1) + save_metadata = Bool(default_value=True) + keep_nonmatches = Bool(default_value=True) + msms_refs_loc = Unicode(default_value=MSMS_REFS_PATH) + ids = Instance(klass=AnalysisIdentifiers) + atlas = Instance(klass=metob.Atlas, allow_none=True, default_value=None) + _atlas_df = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) + _data = Tuple(allow_none=True, default_value=None) + _hits = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) + + # pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-public-methods, no-self-use + def __init__(self, **kwargs): + """Constructor""" + super().__init__(**kwargs) logger.debug("Creating new MetatlasDataset instance...") - self.ids = ids - self._atlas = None - self._atlas_valid = False - self._atlas_df = None - self._atlas_df_valid = False - self._data = None - self._data_valid = False - self._hits = None - self._hits_valid = False # based on all hits dependencies except RT min/max values self._hits_valid_for_rt_bounds = False # based only on RT min/max changes - self._extra_time = extra_time - self._extra_mz = extra_mz - self._keep_nonmatches = keep_nonmatches - self._frag_mz_tolerance = frag_mz_tolerance - self._msms_refs_loc = msms_refs_loc - self.max_cpus = max_cpus - if ids.source_atlas is not None: + if self.ids.source_atlas is not None: self._get_atlas() - if save_metadata: + if self.save_metadata: logger.debug("Writing MetatlasDataset metadata files") self.write_data_source_files() self.ids.write_lcmsruns_short_names() @@ -428,8 +420,7 @@ def _get_atlas(self): self.ids.atlas, self.ids.source_atlas, ) - self._atlas = atlases[0] - self._atlas_valid = True + self.atlas = atlases[0] elif len(atlases) > 1: try: raise ValueError( @@ -445,9 +436,8 @@ def _get_atlas(self): logger.info("Retriving source atlas: %s", self.ids.source_atlas) source = get_atlas(self.ids.source_atlas, 
self.ids.username) logger.info("Cloning source atlas") - self._atlas = source.clone() - self._atlas.name = self.ids.atlas - self._atlas_valid = True + self.atlas = source.clone() + self.atlas.name = self.ids.atlas self.store_atlas() def _build(self): @@ -479,12 +469,12 @@ def _build(self): def _remove_compound_id(self, idx): """ - Remove compound identification at index idx from both in db and self._atlas + Remove compound identification at index idx from both in db and self.atlas Does not invalidate _data or _hits or _atlas_df This bypasses several ORM layers and therefore is a hack, but I couldn't get it to work with the ORM. """ - cid_id = self._atlas.compound_identifications[idx].unique_id - atlas_id = self._atlas.unique_id + cid_id = self.atlas.compound_identifications[idx].unique_id + atlas_id = self.atlas.unique_id link_table = "atlases_compound_identifications" target = f"target_id='{cid_id}'" workspace = metob.Workspace.get_instance() @@ -496,7 +486,7 @@ def _remove_compound_id(self, idx): if len(list(links)) == 0: # other atlases are not linked to this CompoundIdentification workspace.db.query(f"delete from compoundidentifications where unique_id='{cid_id}'") workspace.db.commit() - del self._atlas.compound_identifications[idx] + del self.atlas.compound_identifications[idx] except Exception as err: # pylint: disable=broad-except metoh.rollback_and_log(workspace.db, err) workspace.close_connection() @@ -521,14 +511,13 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None): _error_if_bad_idxs(self.atlas_df, remove_idxs) keep_idxs = self.atlas_df.index.difference(remove_idxs) self._atlas_df = self.atlas_df.iloc[keep_idxs].copy().reset_index(drop=True) - self._atlas_df_valid = True - if self._data_valid: + if self._data is not None: self._data = [ [compound for idx, compound in enumerate(sample) if idx in keep_idxs] for sample in self._data ] if remove_idxs is None: remove_idxs = [ - idx for idx, _ in enumerate(self._atlas.compound_identifications) if idx not in keep_idxs + idx for idx, _ in enumerate(self.atlas.compound_identifications) if idx not in keep_idxs ] _ = [self._remove_compound_id(idx) for idx in sorted(remove_idxs, reverse=True)] logger.info( @@ -537,7 +526,7 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None): len(self.atlas_df), start_len - len(self.atlas_df), ) - if self._hits_valid: + if self._hits is not None: self.filter_hits_by_atlas() def filter_hits_by_atlas(self): @@ -621,41 +610,20 @@ def __getitem__(self, idx): """get sample at idx""" return self.data[idx] - def _set_and_invalidate_properties(self, attribute_name, new_value, property_names): - """ - inputs: - attribute_name: name of the class attribute being modified - new_value: value to assign to attribute - property_names: list of names of the class properties that are dependent on the attribute's value - side effects: - If the property is valid and new_value is different from previous value, then invalidate. 
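# The _set_and_invalidate_properties helper whose docstring is being removed
# here implemented manual cache invalidation; the @observe handlers added by
# this patch replace it. A minimal, self-contained sketch of the new pattern
# (the CachedSketch class and its traits are illustrative, not part of
# metatlas):
from traitlets import Float, HasTraits, observe

class CachedSketch(HasTraits):
    extra_time = Float(default_value=0.75)

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._hits = None  # None marks the cached value as invalid

    @observe("extra_time")
    def _observe_extra_time(self, signal):
        if signal.type == "change":
            self._hits = None  # next access to hits recomputes

    @property
    def hits(self):
        if self._hits is None:
            self._hits = [self.extra_time * 2]  # stand-in for the real work
        return self._hits

# Usage: after sketch = CachedSketch() and a first read of sketch.hits,
# assigning sketch.extra_time = 0.5 fires the observer and clears the cache.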
- And set attribute to new_value - """ - for prop in property_names: - valid_attr_name = f"_{prop}_valid" - setattr( - self, - valid_attr_name, - getattr(self, valid_attr_name) and new_value == getattr(self, attribute_name), - ) - setattr(self, f"_{attribute_name}", new_value) - @property def data(self): """data getter, update ._data if necessary""" - if not self._data_valid: + if self._data is None: self._build() - self._data_valid = True return self._data @property def atlas_df(self): """atlas_df getter, update ._atlas_df if necessary""" - if not self._atlas_df_valid: + if self._atlas_df is None: start_time = datetime.datetime.now() logger.info("Generating atlas_df") self._atlas_df = ma_data.make_atlas_df(self.atlas) - self._atlas_df_valid = True logger.info( "Generated atlas_df with %d rows in %s.", len(self.atlas_df), @@ -663,17 +631,12 @@ def atlas_df(self): ) return self._atlas_df - @property - def atlas(self): - """atlas getter""" - return self._atlas - - @atlas.setter - def atlas(self, atlas): - """atlas setter, invalidate atlas_df and data""" - if not isinstance(atlas, metob.Atlas): - raise TypeError("Cannot set atlas to contain a non-Atlas object") - self._set_and_invalidate_properties("atlas", atlas, ["atlas_df", "data"]) + @observe("atlas") + def _observe_atlas(self, signal): + if signal.type == "change": + self._atlas_df = None + self._data = None + logger.debug("Change to atlas invalidates atlas_df, data") @property def polarity(self): @@ -687,60 +650,42 @@ def polarity(self): return "positive" return cid.mz_references[0].detected_polarity - @property - def extra_time(self): - """extra_time getter""" - return self._extra_time - - @extra_time.setter - def extra_time(self, extra_time): - """extra_time setter, invalidates data and hits""" - self._set_and_invalidate_properties("extra_time", extra_time, ["data", "hits"]) - - @property - def extra_mz(self): - """extra_mz getter""" - return self._extra_mz - - @extra_mz.setter - def extra_mz(self, extra_mz): - """extra_mz setter, invalidates data and hits""" - self._set_and_invalidate_properties("extra_mz", extra_mz, ["data", "hits"]) - - @property - def keep_nonmatches(self): - """keep_nonmatches getter""" - return self._keep_nonmatches - - @keep_nonmatches.setter - def keep_nonmatches(self, keep_nonmatches): - """keep_nonmatches setter, invalidates hits""" - self._set_and_invalidate_properties("keep_nonmatches", keep_nonmatches, ["hits"]) - - @property - def frag_mz_tolerance(self): - """frag_mz_tolerance getter""" - return self._frag_mz_tolerance - - @frag_mz_tolerance.setter - def frag_mz_tolerance(self, frag_mz_tolerance): - """frag_mz_tolerance setter, invlidates hits""" - self._set_and_invalidate_properties("frag_mz_tolerance", frag_mz_tolerance, ["hits"]) - - @property - def msms_refs_loc(self): - """msms_refs_loc getter""" - return self._msms_refs_loc - - @msms_refs_loc.setter - def msms_refs_loc(self, msms_refs_loc): - """msms_refs_loc setter, invalidates hits""" - self._set_and_invalidate_properties("msms_refs_loc", msms_refs_loc, ["hits"]) + @observe("extra_time") + def _observe_extra_time(self, signal): + if signal.type == "change": + self._hits = None + self._data = None + logger.debug("Change to extra_time invalidates hits, data") + + @observe("extra_mz") + def _observe_extra_mz(self, signal): + if signal.type == "change": + self._hits = None + self._data = None + logger.debug("Change to extra_mz invalidates hits, data") + + @observe("keep_nonmatches") + def _observe_keep_nonmatches(self, signal): + if signal.type == 
"change": + self._hits = None + logger.debug("Change to keep_nonmatches invalidates hits") + + @observe("frag_mz_tolerance") + def _observe_frag_mz_tolerance(self, signal): + if signal.type == "change": + self._hits = None + logger.debug("Change to frag_mz_tolerance invalidates hits") + + @observe("msms_refs_loc") + def _observe_msms_refs_loc(self, signal): + if signal.type == "change": + self._hits = None + logger.debug("Change to msms_refs_loc invalidates hits") @property def hits(self): """get msms hits DataFrame""" - if not self._hits_valid: + if self._hits is None: logger.info( "Generating hits with extra_time=%.3f, frag_mz_tolerance=%.4f, msms_refs_loc=%s.", self.extra_time, @@ -756,7 +701,6 @@ def hits(self): ref_loc=self.msms_refs_loc, ) logger.info("Generated %d hits in %s.", len(self._hits), _duration_since(start_time)) - self._hits_valid = True self._hits_valid_for_rt_bounds = True return self._hits @@ -766,10 +710,9 @@ def __len__(self): def set_data(self, ids, value): """update a value within self._data""" - if not self._data_valid: + if self._data is None: self._build() - self._data_valid = True - self._atlas_df_valid = False + self._atlas_df = None _set_nested(self._data, ids, value) @property @@ -859,7 +802,7 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): overwrite: if False, throw error if any output files already exist """ if not self._hits_valid_for_rt_bounds: - self._hits_valid = False # force hits to be regenerated + self._hits = None # force hits to be regenerated self.extra_time = 0.5 logger.info("extra_time set to 0.5 minutes for output generation.") targeted_output.write_atlas_to_spreadsheet(self, overwrite) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index 03d45bff..1668fea2 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -25,7 +25,7 @@ def configure_environment(log_level): logger.debug("Running import and environment setup block of notebook.") logger.debug("Configuring notebook environment with console log level of %s.", log_level) os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" - logger.info('Running on git commit: %s', get_repo_hash()) + logger.info("Running on git commit: %s", get_repo_hash()) def configure_pandas_display(max_rows=5000, max_columns=500, max_colwidth=100): diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb index 6560378a..9383dc26 100644 --- a/notebooks/reference/RT_Prediction.ipynb +++ b/notebooks/reference/RT_Prediction.ipynb @@ -122,7 +122,7 @@ "metadata": {}, "outputs": [], "source": [ - "predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path)", + "predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path)" ] } ], diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index 3baf8ea8..d567aa03 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -157,7 +157,14 @@ "outputs": [], "source": [ "ids = mads.AnalysisIdentifiers(\n", - " source_atlas, experiment, output_type, polarity, analysis_number, project_directory, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files\n", + " source_atlas=source_atlas,\n", + " experiment=experiment,\n", + " output_type=output_type,\n", + " polarity=polarity,\n", + " analysis_number=analysis_number,\n", + " project_directory=project_directory,\n", + " groups_controlled_vocab=groups_controlled_vocab,\n", + " exclude_files=exclude_files,\n", ")" ] }, @@ -167,7 
+174,7 @@ "metadata": {}, "outputs": [], "source": [ - "metatlas_dataset = mads.MetatlasDataset(ids, max_cpus=max_cpus)\n" + "metatlas_dataset = mads.MetatlasDataset(ids=ids, max_cpus=max_cpus)" ] }, { diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 938e2794..cf970e80 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -31,12 +31,12 @@ def fixture_username(): def fixture_analysis_ids(sqlite_with_atlas, username, lcmsrun, mocker, groups_controlled_vocab): mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) return mads.AnalysisIdentifiers( - f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - "FinalEMA-HILIC", - "positive", - 0, - str(os.getcwd()), + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number=0, + project_directory=str(os.getcwd()), groups_controlled_vocab=groups_controlled_vocab, ) @@ -47,12 +47,12 @@ def fixture_analysis_ids_with_2_cids( ): mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) return mads.AnalysisIdentifiers( - f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}1", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - "FinalEMA-HILIC", - "positive", - 0, - str(os.getcwd()), + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}1", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number=0, + project_directory=str(os.getcwd()), groups_controlled_vocab=groups_controlled_vocab, ) @@ -545,7 +545,7 @@ def fixture_metatlas_dataset(mocker, df_container, analysis_ids, lcmsrun, sqlite "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - return mads.MetatlasDataset(analysis_ids, save_metadata=False) + return mads.MetatlasDataset(ids=analysis_ids, save_metadata=False) @pytest.fixture(name="metatlas_dataset_with_2_cids") @@ -560,7 +560,7 @@ def fixture_metatlas_dataset_with_2_cids( "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - return mads.MetatlasDataset(analysis_ids_with_2_cids, save_metadata=False) + return mads.MetatlasDataset(ids=analysis_ids_with_2_cids, save_metadata=False) @pytest.fixture(name="eic") diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 7438158f..b61c2e1b 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -7,6 +7,7 @@ import pandas as pd import pytest +import traitlets from metatlas.datastructures import metatlas_dataset as mads from metatlas.datastructures import metatlas_objects as metob @@ -26,12 +27,12 @@ def test_metatlas_dataset_build01(metatlas_dataset): @pytest.mark.xfail -def test_metatlas_dataset_build02(mocker, atlas, group_with_2_lcmsruns, df_container): +def test_metatlas_dataset_build02(mocker, atlas, group_with_2_lcmsruns, df_container, analysis_ids): # need to mock multiprocessing for this to work mocker.patch( 
"metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) - metatlas_dataset = mads.MetatlasDataset(atlas, [group_with_2_lcmsruns], max_cpus=2) + metatlas_dataset = mads.MetatlasDataset(ids=analysis_ids, max_cpus=2) assert len(metatlas_dataset) == 2 assert len(metatlas_dataset[0]) == 1 @@ -110,11 +111,11 @@ def test_polarity(metatlas_dataset): def test_extra_time_setter(metatlas_dataset, hits, mocker): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) metatlas_dataset.hits # pylint: disable=pointless-statement - assert metatlas_dataset._hits_valid + assert metatlas_dataset._hits is not None metatlas_dataset.extra_time = 0.3 - assert not metatlas_dataset._hits_valid + assert metatlas_dataset._hits is None metatlas_dataset.hits # pylint: disable=pointless-statement - assert metatlas_dataset._hits_valid + assert metatlas_dataset._hits is not None def test_rts01(metatlas_dataset): @@ -124,7 +125,7 @@ def test_rts01(metatlas_dataset): def test_rts02(metatlas_dataset): - metatlas_dataset._atlas_df_valid = False + metatlas_dataset._atlas_df = None metatlas_dataset.set_rt(0, "rt_max", 9.99) assert metatlas_dataset.rts[0].rt_max == 9.99 assert len(metatlas_dataset.rts) == 1 @@ -133,7 +134,7 @@ def test_rts02(metatlas_dataset): def test_rts03(metatlas_dataset, analysis_ids): assert metatlas_dataset.rts[0].rt_max != 9.99 metatlas_dataset.set_rt(0, "rt_max", 9.99) - second_metatlas_dataset = mads.MetatlasDataset(analysis_ids) + second_metatlas_dataset = mads.MetatlasDataset(ids=analysis_ids) assert second_metatlas_dataset.rts[0].rt_max == 9.99 assert len(second_metatlas_dataset.rts) == 1 @@ -143,12 +144,12 @@ def test_rts04(analysis_ids, sqlite_with_atlas, mocker, lcmsrun, df_container): "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) - first = mads.MetatlasDataset(analysis_ids) + first = mads.MetatlasDataset(ids=analysis_ids) first.set_rt(0, "rt_max", 1.11) - second = mads.MetatlasDataset(analysis_ids) + second = mads.MetatlasDataset(ids=analysis_ids) assert second.rts[0].rt_max == 1.11 second.set_rt(0, "rt_max", 2.22) - third = mads.MetatlasDataset(analysis_ids) + third = mads.MetatlasDataset(ids=analysis_ids) assert third.rts[0].rt_max == 2.22 @@ -158,7 +159,7 @@ def test_set_note01(metatlas_dataset, sqlite): def test_set_note02(metatlas_dataset): - metatlas_dataset._atlas_df_valid = False + metatlas_dataset._atlas_df = None metatlas_dataset.set_note(0, "ms1_notes", "keeper") assert metatlas_dataset[0][0]["identification"].ms1_notes == "keeper" @@ -295,12 +296,12 @@ def test_export_atlas_to_csv01(metatlas_dataset, tmp_path): def test_atlas_setter01(metatlas_dataset, atlas_with_2_cids): metatlas_dataset.data # pylint: disable=pointless-statement metatlas_dataset.atlas = atlas_with_2_cids - assert not metatlas_dataset._data_valid + assert metatlas_dataset._data is None assert len(metatlas_dataset[0]) == 2 def test_atlas_setter02(metatlas_dataset): - with pytest.raises(TypeError): + with pytest.raises(traitlets.traitlets.TraitError): metatlas_dataset.atlas = [1, 2] @@ -313,8 +314,8 @@ def test_set_extra_mz_setter(metatlas_dataset, mocker, hits): metatlas_dataset.data # pylint: disable=pointless-statement metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.extra_mz = 0.43 - assert not metatlas_dataset._data_valid - assert not metatlas_dataset._hits_valid + assert 
metatlas_dataset._data is None + assert metatlas_dataset._hits is None assert metatlas_dataset.extra_mz == 0.43 @@ -322,7 +323,7 @@ def test_set_keep_nonmatches_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.keep_nonmatches = False - assert not metatlas_dataset._hits_valid + assert metatlas_dataset._hits is None assert not metatlas_dataset.keep_nonmatches @@ -330,7 +331,7 @@ def test_set_frag_mz_tolerance_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.frag_mz_tolerance = 1e-4 - assert not metatlas_dataset._hits_valid + assert metatlas_dataset._hits is None assert metatlas_dataset.frag_mz_tolerance == 1e-4 @@ -338,7 +339,7 @@ def test_set_msms_refs_loc_setter(metatlas_dataset, mocker, hits): mocker.patch("metatlas.plots.dill2plots.get_msms_hits", return_value=hits) metatlas_dataset.hits # pylint: disable=pointless-statement metatlas_dataset.msms_refs_loc = "/tmp/some_file.tab" - assert not metatlas_dataset._hits_valid + assert metatlas_dataset._hits is None assert metatlas_dataset.msms_refs_loc == "/tmp/some_file.tab" @@ -411,91 +412,100 @@ def test_store_atlas07(atlas, sqlite, username): def test_analysis_identifiers01(sqlite): - with pytest.raises(ValueError, match=r"Database does not contain an atlas.*"): + with pytest.raises(traitlets.traitlets.TraitError, match=r"Database does not contain an atlas.*"): mads.AnalysisIdentifiers( - "source_atlas_name_not_valid", - "experiment_not_valid", - "output_type_not_valid", - "polarity_not_valid", - "analysis_number_not_valid", - "/foo/bar", + source_atlas="Not_A_Real_Atlas_Name", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number=1, + project_directory="/foo/bar", ) def test_analysis_identifiers02(sqlite_with_atlas, username): with pytest.raises( - ValueError, match='Parameter output_type is not one of: "ISTDsEtc", "FinalEMA-HILIC".' 
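# Note on the exception change below: traitlets raises TraitError both when a
# value fails a trait's built-in type check and when a @validate handler
# rejects it, so the ValueError/TypeError pairs these tests used to expect
# collapse into a single exception type. A hedged, minimal illustration (the
# Setting class is hypothetical, not part of metatlas):
import traitlets

class Setting(traitlets.HasTraits):
    polarity = traitlets.Unicode()

    @traitlets.validate("polarity")
    def _valid_polarity(self, proposal):
        if proposal["value"] not in ("positive", "negative"):
            raise traitlets.TraitError("polarity must be positive or negative")
        return proposal["value"]

# Setting(polarity=1) raises TraitError (built-in type check);
# Setting(polarity="sideways") raises TraitError (rejected by the validator).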
+ traitlets.traitlets.TraitError, + match="Parameter output_type must be one of ISTDsEtc, FinalEMA-HILIC, data_QC", ): mads.AnalysisIdentifiers( - f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - "output_type_not_valid", - "polarity_not_valid", - "analysis_number_not_valid", - "/foo/bar", + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="output_type_not_valid", + polarity="positive", + analysis_number=1, + project_directory="/foo/bar", ) def test_analysis_identifiers03(username, sqlite_with_atlas): with pytest.raises( - ValueError, - match='Parameter polarity is not one of: "positive", "negative", "fast-polarity-switching".', + traitlets.traitlets.TraitError, + match="Parameter polarity must be one of positive, negative, fast-polarity-switching", ): mads.AnalysisIdentifiers( - f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - "FinalEMA-HILIC", - "polarity_not_valid", - "analysis_number_not_valid", - "/foo/bar", + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="not a polarity value", + analysis_number=1, + project_directory="/foo/bar", ) def test_analysis_identifiers04(username, sqlite_with_atlas): - with pytest.raises(TypeError, match="Parameter analysis_number is not an integer."): + with pytest.raises( + traitlets.traitlets.TraitError, + match="The 'analysis_number' trait of an AnalysisIdentifiers instance expected an int, not", + ): mads.AnalysisIdentifiers( - f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - "FinalEMA-HILIC", - "positive", - "analysis_number_not_valid", - "/foo/bar", + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number="this is a string", + project_directory="/foo/bar", ) def test_analysis_identifiers05(username, sqlite_with_atlas): - with pytest.raises(TypeError, match="Parameter analysis_number is not an integer."): + with pytest.raises( + traitlets.traitlets.TraitError, + match="The 'analysis_number' trait of an AnalysisIdentifiers instance expected an int, not", + ): mads.AnalysisIdentifiers( - f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - "FinalEMA-HILIC", - "positive", - "1", - "/foo/bar", + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number="1", + project_directory="/foo/bar", ) def test_analysis_identifiers06(username, sqlite_with_atlas): - with pytest.raises(ValueError, match="Parameter analysis_number cannot be negative."): + with pytest.raises(traitlets.traitlets.TraitError, match="Parameter analysis_number cannot be negative."): mads.AnalysisIdentifiers( - 
f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - "FinalEMA-HILIC", - "positive", - -9, - "/foo/bar", + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number=-9, + project_directory="/foo/bar", ) def test_analysis_identifiers07(username, sqlite_with_atlas): - with pytest.raises(ValueError, match='Parameter experiment does contain 9 fields when split on "_".'): + with pytest.raises( + traitlets.traitlets.TraitError, match='Parameter experiment does contain 9 fields when split on "_".' + ): mads.AnalysisIdentifiers( - f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - "experiment_name_not_valid", - "output_type_not_valid", - "polarity_not_valid", - "analysis_number_not_valid", - "/foo/bar", + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="experiment_name_not_valid", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number=0, + project_directory="/foo/bar", ) @@ -531,7 +541,7 @@ def test_get_atlas01(mocker, analysis_ids, df_container, lcmsrun, atlas, usernam ) mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun]) mocker.patch("glob.glob", return_value=range(10)) - metatlas_dataset = mads.MetatlasDataset(analysis_ids) + metatlas_dataset = mads.MetatlasDataset(ids=analysis_ids) assert metatlas_dataset.atlas.name == f"505892_OakGall_final_FinalEMA-HILIC_POS_{username}0" @@ -539,14 +549,14 @@ def test_get_atlas02(mocker, analysis_ids, caplog): mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[]) caplog.set_level(logging.INFO) with pytest.raises(ValueError): - mads.MetatlasDataset(analysis_ids) + mads.MetatlasDataset(ids=analysis_ids) assert "Database does not contain an atlas" in caplog.text def test_get_atlas03(mocker, analysis_ids, caplog, username): mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[0, 0]) with pytest.raises(ValueError): - mads.MetatlasDataset(analysis_ids) + mads.MetatlasDataset(ids=analysis_ids) atlas = f"505892_OakGall_final_FinalEMA-HILIC_POS_{username}0" assert f"2 atlases with name {atlas} and owned by {username} already exist." 
in caplog.text From 410779bb97851b3dceb4aff9468d343858bdf90e Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 5 Aug 2021 13:05:29 -0700 Subject: [PATCH 065/177] Fix jq version in testing image --- docker/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6cd3e411..eb62ab11 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -8,8 +8,9 @@ ENV METATLAS_LOCAL=True EXPOSE 8888 -RUN apt-get update && apt-get install -y jq && \ - rm -rf /var/lib/apt/lists/* +ADD https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 /usr/local/bin/jq + +RUN chmod +x /usr/local/bin/jq COPY requirements.txt /requirements.txt From 9a20bf4736c7e5ad25fde666a2d51a2a947717aa Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 5 Aug 2021 13:08:04 -0700 Subject: [PATCH 066/177] Improvements to building images --- .gitignore | 1 + docker/build.sh | 73 +++++++ docker/internal_extract.sh | 26 +++ docker/mysql_to_sqlite_filtered.sh | 18 ++ docker/rt_predict_test_case_from_db.sql | 184 ++++++++++++++++++ ..._db.sql => targeted_test_case_from_db.sql} | 1 + 6 files changed, 303 insertions(+) create mode 100755 docker/build.sh create mode 100755 docker/internal_extract.sh create mode 100755 docker/mysql_to_sqlite_filtered.sh create mode 100644 docker/rt_predict_test_case_from_db.sql rename docker/{extract_test_case_from_db.sql => targeted_test_case_from_db.sql} (99%) diff --git a/.gitignore b/.gitignore index 18e38d3a..a0036ecd 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ target/ *.db *-shm *-wal +dumps/ # kbase installation directories bootstrap/ diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 00000000..4007062b --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +# catch some common errors, terminate if a command returns non-zero exit code +set -euf -o pipefail + +SPIN_USER="$USER" +PROJECT="metatlas_test" +REGISTRY="registry.spin.nersc.gov" +DOCKER="docker" + +IMAGE_NAME="metatlas_ci01" +TAG="" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -d|--docker) DOCKER="$2"; shift ;; + -i|--image) IMAGE_NAME="$2"; shift ;; + -r|--registry) REGISTRY="$2"; shift ;; + -p|--project) PROJECT="$2"; shift ;; + -t|--tag) TAG="$2"; shift ;; + -u|--user) SPIN_USER="$2"; shift ;; + -h|--help) + echo -e "$0 [options]" + echo "" + echo " -h, --help show this command reference" + echo " -d, --docker name of docker command (default ${DOCKER})" + echo " -i, --image string name of image to build (default ${IMAGE_NAME})" + echo " -p, --project string project name within the registry (default ${PROJECT})" + echo " -r, --registry string FQDN of container registry to push to" + echo " use 'NONE' to not push (default ${REGISTRY})" + echo " -t, --tag string image tag" + echo " -u, --user string username for ${REGISTRY} (default ${USER})" + exit 0 + ;; + *)echo "Unknown parameter passed: $1"; exit 1 ;; + esac + shift +done + +if [[ "$TAG" == "" ]]; then + >&2 echo "ERROR: no tag value given" + exit 9 +fi + +SHORT_TAG="${IMAGE_NAME}:${TAG}" +LONG_TAG="${REGISTRY}/${PROJECT}/${SHORT_TAG}" + +DOCKERFILE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +if [[ ! -r "${DOCKERFILE_DIR}/Dockerfile" ]]; then + >&2 echo "ERROR: Could not find readable Dockerfile in ${DOCKERFILE_DIR}." + exit 1 +fi + +${DOCKER} image build --tag "${SHORT_TAG}" "${DOCKERFILE_DIR}" + +if [[ "$REGISTRY" != "NONE" ]]; then + if [[ $(uname -s) == "Darwin" ]]; then + # no readlink on macOS... 
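    # (Hedged aside: podman pushes OCI-format manifests by default and some
    # registries only accept Docker-format ones, which is presumably why
    # --format=docker is set when the docker command resolves to podman; the
    # basename/readlink checks below handle "docker" being a shim or symlink
    # for podman.)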
+ if [[ $(basename $(which ${DOCKER})) == 'podman' ]]; then + PUSH_FLAGS="--format=docker" + fi + else + if [[ $(basename $(readlink -f $(which ${DOCKER}))) == 'podman' ]]; then + PUSH_FLAGS="--format=docker" + fi + fi + ${DOCKER} image tag "${SHORT_TAG}" "${LONG_TAG}" + ${DOCKER} image push ${PUSH_FLAGS:-} "${LONG_TAG}" + TAG="${LONG_TAG}" +else + TAG="${SHORT_TAG}" +fi diff --git a/docker/internal_extract.sh b/docker/internal_extract.sh new file mode 100755 index 00000000..52e79b47 --- /dev/null +++ b/docker/internal_extract.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -euf -o pipefail + +if [ "$#" -ne 2 ]; then + echo "Usage $0 script_file mysql_password" + exit 1 +fi + +# install mysql-to-sqlite3 +apt-get update +apt-get install -y python3 python3-pip +pip3 install mysql-to-sqlite3 + +# wait for mysql to be ready for connections +while ! mysqladmin ping --silent; do + sleep 1 +done + +# reduce database contents using SQL script +mysql "--password=$2" meta_atlas < "/script/$1" + +TIMESTAMP="$(date "+%Y-%m-%d-%H-%M")" + +# save reduced database to sqlite3 format +mysql2sqlite -f "/docker-entrypoint-initdb.d/meta_atlas-${TIMESTAMP}.sqlite3" -d meta_atlas -u root --mysql-password "$2" diff --git a/docker/mysql_to_sqlite_filtered.sh b/docker/mysql_to_sqlite_filtered.sh new file mode 100755 index 00000000..c7568508 --- /dev/null +++ b/docker/mysql_to_sqlite_filtered.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -euf -o pipefail + +if [ "$#" -ne 2 ]; then + echo "Usage $0 script_file mysql_password" + exit 1 +fi + +# cat /global/cfs/cdirs/metatlas/mysql_user.txt + +TIMESTAMP="$(date "+%Y-%m-%d-%H-%M")" +mkdir -p dumps +docker run -it --rm -v "$(pwd)/dumps:/dumps" mysql:5.7 \ + /usr/bin/mysqldump -h nerscdb04.nersc.gov -u meta_atlas_admin -p --all-databases "--result-file=/dumps/meta_atlas_all_dbs-${TIMESTAMP}.sql" +MYSQL_ID="$(docker run -it --rm -e "MYSQL_ROOT_PASSWORD=$2"-v "$(pwd)/dumps:/docker-entrypoint-initdb.d" -v "$(pwd):/script" mysql:5.7)" +docker exec -it "$MYSQL_ID" /script/internal_extract.sh "$1" "$2" +docker stop "$MYSQL_ID" diff --git a/docker/rt_predict_test_case_from_db.sql b/docker/rt_predict_test_case_from_db.sql new file mode 100644 index 00000000..5882189a --- /dev/null +++ b/docker/rt_predict_test_case_from_db.sql @@ -0,0 +1,184 @@ +-- ##### SETUP RT PREDICT TEST DATA ###### + +/* +This is run against a copy of the meta_atlas mysql database. 
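-- (The manual steps described in this header are automated by the
-- mysql_to_sqlite_filtered.sh and internal_extract.sh scripts added in this
-- same commit, e.g.:
--   ./mysql_to_sqlite_filtered.sh rt_predict_test_case_from_db.sql <mysql_password>)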
+Usually this is done by spinning up a mysql container +and loading an all database dump: + +mysql password is on NERSC at +/global/cfs/cdirs/metatlas/mysql_user.txt + +conda install -c ostrokach mysql-client=5.7.10 +mkdir -p dumps +mysqldump -h nerscdb04.nersc.gov -u meta_atlas_admin -p --all-databases > dumps/meta_atlas.sql +docker run -it --rm -e MYSQL_ROOT_PASSWORD=mypw -v $(pwd)/dumps:/docker-entrypoint-initdb.d -v $(pwd):/script mysql:5.7 +MYSQL_ID=$(docker ps | grep mysql | cut -f1 -d' ') +docker exec -it $MYSQL_ID /bin/bash + +# then within the mysql container's shell: +mysql --password=mypw meta_atlas < /script/extract_test_case_from_db.sql + +From within the atlasdb container +apt-get update +apt-get install -y python3 python3-pip +pip3 install mysql-to-sqlite3 +mysql2sqlite -f /docker-entrypoint-initdb.d/meta_atlas.sqlite3 -d meta_atlas -p -u root +exit + + +*/ + +SET GLOBAL sql_mode=(SELECT REPLACE(@@sql_mode,'ONLY_FULL_GROUP_BY','')); + +-- remove tables that are not used +DROP TABLE IF EXISTS `Compounds`; +DROP TABLE IF EXISTS `Group`; +DROP TABLE IF EXISTS `group`; + +-- clean out tables we don't need pre-populated values in +DELETE FROM groups; +DELETE FROM groups_items; +DELETE FROM methods; +DELETE FROM samples; +DELETE FROM mzintensitypairs; +DELETE FROM identificationgrades; +DELETE FROM functionalsets; +DELETE FROM fragmentationreferences_mz_intensities; +DELETE FROM compoundidentifications_frag_references; +DELETE FROM fragmentationreferences; +DELETE FROM lcmsruns; + +-- delete atlases that are not the most recent version of the atlases listed +DELETE FROM atlases +WHERE unique_id NOT IN ( + SELECT error_fix.unique_id FROM ( -- extra SELECT required - https://stackoverflow.com/questions/45494/ + SELECT first.unique_id + FROM atlases AS first + JOIN ( + SELECT unique_id, max(last_modified) as last_modified + FROM atlases + WHERE name IN ( + 'HILICz150_ANT20190824_TPL_EMA_Unlab_POS', + 'HILICz150_ANT20190824_TPL_QCv3_Unlab_POS', + 'HILICz150_ANT20190824_TPL_ISv5_Unlab_POS', + 'HILICz150_ANT20190824_TPL_ISv5_13C15N_POS', + 'HILICz150_ANT20190824_TPL_IS_LabUnlab2_POS', + 'HILICz150_ANT20190824_TPL_EMA_Unlab_NEG', + 'HILICz150_ANT20190824_TPL_QCv3_Unlab_NEG', + 'HILICz150_ANT20190824_TPL_ISv5_Unlab_NEG', + 'HILICz150_ANT20190824_TPL_ISv5_13C15N_NEG', + 'HILICz150_ANT20190824_TPL_IS_LabUnlab2_NEG') AND + username = 'vrsingan' + GROUP BY name + ) AS newest + ON first.unique_id = newest.unique_id + ) as error_fix +); + + +-- delete compounds not in the atlas list +DELETE FROM compounds +WHERE unique_id NOT IN ( + SELECT error_fix.unique_id FROM ( -- extra SELECT required - https://stackoverflow.com/questions/45494/ + SELECT c.unique_id + FROM atlases_compound_identifications AS aci + JOIN compoundidentifications AS ci + ON aci.target_id = ci.unique_id + JOIN compoundidentifications_compound AS cic + ON ci.unique_id = cic.source_id + JOIN compounds as c + ON cic.target_id = c.unique_id + WHERE aci.source_id IN ( + SELECT first.unique_id + FROM atlases AS first + JOIN ( + SELECT unique_id, max(last_modified) as last_modified + FROM atlases + WHERE name IN ( + 'HILICz150_ANT20190824_TPL_EMA_Unlab_POS', + 'HILICz150_ANT20190824_TPL_QCv3_Unlab_POS', + 'HILICz150_ANT20190824_TPL_ISv5_Unlab_POS', + 'HILICz150_ANT20190824_TPL_ISv5_13C15N_POS', + 'HILICz150_ANT20190824_TPL_IS_LabUnlab2_POS', + 'HILICz150_ANT20190824_TPL_EMA_Unlab_NEG', + 'HILICz150_ANT20190824_TPL_QCv3_Unlab_NEG', + 'HILICz150_ANT20190824_TPL_ISv5_Unlab_NEG', + 'HILICz150_ANT20190824_TPL_ISv5_13C15N_NEG', + 
'HILICz150_ANT20190824_TPL_IS_LabUnlab2_NEG') AND + username = 'vrsingan' + GROUP BY name + ) AS newest + ON first.unique_id = newest.unique_id + ) + ) AS error_fix +); + +-- work from compounds up to atlases_compound_identifications +DELETE cic +FROM compoundidentifications_compound AS cic +LEFT JOIN compounds AS c +ON cic.target_id=c.unique_id +WHERE c.unique_id is null; + +DELETE ci +FROM compoundidentifications AS ci +LEFT JOIN compoundidentifications_compound AS cic +ON ci.unique_id=cic.source_id +WHERE cic.source_id is null; + +DELETE aci +FROM atlases_compound_identifications AS aci +LEFT JOIN compoundidentifications AS ci +ON aci.target_id=ci.unique_id +WHERE ci.unique_id is null; + +-- work from atlases_compound_identifications down to everything else +DELETE atlases_compound_identifications +FROM atlases_compound_identifications +LEFT JOIN atlases +ON atlases.unique_id=atlases_compound_identifications.source_id +WHERE atlases.unique_id is null; + +DELETE compoundidentifications +FROM compoundidentifications +LEFT JOIN atlases_compound_identifications AS aci +ON aci.target_id=compoundidentifications.unique_id +WHERE aci.target_id is null; + +DELETE compoundidentifications_compound +FROM compoundidentifications_compound +LEFT JOIN compoundidentifications AS ci +ON ci.unique_id=compoundidentifications_compound.head_id +WHERE ci.unique_id is null; + +DELETE compoundidentifications_rt_references +FROM compoundidentifications_rt_references +LEFT JOIN compoundidentifications AS ci +ON ci.unique_id=compoundidentifications_rt_references.head_id +WHERE ci.unique_id is null; + +DELETE compoundidentifications_mz_references +FROM compoundidentifications_mz_references +LEFT JOIN compoundidentifications AS ci +ON ci.unique_id=compoundidentifications_mz_references.head_id +WHERE ci.unique_id is null; + +DELETE compounds +FROM compounds +LEFT JOIN compoundidentifications_compound AS cic +ON compounds.head_id=cic.target_id +WHERE cic.target_id is null; + +DELETE rtreferences +FROM rtreferences +LEFT JOIN compoundidentifications_rt_references AS cirr +ON rtreferences.head_id=cirr.target_id +WHERE cirr.target_id is null; + +DELETE mzreferences +FROM mzreferences +LEFT JOIN compoundidentifications_mz_references AS cimr +ON mzreferences.head_id=cimr.target_id +WHERE cimr.target_id is null; + +-- SELECT table_name, table_rows FROM information_schema.TABLES WHERE table_schema='meta_atlas' ORDER BY table_rows DESC; diff --git a/docker/extract_test_case_from_db.sql b/docker/targeted_test_case_from_db.sql similarity index 99% rename from docker/extract_test_case_from_db.sql rename to docker/targeted_test_case_from_db.sql index 223168e2..76d8017d 100644 --- a/docker/extract_test_case_from_db.sql +++ b/docker/targeted_test_case_from_db.sql @@ -5,6 +5,7 @@ This is run against a copy of the meta_atlas mysql database. 
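-- (This file is the former extract_test_case_from_db.sql, renamed so the
-- targeted test fixture and the new rt_predict_test_case_from_db.sql fixture
-- can sit side by side with matching headers.)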
Usually this is done by spinning up a mysql container and loading an all database dump: +conda install -c ostrokach mysql-client=5.7.10 mkdir -p dumps mysqldump -h nerscdb04.nersc.gov -u meta_atlas_admin -p --all-databases > dumps/meta_atlas.sql docker run -it --rm -e MYSQL_ROOT_PASSWORD=mypw -v $(pwd)/dumps:/docker-entrypoint-initdb.d -v $(pwd):/script mysql:5.7 From ad7d9208a8d2d50977b1dfc2e2ad8118ea2d0218 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 6 Aug 2021 20:42:49 -0700 Subject: [PATCH 067/177] WIP - building RT prediction system test --- docker/{Dockerfile => Dockerfile.ci01} | 2 + docker/Dockerfile.ci02 | 27 +++++++++++++ docker/build.sh | 25 +++++++++--- docker/build_x.sh | 51 +++++++++++++++++++++++++ docker/internal_extract.sh | 2 +- docker/mysql_to_sqlite_filtered.sh | 12 ++++-- docker/rt_predict_test_case_from_db.sql | 18 ++++++++- 7 files changed, 126 insertions(+), 11 deletions(-) rename docker/{Dockerfile => Dockerfile.ci01} (93%) create mode 100644 docker/Dockerfile.ci02 create mode 100755 docker/build_x.sh diff --git a/docker/Dockerfile b/docker/Dockerfile.ci01 similarity index 93% rename from docker/Dockerfile rename to docker/Dockerfile.ci01 index eb62ab11..d42d2450 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile.ci01 @@ -1,5 +1,7 @@ FROM python:3.8-slim-buster +# https://portal.nersc.gov/cfs/m2650/metatlas/test_data +# serves from /global/cfs/cdirs/m2650/www/metatlas/test_data ARG BASE_DATA_URL=https://portal.nersc.gov/cfs/m2650/metatlas/test_data/ci01 ARG REFS_DIR=/global/project/projectdirs/metatlas/projects/spectral_libraries ARG H5_DIR=/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 diff --git a/docker/Dockerfile.ci02 b/docker/Dockerfile.ci02 new file mode 100644 index 00000000..70c50e22 --- /dev/null +++ b/docker/Dockerfile.ci02 @@ -0,0 +1,27 @@ +FROM python:3.8-slim-buster + +# https://portal.nersc.gov/cfs/m2650/metatlas/test_data +# serves from /global/cfs/cdirs/m2650/www/metatlas/test_data +ARG BASE_DATA_URL=https://portal.nersc.gov/cfs/m2650/metatlas/test_data/ci02 +ARG REFS_DIR=/global/project/projectdirs/metatlas/projects/spectral_libraries +ARG H5_DIR=/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 + +ENV METATLAS_LOCAL=True + +EXPOSE 8888 + +RUN apt-get update && apt-get install -y libxrender1 && \ + rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /io /src /work $REFS_DIR $H5_DIR +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5 $H5_DIR/ + +COPY requirements.txt /requirements.txt +RUN pip install --quiet -r requirements.txt + +ADD $BASE_DATA_URL/meta_atlas_rt_predict.sqlite3 /work/root_workspace.db + +WORKDIR /work + +CMD ["/usr/local/bin/jupyter", "nbclassic", "--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] diff --git a/docker/build.sh b/docker/build.sh index 4007062b..1262cf93 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -7,13 +7,15 @@ SPIN_USER="$USER" PROJECT="metatlas_test" REGISTRY="registry.spin.nersc.gov" DOCKER="docker" +DOCKERFILE="" -IMAGE_NAME="metatlas_ci01" +IMAGE_NAME="" TAG="" while [[ "$#" -gt 0 ]]; do case "$1" in -d|--docker) DOCKER="$2"; shift ;; + -f|--dockerfile) DOCKERFILE="$2"; shift ;; -i|--image) IMAGE_NAME="$2"; 
shift ;; -r|--registry) REGISTRY="$2"; shift ;; -p|--project) PROJECT="$2"; shift ;; @@ -24,7 +26,8 @@ while [[ "$#" -gt 0 ]]; do echo "" echo " -h, --help show this command reference" echo " -d, --docker name of docker command (default ${DOCKER})" - echo " -i, --image string name of image to build (default ${IMAGE_NAME})" + echo " -f, --dockerfile name of Dockerfile" + echo " -i, --image string name of image to build" echo " -p, --project string project name within the registry (default ${PROJECT})" echo " -r, --registry string FQDN of container registry to push to" echo " use 'NONE' to not push (default ${REGISTRY})" @@ -37,6 +40,16 @@ while [[ "$#" -gt 0 ]]; do shift done +if [[ "$IMAGE_NAME" == "" ]]; then + >&2 echo "ERROR: no Dockerfile value given" + exit 10 +fi + +if [[ "$DOCKERFILE" == "" ]]; then + >&2 echo "ERROR: no Dockerfile value given" + exit 10 +fi + if [[ "$TAG" == "" ]]; then >&2 echo "ERROR: no tag value given" exit 9 @@ -45,14 +58,14 @@ fi SHORT_TAG="${IMAGE_NAME}:${TAG}" LONG_TAG="${REGISTRY}/${PROJECT}/${SHORT_TAG}" -DOCKERFILE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -if [[ ! -r "${DOCKERFILE_DIR}/Dockerfile" ]]; then - >&2 echo "ERROR: Could not find readable Dockerfile in ${DOCKERFILE_DIR}." +if [[ ! -r "${DOCKERFILE}" ]]; then + >&2 echo "ERROR: Could not find readable Dockerfile at ${DOCKERFILE}." exit 1 fi -${DOCKER} image build --tag "${SHORT_TAG}" "${DOCKERFILE_DIR}" +${DOCKER} image build --tag "${SHORT_TAG}" --file "${DOCKERFILE}" "$SCRIPT_DIR" if [[ "$REGISTRY" != "NONE" ]]; then if [[ $(uname -s) == "Darwin" ]]; then diff --git a/docker/build_x.sh b/docker/build_x.sh new file mode 100755 index 00000000..6332c308 --- /dev/null +++ b/docker/build_x.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -euf -o pipefail + +SPIN_USER="$USER" +PROJECT="metatlas_test" +REGISTRY="registry.spin.nersc.gov" +DOCKER="docker" +TAG="" +ID="" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + -d|--docker) DOCKER="$2"; shift ;; + -i|--id) ID="$2"; shift ;; + -p|--project) PROJECT="$2"; shift ;; + -r|--registry) REGISTRY="$2"; shift ;; + -t|--tag) TAG="$2"; shift ;; + -u|--user) SPIN_USER="$2"; shift ;; + -h|--help) + echo -e "$0 [options]" + echo "" + echo " -h, --help show this command reference" + echo " -d, --docker name of docker command (default ${DOCKER})" + echo " -i, --id image id" + echo " -p, --project string project name within the registry (default ${PROJECT})" + echo " -r, --registry string FQDN of container registry to push to" + echo " use 'NONE' to not push (default ${REGISTRY})" + echo " -t, --tag string image tag" + echo " -u, --user string username for ${REGISTRY} (default ${USER})" + exit 0 + ;; + *)echo "Unknown parameter passed: $1"; exit 1 ;; + esac + shift +done + +if [[ "$TAG" == "" ]]; then + >&2 echo "ERROR: no tag value given" + exit 1 +fi + +if [[ "$ID" == "" ]]; then + >&2 echo "ERROR: no id value given" + exit 2 +fi + +DOCKERFILE="Dockerfile.$ID" +IMAGE_NAME="metatlas_$ID" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +"${SCRIPT_DIR}/build.sh" --image "$IMAGE_NAME" --tag "$TAG" --docker "$DOCKER" --project "$PROJECT" --registry "$REGISTRY" --user "$USER" --dockerfile "${SCRIPT_DIR}/$DOCKERFILE" diff --git a/docker/internal_extract.sh b/docker/internal_extract.sh index 52e79b47..95ab6d8e 100755 --- a/docker/internal_extract.sh +++ b/docker/internal_extract.sh @@ -13,7 +13,7 @@ apt-get install -y python3 python3-pip 
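# (Context for the change below: with MYSQL_ROOT_PASSWORD set, an
# unauthenticated "mysqladmin ping" is rejected, so the password is
# presumably being passed to let the readiness probe authenticate rather than
# depend on how mysqladmin reports an access-denied reply.)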
pip3 install mysql-to-sqlite3 # wait for mysql to be ready for connections -while ! mysqladmin ping --silent; do +while ! mysqladmin ping "--password=$2" --silent; do sleep 1 done diff --git a/docker/mysql_to_sqlite_filtered.sh b/docker/mysql_to_sqlite_filtered.sh index c7568508..97d2c0fc 100755 --- a/docker/mysql_to_sqlite_filtered.sh +++ b/docker/mysql_to_sqlite_filtered.sh @@ -10,9 +10,15 @@ fi # cat /global/cfs/cdirs/metatlas/mysql_user.txt TIMESTAMP="$(date "+%Y-%m-%d-%H-%M")" +DUMP_FILE="meta_atlas_all_dbs-${TIMESTAMP}.sql" +ssh cori.nersc.gov shifter --image=docker:mysql/mysql-server:5.7.14 mysqldump \ + -h nerscdb04.nersc.gov -u meta_atlas_admin --all-databases \ + --set-gtid-purged=OFF "--result-file=$DUMP_FILE" mkdir -p dumps -docker run -it --rm -v "$(pwd)/dumps:/dumps" mysql:5.7 \ - /usr/bin/mysqldump -h nerscdb04.nersc.gov -u meta_atlas_admin -p --all-databases "--result-file=/dumps/meta_atlas_all_dbs-${TIMESTAMP}.sql" -MYSQL_ID="$(docker run -it --rm -e "MYSQL_ROOT_PASSWORD=$2"-v "$(pwd)/dumps:/docker-entrypoint-initdb.d" -v "$(pwd):/script" mysql:5.7)" +rm -rf dump/* +scp "dtn01.nersc.gov:$DUMP_FILE" "dumps/$DUMP_FILE" +MYSQL_ID="$(docker run --rm -e "MYSQL_ROOT_PASSWORD=$2" -v "$(pwd)/dumps:/docker-entrypoint-initdb.d" -v "$(pwd):/script" mysql:5.7)" docker exec -it "$MYSQL_ID" /script/internal_extract.sh "$1" "$2" docker stop "$MYSQL_ID" +SQLITE=$(ls -lt1 "dumps/*.sqlite3" | head -1) +scp -C "dumps/$SQLITE" dtn01.nersc.gov:/global/cfs/cdirs/m2650/www/metatlas/test_data/ diff --git a/docker/rt_predict_test_case_from_db.sql b/docker/rt_predict_test_case_from_db.sql index 5882189a..fb4cc153 100644 --- a/docker/rt_predict_test_case_from_db.sql +++ b/docker/rt_predict_test_case_from_db.sql @@ -46,7 +46,23 @@ DELETE FROM functionalsets; DELETE FROM fragmentationreferences_mz_intensities; DELETE FROM compoundidentifications_frag_references; DELETE FROM fragmentationreferences; -DELETE FROM lcmsruns; + +DELETE l +FROM lcmsruns AS l +LEFT JOIN ( + SELECT unique_id + FROM lcmsruns AS l1 + JOIN ( + SELECT MAX(creation_time) AS ctime, hdf5_file + FROM lcmsruns + WHERE (name LIKE '20201106\_JGI-AK\_PS-KM\_505892\_OakGall\_final\_QE-HF\_HILICZ\_USHXG01583\_POS\_MSMS\_0\_QC\_P%') + GROUP BY hdf5_file + ) AS early + ON l1.creation_time=early.ctime AND l1.hdf5_file=early.hdf5_file +) AS j +ON l.unique_id=j.unique_id +WHERE j.unique_id is NULL; + -- delete atlases that are not the most recent version of the atlases listed DELETE FROM atlases From 85546c9e3eb924e66c1f2fa537d76280ad5b0886 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 6 Aug 2021 20:44:50 -0700 Subject: [PATCH 068/177] fixes to data invalidation handling --- metatlas/datastructures/metatlas_dataset.py | 34 +++++++++---- metatlas/tools/predict_rt.py | 12 +++-- tests/system/test_targeted.py | 2 +- tests/unit/test_metatlas_dataset.py | 56 +++++++++++++++++++-- 4 files changed, 85 insertions(+), 19 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 8b77821a..511a3506 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -33,7 +33,7 @@ class AnalysisIdentifiers(HasTraits): """Names used in generating an analysis""" - source_atlas = Unicode(allow_none=True) + source_atlas = Unicode(allow_none=True, default_value=None) experiment = Unicode() output_type = Unicode() polarity = Unicode(default_value="positive") @@ -238,7 +238,16 @@ def groups(self): self._groups = dp.filter_empty_metatlas_objects(out, 
"items") return self._groups - @observe("all_groups") + @observe("polarity") + def _observe_polarity(self, signal): + if signal.type == "change": + if signal.new == "positive": + self.exclude_groups.append("NEG") + elif signal.new == "negative": + self.exclude_groups.append("POS") + logger.debug("Change to polarity invalidates exclude_groups") + + @observe("_all_groups") def _observe_all_groups(self, signal): if signal.type == "change": self._groups = None @@ -268,6 +277,12 @@ def _observe_exclude_files(self, signal): self._lcmsruns = None logger.debug("Change to exclude_files invalidates lcmsruns") + @observe("_lcmsruns") + def _observe_lcmsruns(self, signal): + if signal.type == "change": + self._all_groups = None + logger.debug("Change to lcmsruns invalidates all_groups") + @property def existing_groups(self): """Get your own groups that are prefixed by self.experiment""" @@ -300,8 +315,8 @@ def all_groups(self): """Returns a list of Group objects""" if self._all_groups is not None: return self._all_groups - self._all_groups = [] unique_groups = self.all_groups_dataframe[["group", "short_name"]].drop_duplicates() + self._all_groups = [] for values in unique_groups.to_dict("index").values(): self._all_groups.append( metob.Group( @@ -638,6 +653,12 @@ def _observe_atlas(self, signal): self._data = None logger.debug("Change to atlas invalidates atlas_df, data") + @observe("_atlas_df") + def _observe_atlas_df(self, signal): + if signal.type == "change": + self._data = None + logger.debug("Change to atlas_df invalidates data") + @property def polarity(self): """ @@ -708,13 +729,6 @@ def __len__(self): """len is from data""" return len(self.data) - def set_data(self, ids, value): - """update a value within self._data""" - if self._data is None: - self._build() - self._atlas_df = None - _set_nested(self._data, ids, value) - @property def rts(self): """ diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index 9272bcc8..07ac86cb 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -112,7 +112,7 @@ def generate_rt_correction_models( are pre-populated into the generated notebooks """ # pylint: disable=too-many-locals - metatlas_dataset = mads.MetatlasDataset(ids, save_metadata=False) + metatlas_dataset = mads.MetatlasDataset(ids=ids, save_metadata=False) groups = get_groups(metatlas_dataset, ids.include_groups) files_df = get_files_df(groups) qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids) @@ -144,7 +144,7 @@ def get_groups(metatlas_dataset, include_groups): include_groups: group will only be used in correction if their name has a substring match to this list of strings """ - metatlas_dataset.store_groups(exist_ok=True) + metatlas_dataset.ids.store_all_groups(exist_ok=True) ids = metatlas_dataset.ids groups = dp.select_groups_for_analysis( name=f"{ids.experiment}_{ids.short_polarity}_%{ids.analysis}_%", @@ -498,5 +498,11 @@ def get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_n polarity: defaults to 'positive', set to 'negative' if you only have neg mode data Returns an AnalysisIds instance """ - ids = mads.AnalysisIdentifiers(None, experiment, "data_QC", polarity, analysis_number, project_directory) + ids = mads.AnalysisIdentifiers( + experiment=experiment, + output_type="data_QC", + polarity=polarity, + analysis_number=analysis_number, + project_directory=project_directory, + ) return ids diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index d95f5199..cc9045a2 100644 --- 
a/tests/system/test_targeted.py
+++ b/tests/system/test_targeted.py
@@ -5,7 +5,7 @@
 
 
 def test_targeted_by_line01_with_remove(tmp_path):
-    image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.2.0"
+    image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.0"
     experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583"
     out_files = {}
     expected = {}
diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py
index b61c2e1b..3bbdf413 100644
--- a/tests/unit/test_metatlas_dataset.py
+++ b/tests/unit/test_metatlas_dataset.py
@@ -4,6 +4,7 @@
 import datetime
 import glob
 import logging
+import os
 
 import pandas as pd
 import pytest
@@ -343,11 +344,6 @@ def test_set_msms_refs_loc_setter(metatlas_dataset, mocker, hits):
     assert metatlas_dataset.msms_refs_loc == "/tmp/some_file.tab"
 
 
-def test_set_data01(metatlas_dataset):
-    metatlas_dataset.set_data([0, 0, "identification", "ms2_notes"], "extact match")
-    assert metatlas_dataset[0][0]["identification"].ms2_notes == "extact match"
-
-
 def test_store_atlas01(atlas, sqlite, username):
     atlas.name = "test_store_atlas01"
     atlas_list = metob.retrieve("Atlas", name=atlas.name, username=username)
@@ -644,3 +640,53 @@ def test_load_atlas02(atlas, sqlite_with_atlas, username):
 def test_load_atlas03(sqlite_with_atlas, atlas, username):
     results = metob.retrieve("Atlas", name=atlas.name, username=username)
     assert results[0].compound_identifications[0].rt_references[0].rt_peak == 2.1964640053707174
+
+
+def test_invalidation01(analysis_ids):
+    _ = analysis_ids.groups
+    assert analysis_ids._groups is not None
+    analysis_ids.exclude_files = ['Cone-S1']
+    assert analysis_ids._groups is None
+
+
+def test_negative_polarity01(sqlite_with_atlas, username, lcmsrun, mocker, groups_controlled_vocab):
+    mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun])
+    ids = mads.AnalysisIdentifiers(
+        experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583",
+        output_type="FinalEMA-HILIC",
+        polarity="negative",
+        analysis_number=0,
+        project_directory=str(os.getcwd()),
+        groups_controlled_vocab=groups_controlled_vocab,
+    )
+    assert 'POS' in ids.exclude_groups
+
+
+def test_include_groups01(sqlite_with_atlas, username, lcmsrun, mocker, groups_controlled_vocab):
+    mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=[lcmsrun])
+    ids = mads.AnalysisIdentifiers(
+        experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583",
+        output_type="data_QC",
+        polarity="negative",
+        analysis_number=0,
+        project_directory=str(os.getcwd()),
+        groups_controlled_vocab=groups_controlled_vocab,
+    )
+    assert 'QC' in ids.include_groups
+
+
+def test_project01(analysis_ids):
+    assert analysis_ids.project == '505892'
+
+
+def test_exclude_files01(analysis_ids):
+    analysis_ids.exclude_files = ['POS']
+    assert len(analysis_ids.lcmsruns) == 0
+    assert analysis_ids.lcmsruns_short_names.empty
+
+
+def test_invlidate_groups_controlled_vocab01(analysis_ids):
+    _ = analysis_ids.lcmsruns
+    assert analysis_ids._lcmsruns is not None
+    analysis_ids.groups_controlled_vocab = ['FOOBAR']
+    assert analysis_ids._lcmsruns is None
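These new tests exercise the invalidation chain this patch wires up with traitlets observers: each lazily computed attribute is cached in an underscore-prefixed field, and an @observe handler on every upstream trait clears the downstream cache so the next access recomputes it (exclude_files invalidates lcmsruns, lcmsruns invalidates all_groups, and so on). A minimal, self-contained sketch of the same pattern -- a toy class, not the real AnalysisIdentifiers:

    from traitlets import HasTraits, Int, observe

    class Cached(HasTraits):
        """Toy invalidation chain: changing 'source' clears the cached 'derived' value."""

        source = Int(default_value=1)

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self._derived = None  # lazily populated cache

        @property
        def derived(self):
            if self._derived is None:
                self._derived = self.source * 2  # stand-in for an expensive DB query
            return self._derived

        @observe("source")
        def _observe_source(self, signal):
            if signal.type == "change":
                self._derived = None  # invalidate; next access recomputes

    cache = Cached()
    assert cache.derived == 2
    cache.source = 5  # fires _observe_source and clears the cache
    assert cache.derived == 10

One wrinkle visible in the diff above: the handler must observe the trait name that is actually assigned to (hence the change from observing "all_groups" to "_all_groups"), otherwise the cache is never cleared.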
From df2ac73023d536ecc23aa535465e979646f43e13 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Fri, 6 Aug 2021 20:53:25 -0700
Subject: [PATCH 069/177] permissions change to script

---
 papermill/launch_rt_prediction.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 papermill/launch_rt_prediction.sh

diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh
old mode 100644
new mode 100755

From 601faaea538d5c77973a0e745765b9466da6fd6a Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Sat, 7 Aug 2021 23:18:24 -0700
Subject: [PATCH 070/177] fix missing path in launch_rt_prediction.sh

---
 papermill/launch_rt_prediction.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh
index 0eb1dc60..cd127e98 100755
--- a/papermill/launch_rt_prediction.sh
+++ b/papermill/launch_rt_prediction.sh
@@ -19,7 +19,7 @@ PROPOSAL="${TOKENS[0]}"
 mkdir -p "$EXP_DIR"
 
 export IN_FILE="${REPO_DIR}/notebooks/reference/RT_Prediction.ipynb"
-export OUT_FILE="$/503256_RT_Prediction_papermill_12.ipynb"
+export OUT_FILE="$(pwd)/503256_RT_Prediction_papermill_12.ipynb"
 export PARAMETERS="-p experiment $EXP -p metatlas_repo_path $REPO_DIR -p project_directory $PROJECT_DIR -p max_cpus 32 -p analysis_number $ANALYSIS_NUM"
 
 sbatch -J "${PROPOSAL}_RT_Pred" "${REPO_DIR}/papermill/slurm_template.sh"

From 402e68da54c11f471df15dbed6079b775e172b3d Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Sun, 8 Aug 2021 21:19:27 -0700
Subject: [PATCH 071/177] fix papermill output directory

---
 papermill/launch_rt_prediction.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh
index cd127e98..95b35ee5 100755
--- a/papermill/launch_rt_prediction.sh
+++ b/papermill/launch_rt_prediction.sh
@@ -1,25 +1,26 @@
 #!/bin/bash
-
 set -euf -o pipefail
 
 if [ "$#" -ne 3 ]; then
     echo "Usage $: experiment_name analysis_number project_directory"
+    exit 1
 fi
 
 EXP="$1"
 ANALYSIS_NUM="$2"
 PROJECT_DIR="$3"
+
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 REPO_DIR="$(dirname "$SCRIPT_DIR")"
 EXP_DIR="${PROJECT_DIR}/$EXP"
+ANALYSIS_DIR="${EXP_DIR}/${USER}${ANALYSIS_NUM}"
 
 IFS='_' read -ra TOKENS <<< "$EXP"
 PROPOSAL="${TOKENS[0]}"
 
-mkdir -p "$EXP_DIR"
-
 export IN_FILE="${REPO_DIR}/notebooks/reference/RT_Prediction.ipynb"
-export OUT_FILE="$(pwd)/503256_RT_Prediction_papermill_12.ipynb"
+export OUT_FILE="${ANALYSIS_DIR}/${PROPOSAL}_RT_Prediction_papermill.ipynb"
 export PARAMETERS="-p experiment $EXP -p metatlas_repo_path $REPO_DIR -p project_directory $PROJECT_DIR -p max_cpus 32 -p analysis_number $ANALYSIS_NUM"
 
+mkdir -p "$ANALYSIS_DIR"
 sbatch -J "${PROPOSAL}_RT_Pred" "${REPO_DIR}/papermill/slurm_template.sh"

From 3efd072dca068c7e79ea9251ba7413e070b021e5 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Tue, 10 Aug 2021 11:38:26 -0700
Subject: [PATCH 072/177] Notebook error handling improvements and formatting

---
 notebooks/reference/RT_Prediction.ipynb | 17 ++++++++---------
 notebooks/reference/Targeted.ipynb      | 14 +++++++-------
 noxfile.py                              |  9 +++++----
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb
index 9383dc26..61447942 100644
--- a/notebooks/reference/RT_Prediction.ipynb
+++ b/notebooks/reference/RT_Prediction.ipynb
@@ -93,16 +93,15 @@
 "    )\n",
 "    raise ValueError(\"Invalid metatlas_repo_path parameter in Jupyter Notebook.\")\n",
 "try:\n",
-    "    from metatlas.tools import notebook  # noqa: E402\n",
-    "    from metatlas.tools import predict_rt  # noqa: E402\n",
+    "    from metatlas.tools import notebook, predict_rt  # noqa: E402\n",
 "except ModuleNotFoundError as err:\n",
-    "    logging.critical(\n",
-    "        (\n",
-    "            \"Could not find metatlas module at %s. \"\n",
-    "            \"In the Parameters block, please check the value of metatlas_repo_path.\"\n",
-    "        ),\n",
-    "        metatlas_repo_path,\n",
-    "    )\n",
+    "    if str(err) == \"No module named 'metatlas.tools'\":\n",
+    "        logging.critical(\n",
+    "            (\"Could not find metatlas module at %s. \" \"In the Parameters block, please check the value of metatlas_repo_path.\"),\n",
+    "            metatlas_repo_path,\n",
+    "        )\n",
+    "    else:\n",
+    "        logging.critical('Please check that the kernel is set to \"Metatlas Targeted\".')\n",
 "    raise ModuleNotFoundError from err\n",
 "notebook.setup(log_level)"
   ]
diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb
index d567aa03..2929e227 100644
--- a/notebooks/reference/Targeted.ipynb
+++ b/notebooks/reference/Targeted.ipynb
@@ -131,13 +131,13 @@
 "\n",
 "    environment.validate_kernel()\n",
 "except ModuleNotFoundError as err:\n",
-    "    logging.critical(\n",
-    "        (\n",
-    "            \"Could not find metatlas repo at %s. \"\n",
-    "            \"In the Parameters block, please check the value of metatlas_repo_path.\"\n",
-    "        ),\n",
-    "        metatlas_repo_path,\n",
-    "    )\n",
+    "    if str(err) == \"No module named 'metatlas.tools'\":\n",
+    "        logging.critical(\n",
+    "            (\"Could not find metatlas module at %s. \" \"In the Parameters block, please check the value of metatlas_repo_path.\"),\n",
+    "            metatlas_repo_path,\n",
+    "        )\n",
+    "    else:\n",
+    "        logging.critical('Please check that the kernel is set to \"Metatlas Targeted\".')\n",
 "    raise ModuleNotFoundError from err\n",
 "except ImportError as err:\n",
 "    logging.critical(\"A newer version of metatlas_repo is required to use this notebook.\")\n",
diff --git a/noxfile.py b/noxfile.py
index 9eef89ce..019624f3 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -21,6 +21,7 @@
 # has not yet been updated to pass all checks.
 more_checks = [
     "metatlas/io/targeted_output.py",
+    "metatlas/io/rclone.py",
     "metatlas/io/write_utils.py",
     "metatlas/datastructures/metatlas_dataset.py",
     "metatlas/tools/environment.py",
@@ -76,7 +77,7 @@
 nox.options.error_on_external_run = True
 
 REUSE_LARGE_VENV = True
-
+NB_LINE_LEN = 140
 
 @nox.session(python=py_versions[0])
 def flake8_diff(session):
@@ -127,7 +128,7 @@ def pylint_nb(session):
     # dupliate code cannot be disabled on per-cell level https://github.com/PyCQA/pylint/issues/214
     # Some duplicate code is required to setup the notebook and do error handling.
    # So turn off duplicate code for whole session -- not ideal. 
- session.run("nbqa", "pylint", "--disable=duplicate-code", *notebooks) + session.run("nbqa", "pylint", "--disable=duplicate-code", f"--max-line-length={NB_LINE_LEN}", *notebooks) @nox.session(python=py_versions[0]) @@ -139,14 +140,14 @@ def flake8_nb(session): @nox.session(python=py_versions[0]) def black_nb(session): session.install("black", *nbqa_deps) - session.run("nbqa", "black", "--check", *notebooks) + session.run("nbqa", "black", f"--line-length={NB_LINE_LEN}", "--check", *notebooks) @nox.session(python=py_versions[0]) def blacken_nb(session): """this modifies notebook files to meet black's requirements""" session.install("black", *nbqa_deps) - session.run("nbqa", "black", "--nbqa-mutate", *notebooks) + session.run("nbqa", "black", f"--line-length={NB_LINE_LEN}", "--nbqa-mutate", *notebooks) @nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV) From 9d1b439ae8101ccefb7ee7808623733b65c4464f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 10 Aug 2021 11:40:01 -0700 Subject: [PATCH 073/177] ignore slurm log files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a0036ecd..8f2972ea 100644 --- a/.gitignore +++ b/.gitignore @@ -95,3 +95,6 @@ scratch/ # pyenv .python-version + +# slurm +slurm-*.out From 8eb212889026e19b2f34847304a32c5440fc665c Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 10 Aug 2021 13:29:29 -0700 Subject: [PATCH 074/177] update RT sliders on 'm' key press event --- metatlas/plots/dill2plots.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 0647f70d..916b292a 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -676,7 +676,9 @@ def match_rts(self): """Sets RT min and max to match similar compound referenced by match_idx""" source = self.similar_compounds[self.match_idx]['rt'] self.update_rt('rt_min', source.rt_min) + self.rt_min_slider.set_val(source.rt_min) self.update_rt('rt_max', source.rt_max) + self.rt_max_slider.set_val(source.rt_max) def update_y_scale(self, val): if self.slider_y_min < 0: From 16e09617c42df5c37c4d035e96a9236886b5d492 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 10 Aug 2021 14:31:50 -0700 Subject: [PATCH 075/177] Add copying of output to Google Drive via rclone --- metatlas/datastructures/metatlas_dataset.py | 14 ++--- metatlas/io/rclone.py | 58 +++++++++++++++++++++ metatlas/io/targeted_output.py | 48 ++++++++++++++++- metatlas/tools/predict_rt.py | 42 ++++++++------- tests/unit/test_metatlas_dataset.py | 12 ++--- tests/unit/test_predict_rt.py | 2 + 6 files changed, 140 insertions(+), 36 deletions(-) create mode 100644 metatlas/io/rclone.py create mode 100644 tests/unit/test_predict_rt.py diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 511a3506..aef79dcf 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -5,7 +5,6 @@ import logging import os import shutil -import tarfile import humanize import pandas as pd @@ -471,7 +470,7 @@ def _build(self): self.extra_mz, ) ) - logger.info("Reading MSMS data from h5 files") + logger.info("Generating MetatlasDataset by reading MSMS data from h5 files") samples = parallel.parallel_process( ma_data.get_data_for_atlas_df_and_file, files, self.max_cpus, unit="sample", spread_args=False ) @@ -706,6 +705,8 @@ def _observe_msms_refs_loc(self, signal): @property def hits(self): """get msms hits DataFrame""" + _ = self.atlas_df # 
regenerate if needed before logging hits generation
+        _ = self.data  # regenerate if needed before logging hits generation
         if self._hits is None:
             logger.info(
                 "Generating hits with extra_time=%.3f, frag_mz_tolerance=%.4f, msms_refs_loc=%s.",
@@ -827,13 +828,8 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False):
         if msms_fragment_ions:
             targeted_output.write_msms_fragment_ions(self, overwrite)
         logger.info("Generation of output files completed sucessfully.")
-        logger.info("Generating archive of output files.")
-        output_path = os.path.join(
-            self.ids.project_directory, self.ids.experiment, f"{self.ids.short_experiment_analysis}.tar.gz"
-        )
-        with tarfile.open(output_path, "w:gz") as tar:
-            tar.add(self.ids.output_dir, arcname=os.path.basename(self.ids.output_dir))
-        logger.info("Generation of archive completed succesfully: %s", output_path)
+        targeted_output.archive_outputs(self.ids)
+        targeted_output.copy_outputs_to_google_drive(self.ids)
diff --git a/metatlas/io/rclone.py b/metatlas/io/rclone.py
new file mode 100644
index 00000000..7707a313
--- /dev/null
+++ b/metatlas/io/rclone.py
@@ -0,0 +1,58 @@
+""" copy files to Google Drive using rclone """
+
+import configparser
+import logging
+import subprocess
+
+
+logger = logging.getLogger(__name__)
+
+
+class RClone:
+    """Access to Google Drive"""
+
+    def __init__(self, rclone_path):
+        self.rclone_path = rclone_path
+
+    def config_file(self):
+        """Returns path to config file or None"""
+        try:
+            result = subprocess.check_output(f"{self.rclone_path} config file", text=True)
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            return None
+        return result.split('\n')[1]
+
+    def get_name_for_id(self, identifier):
+        """
+        Inputs:
+            identifier: unique folder identifier from Google Drive URL
+        if identifier is in the config file, then return the name assigned to the identifier
+        otherwise return None
+        """
+        ini_file = self.config_file()
+        if ini_file is None:
+            return None
+        config = configparser.ConfigParser()
+        config.read(ini_file)
+        for name in config.sections():
+            props = config[name]
+            if "type" in props and props["type"] == "drive":
+                if "root_folder_id" in props and props["root_folder_id"] == identifier:
+                    return name
+        return None
+
+    def copy_to_drive(self, source, drive, dest_path=None):
+        """
+        Inputs:
+            source: file or directory to copy to drive
+            drive: name in the RClone configuration for a location in Google Drive
+            dest_path: location under drive to copy to, will create folders if needed
+        """
+        dest = f"{drive}:" if dest_path is None else f"{drive}:{dest_path}"
+        try:
+            subprocess.check_output(f"{self.rclone_path} copy {source} {dest}", text=True)
+        except subprocess.CalledProcessError as err:
+            logger.exception(err)
+            raise err
+        except FileNotFoundError:
+            logger.info('rclone not found. Skipping transfer to Google Drive')
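A caveat on this new module: subprocess.check_output is given a single space-joined string without shell=True, so the operating system looks for an executable literally named "<rclone_path> config file". Every call therefore lands in the FileNotFoundError handlers and the transfer quietly does nothing. Passing an argv list avoids that and also keeps paths containing spaces intact. A minimal sketch of the intended call shape (the rclone path and remote name below are placeholders, not canonical values):

    import subprocess

    rclone_path = "/usr/bin/rclone"  # placeholder; any rclone binary on the host
    # argv list form: no shell parsing, each argument passed through verbatim
    out = subprocess.check_output([rclone_path, "config", "file"], text=True)
    config_path = out.split("\n")[1]  # mirrors the split("\n")[1] parsing used above
    # recursive copy of a local directory into a configured Drive remote
    subprocess.check_output([rclone_path, "copy", "out/results", "mydrive:project/uploads"], text=True)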
diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py
index 99fea613..2d87714e 100644
--- a/metatlas/io/targeted_output.py
+++ b/metatlas/io/targeted_output.py
@@ -3,18 +3,22 @@
 
 import logging
 import os
+import tarfile
 
 from collections import namedtuple
 
 import numpy as np
 import pandas as pd
 
+from metatlas.io import rclone
 from metatlas.io import write_utils
 from metatlas.plots import dill2plots as dp
 from metatlas.tools import fastanalysis as fa
 
 logger = logging.getLogger(__name__)
 
+RCLONE_PATH = "/global/cfs/cdirs/m342/USA/shared-repos/rclone/bin/rclone"
+
 
 def write_atlas_to_spreadsheet(metatlas_dataset, overwrite=False):
     """Save atlas as csv file. Will not overwrite existing file unless overwrite is True"""
@@ -162,7 +166,14 @@ def write_metrics_and_boxplots(metatlas_dataset, overwrite=False, max_cpus=1):
                 metatlas_dataset.ids.output_dir,
                 f"{prefix}boxplot_{fields['name']}",
             )
-            dp.make_boxplot_plots(dataframe, plot_dir, fields["label"], overwrite, max_cpus)
+            dp.make_boxplot_plots(
+                dataframe,
+                output_loc=plot_dir,
+                use_shortnames=True,
+                ylabel=fields["label"],
+                overwrite=overwrite,
+                max_cpus=max_cpus,
+            )
 
 
 Max = namedtuple("Max", ["file_idx", "pre_intensity_idx", "pre_intensity", "precursor_mz"])
@@ -276,3 +287,38 @@ def get_spectra(data, max_pre_intensity, min_mz, max_mz, intensity_fraction, sca
         intensity = (intensity / max_msms_intensity * scale_intensity).astype(int)
         return msms_mz[keep_idx], intensity[keep_idx]
     return None, None
+
+
+def archive_outputs(ids):
+    """
+    Creates a .tar.gz file containing all output files
+    Inputs:
+        ids: an AnalysisIds object
+    """
+    logger.info("Generating archive of output files.")
+    output_file = f"{ids.short_experiment_analysis}.tar.gz"
+    output_path = os.path.join(ids.project_directory, ids.experiment, output_file)
+    with tarfile.open(output_path, "w:gz") as tar:
+        tar.add(ids.output_dir, arcname=os.path.basename(ids.output_dir))
+    logger.info("Generation of archive completed successfully: %s", output_path)
+
+
+def copy_outputs_to_google_drive(ids):
+    """
+    Recursively copy the output files to Google Drive using rclone
+    Inputs:
+        ids: an AnalysisIds object
+    """
+    logger.info("Copying output files to Google Drive")
+    rci = rclone.RClone(RCLONE_PATH)
+    fail_suffix = "not copying files to Google Drive"
+    if rci.config_file() is None:
+        logger.warning("RClone config file not found -- %s.", fail_suffix)
+        return
+    drive = rci.get_name_for_id(ids.google_folder)
+    if drive is None:
+        logger.warning("RClone config file missing JGI_Metabolomics_Projects -- %s.", fail_suffix)
+        return
+    sub_folder = os.path.join(ids.experiment, ids.analysis, ids.output_type)
+    rci.copy_to_drive(ids.output_dir, drive, sub_folder)
+    logger.info("Done copying output files to Google Drive")
diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index 07ac86cb..6f9e02ab 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -21,6 +21,7 @@
 from metatlas.datastructures import metatlas_dataset as mads
 from metatlas.datastructures import metatlas_objects as metob
 from metatlas.io import metatlas_get_data_helper_fun as ma_data
+from metatlas.io import targeted_output
 from metatlas.io import write_utils
 from metatlas.plots import dill2plots as dp
 from metatlas.tools import notebook
@@ -99,11 +100,6 @@ def generate_rt_correction_models(
     Generate the RT correction models and associated atlases with adjusted RT values
     inputs:
        ids: an AnalysisIds object matching the one used in the 
main notebook - groups_controlled_vocab: list of strings that will group together when creating groups - application of groups_controlled_vocab is case insensitive - exclude_files: list of strings that will exclude files if they are substrings of the file name - include_groups: group will only be used in correction if their name has a substring match - to this list of strings cpus: max number of cpus to use repo_dir: location of metatlas git repo on local filesystem save_to_db: If True, save the new atlases to the database @@ -113,7 +109,7 @@ def generate_rt_correction_models( """ # pylint: disable=too-many-locals metatlas_dataset = mads.MetatlasDataset(ids=ids, save_metadata=False) - groups = get_groups(metatlas_dataset, ids.include_groups) + groups = get_groups(metatlas_dataset) files_df = get_files_df(groups) qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids) metatlas_dataset = load_runs(files_df, qc_atlas_df, qc_atlas, cpus) @@ -133,10 +129,12 @@ def generate_rt_correction_models( write_models(models_file_name, linear, poly, groups, qc_atlas) atlases = create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) write_notebooks(ids, atlases, repo_dir, use_poly_model) + targeted_output.copy_outputs_to_google_drive(ids) + targeted_output.archive_outputs(ids) logger.info("RT correction notebook complete. Switch to Targeted notebook to continue.") -def get_groups(metatlas_dataset, include_groups): +def get_groups(metatlas_dataset): """ Create all experiment groups if they don't already exist and return the subset matching include_list inputs: @@ -144,18 +142,8 @@ def get_groups(metatlas_dataset, include_groups): include_groups: group will only be used in correction if their name has a substring match to this list of strings """ - metatlas_dataset.ids.store_all_groups(exist_ok=True) - ids = metatlas_dataset.ids - groups = dp.select_groups_for_analysis( - name=f"{ids.experiment}_{ids.short_polarity}_%{ids.analysis}_%", - most_recent=True, - remove_empty=True, - include_list=include_groups, - exclude_list=ids.short_polarity_inverse, - do_print=False, - ) - ordered_groups = sorted(groups, key=lambda x: x.name) - _ = [logger.info("Selected group: %s, %s", grp.name, grp.last_modified) for grp in groups] + ordered_groups = sorted(metatlas_dataset.ids.groups, key=lambda x: x.name) + _ = [logger.info("Selected group: %s, %s", grp.name, grp.last_modified) for grp in ordered_groups] return ordered_groups @@ -216,7 +204,21 @@ def get_rts(metatlas_dataset, include_atlas_rt_peak=True): rts_df["atlas RT peak"] = [ compound["identification"].rt_references[0].rt_peak for compound in metatlas_dataset[0] ] - return rts_df + return order_df_columns_by_run(rts_df) + + +def order_df_columns_by_run(dataframe): + """ + Returns a dataframe with re-ordered columns such that second column up to column 'mean' + are ordered by run number from low to high + """ + cols = dataframe.columns.tolist() + stats_idx = cols.index("mean") + to_sort = cols[1:stats_idx] + no_sort = cols[stats_idx:] + to_sort.sort(key=lambda x: int(x.split(".")[0].split("_Run")[1])) + new_cols = [cols[0]] + to_sort + no_sort + return dataframe[new_cols] def plot_compound_atlas_rts(num_files, rts_df, file_name, fontsize=2, pad=0.1, cols=8): diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 3bbdf413..1fb9eb64 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -645,7 +645,7 @@ def test_load_atlas03(sqlite_with_atlas, atlas, username): def 
test_invalidation01(analysis_ids): _ = analysis_ids.groups assert analysis_ids._groups is not None - analysis_ids.exclude_files = ['Cone-S1'] + analysis_ids.exclude_files = ["Cone-S1"] assert analysis_ids._groups is None @@ -659,7 +659,7 @@ def test_negative_polarity01(sqlite_with_atlas, username, lcmsrun, mocker, group project_directory=str(os.getcwd()), groups_controlled_vocab=groups_controlled_vocab, ) - assert 'POS' in ids.exclude_groups + assert "POS" in ids.exclude_groups def test_include_groups01(sqlite_with_atlas, username, lcmsrun, mocker, groups_controlled_vocab): @@ -672,15 +672,15 @@ def test_include_groups01(sqlite_with_atlas, username, lcmsrun, mocker, groups_c project_directory=str(os.getcwd()), groups_controlled_vocab=groups_controlled_vocab, ) - assert 'QC' in ids.include_groups + assert "QC" in ids.include_groups def test_project01(analysis_ids): - assert analysis_ids.project == '505892' + assert analysis_ids.project == "505892" def test_exclude_files01(analysis_ids): - analysis_ids.exclude_files = ['POS'] + analysis_ids.exclude_files = ["POS"] assert len(analysis_ids.lcmsruns) == 0 assert analysis_ids.lcmsruns_short_names.empty @@ -688,5 +688,5 @@ def test_exclude_files01(analysis_ids): def test_invlidate_groups_controlled_vocab01(analysis_ids): _ = analysis_ids.lcmsruns assert analysis_ids._lcmsruns is not None - analysis_ids.groups_controlled_vocab = ['FOOBAR'] + analysis_ids.groups_controlled_vocab = ["FOOBAR"] assert analysis_ids._lcmsruns is None diff --git a/tests/unit/test_predict_rt.py b/tests/unit/test_predict_rt.py new file mode 100644 index 00000000..2a3affe1 --- /dev/null +++ b/tests/unit/test_predict_rt.py @@ -0,0 +1,2 @@ +""" unit testing of predict_rt functions """ +# pylint: disable=missing-function-docstring From 94e0c9c773ccc2935308954b58c5fde996fba9dc Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 10 Aug 2021 16:18:09 -0700 Subject: [PATCH 076/177] allow rt_predict to skip atlas generation --- metatlas/tools/predict_rt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index 6f9e02ab..fc8aa333 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -95,6 +95,7 @@ def generate_rt_correction_models( repo_dir, save_to_db=True, use_poly_model=True, + model_only=False ): """ Generate the RT correction models and associated atlases with adjusted RT values @@ -106,6 +107,7 @@ def generate_rt_correction_models( use_poly_model: If True, use the polynomial model, else use linear model Both types of models are always generated, this only determines which ones are pre-populated into the generated notebooks + model_only: If True, do not create atlases or notebooks, if False create them """ # pylint: disable=too-many-locals metatlas_dataset = mads.MetatlasDataset(ids=ids, save_metadata=False) @@ -127,8 +129,9 @@ def generate_rt_correction_models( save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, rt_comparison_file_name) models_file_name = os.path.join(ids.output_dir, "rt_model.txt") write_models(models_file_name, linear, poly, groups, qc_atlas) - atlases = create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) - write_notebooks(ids, atlases, repo_dir, use_poly_model) + if not model_only: + atlases = create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) + write_notebooks(ids, atlases, repo_dir, use_poly_model) targeted_output.copy_outputs_to_google_drive(ids) targeted_output.archive_outputs(ids) 
logger.info("RT correction notebook complete. Switch to Targeted notebook to continue.")

From 23f9cce38f1aa4a534d4aa596de26fc15fdafba0 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 11 Aug 2021 08:56:28 -0700
Subject: [PATCH 077/177] In RT predict include_groups 'QC' -> '_QC_'

---
 notebooks/reference/RT_Prediction.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb
index 61447942..812da8eb 100644
--- a/notebooks/reference/RT_Prediction.ipynb
+++ b/notebooks/reference/RT_Prediction.ipynb
@@ -30,7 +30,7 @@
 "experiment = \"REPLACE ME\"\n",
 "\n",
 "# group will only be used in RT prediction if their name has a substring match to this list of strings\n",
-"include_groups = [\"QC\"]\n",
+"include_groups = [\"_QC_\"]\n",
 "\n",
 "# Exclude files with names containing any of the substrings in this list. Eg., ['peas', 'beans']\n",
 "exclude_files = []\n",

From 1ce95abda42665f24ccf408df810a4530d684c96 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 11 Aug 2021 09:29:30 -0700
Subject: [PATCH 078/177] Fix outputs of RT Prediction

---
 metatlas/tools/predict_rt.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index fc8aa333..f1623077 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -216,11 +216,11 @@ def order_df_columns_by_run(dataframe):
     are ordered by run number from low to high
     """
     cols = dataframe.columns.tolist()
-    stats_idx = cols.index("mean")
-    to_sort = cols[1:stats_idx]
-    no_sort = cols[stats_idx:]
+    stats_start_idx = cols.index("mean")
+    to_sort = cols[:stats_start_idx]
+    no_sort = cols[stats_start_idx:]
     to_sort.sort(key=lambda x: int(x.split(".")[0].split("_Run")[1]))
-    new_cols = [cols[0]] + to_sort + no_sort
+    new_cols = to_sort + no_sort
     return dataframe[new_cols]
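A quick worked example of the corrected ordering, runnable on its own (the column names are made up but follow the ..._RunN.h5 shape of the rts_df columns, with the stats columns starting at "mean"): the previous slice cols[1:stats_idx] pinned the first run column outside the sort, so Run12 could end up ahead of Run7; sorting everything before "mean" fixes that.

    import pandas as pd

    cols = ["x_Run12.h5", "x_Run7.h5", "x_Run307.h5", "mean", "median", "#NaNs"]
    frame = pd.DataFrame([[1.0, 2.0, 3.0, 2.0, 2.0, 0]], columns=cols)

    stats_start_idx = cols.index("mean")
    to_sort = cols[:stats_start_idx]
    # numeric sort on the run number embedded in each file name
    to_sort.sort(key=lambda x: int(x.split(".")[0].split("_Run")[1]))
    print(frame[to_sort + cols[stats_start_idx:]].columns.tolist())
    # ['x_Run7.h5', 'x_Run12.h5', 'x_Run307.h5', 'mean', 'median', '#NaNs']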
From 7fb9dfafff1d0da4abd0e8fa96fa4da827b21ab8 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 11 Aug 2021 11:13:39 -0700
Subject: [PATCH 079/177] Updates to making ci02 container

---
 docker/Dockerfile.ci02                  |  9 ++++++++-
 docker/internal_extract.sh              | 19 +++++++++++++++++--
 docker/mysql_to_sqlite_filtered.sh      | 21 +++++++++++++++------
 docker/rt_predict_test_case_from_db.sql |  2 +-
 4 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/docker/Dockerfile.ci02 b/docker/Dockerfile.ci02
index 70c50e22..387b4a08 100644
--- a/docker/Dockerfile.ci02
+++ b/docker/Dockerfile.ci02
@@ -13,9 +13,16 @@ EXPOSE 8888
 RUN apt-get update && apt-get install -y libxrender1 && \
     rm -rf /var/lib/apt/lists/*
 
+ADD https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 /usr/local/bin/jq
+
+RUN chmod +x /usr/local/bin/jq
+
 RUN mkdir -p /io /src /work $REFS_DIR $H5_DIR
-ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5 $H5_DIR/
+ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5 $H5_DIR/
+ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5 $H5_DIR/
+ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run309.h5 $H5_DIR/
+ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run8.h5 $H5_DIR/
+ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5 $H5_DIR/
+ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5 $H5_DIR/
 
 COPY requirements.txt /requirements.txt
 RUN pip install --quiet -r requirements.txt
diff --git a/docker/internal_extract.sh b/docker/internal_extract.sh
index 95ab6d8e..e7bd28e4 100755
--- a/docker/internal_extract.sh
+++ b/docker/internal_extract.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 set -euf -o pipefail
+set -o xtrace
 
 if [ "$#" -ne 2 ]; then
     echo "Usage $0 script_file mysql_password"
@@ -12,13 +13,27 @@ apt-get update
 apt-get install -y python3 python3-pip
 pip3 install mysql-to-sqlite3
 
+while ! grep "mysqld: Shutdown complete" /var/log/mysql/error.log; do
+    echo 'still waiting for mysql server to finish loading data...'
+    sleep 5
+done
+
+sleep 30 # shouldn't be needed, but not working without it
+
 # wait for mysql to be ready for connections
 while ! mysqladmin ping "--password=$2" --silent; do
-    sleep 1
+    echo 'still waiting for mysql server to be ready...'
+    sleep 5
 done
 
 # reduce database contents using SQL script
-mysql "--password=$2" meta_atlas < "/script/$1"
+# this is a hack - we rerun the filter step until it works
+# this is because we don't have a good check for when the database is
+# truly ready. But this does eventually work.
+while ! mysql "--password=$2" meta_atlas < "/script/$1"; do
+    echo "Database filtering failed. Trying again..."
+    sleep 15
+done
 
 TIMESTAMP="$(date "+%Y-%m-%d-%H-%M")"
diff --git a/docker/mysql_to_sqlite_filtered.sh b/docker/mysql_to_sqlite_filtered.sh
index 97d2c0fc..95e4d07f 100755
--- a/docker/mysql_to_sqlite_filtered.sh
+++ b/docker/mysql_to_sqlite_filtered.sh
@@ -1,6 +1,13 @@
 #!/bin/bash
-
 set -euf -o pipefail
+set -o xtrace
+
+# dumps production DB on nerscdb04.nersc.gov
+# loads the dump into a local, dockerized mysql instance
+# transforms (filters) the database using script_file
+# exports the database to a sqlite3 file
+# uploads the sqlite3 file to a web accessible directory
+
 
 if [ "$#" -ne 2 ]; then
     echo "Usage $0 script_file mysql_password"
@@ -9,16 +16,18 @@ fi
 
 # cat /global/cfs/cdirs/metatlas/mysql_user.txt
 
+rm -rf dumps
+mkdir -p dumps
+
 TIMESTAMP="$(date "+%Y-%m-%d-%H-%M")"
 DUMP_FILE="meta_atlas_all_dbs-${TIMESTAMP}.sql"
 ssh cori.nersc.gov shifter --image=docker:mysql/mysql-server:5.7.14 mysqldump \
     -h nerscdb04.nersc.gov -u meta_atlas_admin --all-databases \
     --set-gtid-purged=OFF "--result-file=$DUMP_FILE"
-mkdir -p dumps
-rm -rf dump/*
 scp "dtn01.nersc.gov:$DUMP_FILE" "dumps/$DUMP_FILE"
+ssh cori.nersc.gov rm "$DUMP_FILE"
-MYSQL_ID="$(docker run --rm -e "MYSQL_ROOT_PASSWORD=$2" -v "$(pwd)/dumps:/docker-entrypoint-initdb.d" -v "$(pwd):/script" mysql:5.7)"
+MYSQL_ID="$(docker run -d --rm -e "MYSQL_ROOT_PASSWORD=$2" -v "$(pwd)/dumps:/docker-entrypoint-initdb.d" -v "$(pwd):/script" mysql:5.7)"
 docker exec -it "$MYSQL_ID" /script/internal_extract.sh "$1" "$2"
 docker stop "$MYSQL_ID"
-SQLITE=$(ls -lt1 "dumps/*.sqlite3" | head -1)
-scp -C "dumps/$SQLITE" dtn01.nersc.gov:/global/cfs/cdirs/m2650/www/metatlas/test_data/
+SQLITE=$(find ~+/dumps -name '*.sqlite3' | head -1)
+scp -C "$SQLITE" dtn01.nersc.gov:/global/cfs/cdirs/m2650/www/metatlas/test_data/
diff --git a/docker/rt_predict_test_case_from_db.sql b/docker/rt_predict_test_case_from_db.sql
index fb4cc153..f13dfb0e 100644
--- a/docker/rt_predict_test_case_from_db.sql
+++ 
b/docker/rt_predict_test_case_from_db.sql @@ -55,7 +55,7 @@ LEFT JOIN ( JOIN ( SELECT MAX(creation_time) AS ctime, hdf5_file FROM lcmsruns - WHERE (name LIKE '20201106\_JGI-AK\_PS-KM\_505892\_OakGall\_final\_QE-HF\_HILICZ\_USHXG01583\_POS\_MSMS\_0\_QC\_P%') + WHERE (name LIKE '20201106\_JGI-AK\_PS-KM\_505892\_OakGall\_final\_QE-HF\_HILICZ\_USHXG01583\_%\_QC\_%') GROUP BY hdf5_file ) AS early ON l1.creation_time=early.ctime AND l1.hdf5_file=early.hdf5_file From 2e64d2e5faaace3b759b17d90fe71fd31dd70c1f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 11 Aug 2021 11:17:31 -0700 Subject: [PATCH 080/177] Add tests for predict_rts get_rts() --- metatlas/datastructures/metatlas_dataset.py | 6 +- tests/unit/conftest.py | 144 ++++++++++++++++++++ tests/unit/test_predict_rt.py | 34 +++++ 3 files changed, 183 insertions(+), 1 deletion(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index aef79dcf..0ff66e31 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -152,13 +152,17 @@ def lcmsruns(self): if self._lcmsruns is not None: return self._lcmsruns all_lcmsruns = dp.get_metatlas_files(experiment=self.experiment, name="%") - self._lcmsruns = [r for r in all_lcmsruns if not any(map(r.name.__contains__, self.exclude_files))] if len(self.exclude_files) > 0: + self._lcmsruns = [ + r for r in all_lcmsruns if not any(map(r.name.__contains__, self.exclude_files)) + ] logger.info( "Excluding %d LCMS runs containing any of: %s", len(all_lcmsruns) - len(self._lcmsruns), self.exclude_files, ) + else: + self._lcmsruns = all_lcmsruns for run in self._lcmsruns: logger.info("Run: %s", run.name) logger.info("Number of LCMS output files matching '%s' is: %d.", self.experiment, len(self._lcmsruns)) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index cf970e80..1f4978da 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -10,6 +10,8 @@ import sqlite3 import threading +from datetime import datetime + import pytest import numpy as np import pandas as pd @@ -22,6 +24,10 @@ logger = logging.getLogger(__name__) +def date_str_to_int(date_str): + return int(datetime.fromisoformat(date_str).timestamp()) + + @pytest.fixture(name="username", scope="session") def fixture_username(): return getpass.getuser() @@ -563,6 +569,17 @@ def fixture_metatlas_dataset_with_2_cids( return mads.MetatlasDataset(ids=analysis_ids_with_2_cids, save_metadata=False) +@pytest.fixture(name="metatlas_dataset_with_qc_runs") +def fixture_metatlas_dataset_with_qc_runs( + mocker, df_container, analysis_ids, lcmsrun, sqlite_with_atlas, qc_lcmsruns +): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=qc_lcmsruns) + return mads.MetatlasDataset(ids=analysis_ids, save_metadata=False) + + @pytest.fixture(name="eic") def fixture_eic(): return { @@ -1104,6 +1121,133 @@ def fixture_lcmsrun(username): return run +@pytest.fixture(name="qc_lcmsruns") +def fixture_qc_lcmsruns(username): + json = [ + { + "acquisition_time": 1604734158, + "creation_time": date_str_to_int("2020-11-13T16:05:46"), + "description": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 " + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.mzML", + "experiment": 
"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "hdf5_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5", + "head_id": "c0459a277f654fdeacf48243a34207b4", + "injection_volume": 0.0, + "injection_volume_units": "uL", + "last_modified": date_str_to_int("2021-02-16T19:40:27"), + "method": None, + "mzml_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.mzML", + "name": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.mzML", + "pass_qc": False, + "prev_uid": "origin", + "sample": None, + "unique_id": "c0459a277f654fdeacf48243a34207b4", + "username": username, + }, + { + "acquisition_time": 1605168081, + "creation_time": date_str_to_int("2020-11-13T15:57:27"), + "description": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 " + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run309.mzML", + "experiment": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "hdf5_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run309.h5", + "head_id": "9f33a0c1793e46fc9c70a19b587a0117", + "injection_volume": 0.0, + "injection_volume_units": "uL", + "last_modified": date_str_to_int("2021-02-16T19:39:25"), + "method": None, + "mzml_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run309.mzML", + "name": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run309.mzML", + "pass_qc": False, + "prev_uid": "origin", + "sample": None, + "unique_id": "9f33a0c1793e46fc9c70a19b587a0117", + "username": username, + }, + { + "acquisition_time": 1605166749, + "creation_time": date_str_to_int("2020-11-13T15:42:04"), + "description": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 " + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.mzML", + "experiment": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "hdf5_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5", + "head_id": "8c93ee10f2af4238ae905d86debc87ce", + "injection_volume": 0.0, + "injection_volume_units": "uL", + "last_modified": date_str_to_int("2021-02-16T19:40:27"), + "method": None, + "mzml_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.mzML", + "name": 
"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.mzML", + "pass_qc": False, + "prev_uid": "origin", + "sample": None, + "unique_id": "8c93ee10f2af4238ae905d86debc87ce", + "username": username, + }, + { + "acquisition_time": 1604735488, + "creation_time": date_str_to_int("2020-11-13T15:52:48"), + "description": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 " + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run8.mzML", + "experiment": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "hdf5_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run8.h5", + "head_id": "855e0081dbb2473c8970f40db129d8f7", + "injection_volume": 0.0, + "injection_volume_units": "uL", + "last_modified": date_str_to_int("2021-02-16T19:39:25"), + "method": None, + "mzml_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run8.mzML", + "name": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run8.mzML", + "pass_qc": False, + "prev_uid": "origin", + "sample": None, + "unique_id": "855e0081dbb2473c8970f40db129d8f7", + "username": username, + }, + { + "acquisition_time": 1605165417, + "creation_time": date_str_to_int("2020-11-13T16:03:25"), + "description": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 " + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.mzML", + "experiment": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "hdf5_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5", + "head_id": "58905ea702f44d9199be928bc46fdb20", + "injection_volume": 0.0, + "injection_volume_units": "uL", + "last_modified": date_str_to_int("2021-02-16T19:38:49"), + "method": None, + "mzml_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.mzML", + "name": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.mzML", + "pass_qc": False, + "prev_uid": "origin", + "sample": None, + "unique_id": "58905ea702f44d9199be928bc46fdb20", + "username": username, + }, + { + "acquisition_time": 1604732826, + "creation_time": date_str_to_int("2020-11-13T16:15:04"), + "description": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 " + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.mzML", + "experiment": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "hdf5_file": 
"/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5", + "head_id": "392b1a859ed54e07bc34b55e06459db2", + "injection_volume": 0.0, + "injection_volume_units": "uL", + "last_modified": date_str_to_int("2021-02-16T19:38:49"), + "method": None, + "mzml_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.mzML", + "name": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.mzML", + "pass_qc": False, + "prev_uid": "origin", + "sample": None, + "unique_id": "392b1a859ed54e07bc34b55e06459db2", + "username": username, + }, + ] + return [metob.LcmsRun(**run) for run in json] + + @pytest.fixture(name="group") def fixture_group(lcmsrun, username): grp = metob.Group() diff --git a/tests/unit/test_predict_rt.py b/tests/unit/test_predict_rt.py index 2a3affe1..9052ab6b 100644 --- a/tests/unit/test_predict_rt.py +++ b/tests/unit/test_predict_rt.py @@ -1,2 +1,36 @@ """ unit testing of predict_rt functions """ # pylint: disable=missing-function-docstring + +import os + +from metatlas.datastructures import metatlas_dataset as mads +from metatlas.tools import predict_rt + + +def test_get_rts01(metatlas_dataset): + rts_df = predict_rt.get_rts(metatlas_dataset, include_atlas_rt_peak=False) + assert f"{rts_df.iloc[0]['min']:0.5f}" == "2.29224" + + +def test_get_rts02( + mocker, df_container, analysis_ids, qc_lcmsruns, sqlite_with_atlas, username, groups_controlled_vocab +): + mocker.patch( + "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container + ) + mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=qc_lcmsruns) + ids = mads.AnalysisIdentifiers( + source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + output_type="FinalEMA-HILIC", + polarity="positive", + analysis_number=0, + project_directory=str(os.getcwd()), + groups_controlled_vocab=groups_controlled_vocab, + ) + metatlas_dataset = mads.MetatlasDataset(ids=ids, save_metadata=False) + rts_df = predict_rt.get_rts(metatlas_dataset, include_atlas_rt_peak=False) + assert ( + rts_df.to_json() + == 
"""{"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"mean":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"median":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"min":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"max":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"standard deviation":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":0.0},"standard error":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":0.0},"#NaNs":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":0}}""" + ) From 975174ff8a35c2163aaac248f6c78c436a19b7ba Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 11 Aug 2021 11:18:11 -0700 Subject: [PATCH 081/177] blacken code --- metatlas/io/rclone.py | 4 ++-- metatlas/tools/predict_rt.py | 7 +------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/metatlas/io/rclone.py b/metatlas/io/rclone.py index 7707a313..65385945 100644 --- a/metatlas/io/rclone.py +++ b/metatlas/io/rclone.py @@ -20,7 +20,7 @@ def config_file(self): result = subprocess.check_output(f"{self.rclone_path} config file", text=True) except (subprocess.CalledProcessError, FileNotFoundError): return None - return result.split('\n')[1] + return result.split("\n")[1] def get_name_for_id(self, identifier): """ @@ -55,4 +55,4 @@ def copy_to_drive(self, source, drive, dest_path=None): logger.exception(err) raise err except FileNotFoundError: - logger.info('rclone not found. Skipping transfer to Google Drive') + logger.info("rclone not found. 
Skipping transfer to Google Drive")
diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index f1623077..02b3d058 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -90,12 +90,7 @@ def predict(self, x_values):
 
 
 def generate_rt_correction_models(
-    ids,
-    cpus,
-    repo_dir,
-    save_to_db=True,
-    use_poly_model=True,
-    model_only=False
+    ids, cpus, repo_dir, save_to_db=True, use_poly_model=True, model_only=False
 ):
     """
     Generate the RT correction models and associated atlases with adjusted RT values

From c27e2e189a70e96bef38f009f9d20306d03be2a4 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Mon, 16 Aug 2021 11:19:26 -0700
Subject: [PATCH 082/177] Refactor get_data_for_atlas_and_lcmsrun()

---
 metatlas/io/metatlas_get_data_helper_fun.py   | 112 +++---
 .../unit/test_metatlas_get_data_helper_fun.py | 258 ++++++++++++++++++
 2 files changed, 302 insertions(+), 68 deletions(-)

diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py
index bd311714..a9ef7187 100644
--- a/metatlas/io/metatlas_get_data_helper_fun.py
+++ b/metatlas/io/metatlas_get_data_helper_fun.py
@@ -407,6 +407,7 @@ def retrieve_most_intense_msms_scan(data):
     msms_data['precursor_intensity'] = pintensity
     return msms_data
 
+
 def get_data_for_atlas_and_lcmsrun(atlas_df, df_container, extra_time, extra_mz):
     '''
     Accepts
@@ -415,74 +416,49 @@ def get_data_for_atlas_and_lcmsrun(atlas_df, df_container, extra_time, extra_mz)
     Returns python dictionaries of ms1, eic, and ms2 results for each compound in the atlas dataframe.
     '''
-
-    #filtered the ms2 and ms1 pos and neg frames in the container by rt and mz extreme points.
-    filtered_ms1_pos = prefilter_ms1_dataframe_with_boundaries(df_container['ms1_pos'],
-                                                atlas_df[atlas_df.detected_polarity == 'positive'].rt_max.max(),
-                                                atlas_df[atlas_df.detected_polarity == 'positive'].rt_min.min(),
-                                                0,
-                                                #atlas_df[atlas_df.detected_polarity == 'positive'].mz.min()-1,
-                                                atlas_df[atlas_df.detected_polarity == 'positive'].mz.max()+1,
-                                                extra_time = extra_time,
-                                                extra_mz = extra_mz)
-    filtered_ms1_neg = prefilter_ms1_dataframe_with_boundaries(df_container['ms1_neg'],
-                                                atlas_df[atlas_df.detected_polarity == 'negative'].rt_max.max(),
-                                                atlas_df[atlas_df.detected_polarity == 'negative'].rt_min.min(),
-                                                0,
-                                                #atlas_df[atlas_df.detected_polarity == 'negative'].mz.min()-1,
-                                                atlas_df[atlas_df.detected_polarity == 'negative'].mz.max()+1,
-                                                extra_time = extra_time,
-                                                extra_mz = extra_mz)
-    filtered_ms2_pos = prefilter_ms1_dataframe_with_boundaries(df_container['ms2_pos'],
-                                                atlas_df[atlas_df.detected_polarity == 'positive'].rt_max.max(),
-                                                atlas_df[atlas_df.detected_polarity == 'positive'].rt_min.min(),
-                                                0,
-                                                #atlas_df[atlas_df.detected_polarity == 'positive'].mz.min()-1,
-                                                atlas_df[atlas_df.detected_polarity == 'positive'].mz.max()+1,
-                                                extra_time = extra_time,
-                                                extra_mz = extra_mz)
-
-    filtered_ms2_neg = prefilter_ms1_dataframe_with_boundaries(df_container['ms2_neg'],
-                                                atlas_df[atlas_df.detected_polarity == 'negative'].rt_max.max(),
-                                                atlas_df[atlas_df.detected_polarity == 'negative'].rt_min.min(),
-                                                0,
-                                                #atlas_df[atlas_df.detected_polarity == 'negative'].mz.min()-1,
-                                                atlas_df[atlas_df.detected_polarity == 'negative'].mz.max()+1,
-                                                extra_time = extra_time,
-                                                extra_mz = extra_mz)
-
-
-    ms1_feature_data = atlas_df.apply(lambda x: get_data_for_mzrt(x,filtered_ms1_pos,filtered_ms1_neg, extra_time=extra_time, extra_mz = extra_mz),axis=1)
-    ms1_summary = ms1_feature_data.apply(get_ms1_summary,axis=1)
-    #if ms1_summary.size == 0:
-    #    return [],[],[]
-    if ms1_feature_data.shape[1] == 0:
-        return None,None,None
-    else:
-        ms1_eic = ms1_feature_data.apply(get_ms1_eic,axis=1)
-        #print ms1_eic
-        ms2_feature_data = atlas_df.apply(lambda x: get_data_for_mzrt(x,filtered_ms2_pos,filtered_ms2_neg,use_mz = 'precursor_MZ', extra_mz = extra_mz, extra_time=extra_time),axis=1)
-        ms2_data = ms2_feature_data.apply(get_ms2_data,axis=1)
-    dict_ms1_summary = [dict(row) for i,row in ms1_summary.iterrows()]
-
-    dict_eic = []
-    for i,row in ms1_eic.iterrows():
-        dict_eic.append(row.eic.T.to_dict(orient='list'))
-
-    #rename the "i" to "intensity".
-    for i,d in enumerate(dict_eic):
-        dict_eic[i]['intensity'] = dict_eic[i].pop('i')
-
-    dict_ms2 = []
-    for i,row in ms2_data.iterrows():
-        if 'ms2_datapoints' in list(row.keys()):
-            dict_ms2.append(row.ms2_datapoints.T.to_dict(orient='list'))
-        else:
-            dict_ms2.append([])
-
-    return dict_ms1_summary,dict_eic,dict_ms2
-
-
+    # filtered the ms2 and ms1 pos and neg frames in the container by rt and mz extreme points.
+    is_pos = atlas_df.detected_polarity == 'positive'
+    is_neg = atlas_df.detected_polarity == 'negative'
+    pos_filter_params = [atlas_df[is_pos].rt_max.max(), atlas_df[is_pos].rt_min.min(), 0,
+                         atlas_df[is_pos].mz.max()+1, extra_time, extra_mz]
+    neg_filter_params = [atlas_df[is_neg].rt_max.max(), atlas_df[is_neg].rt_min.min(), 0,
+                         atlas_df[is_neg].mz.max()+1, extra_time, extra_mz]
+    filtered_ms1_pos = prefilter_ms1_dataframe_with_boundaries(df_container['ms1_pos'], *pos_filter_params)
+    filtered_ms1_neg = prefilter_ms1_dataframe_with_boundaries(df_container['ms1_neg'], *neg_filter_params)
+    filtered_ms2_pos = prefilter_ms1_dataframe_with_boundaries(df_container['ms2_pos'], *pos_filter_params)
+    filtered_ms2_neg = prefilter_ms1_dataframe_with_boundaries(df_container['ms2_neg'], *neg_filter_params)
+
+    def get_feature_data(atlas_df, pos_df, neg_df, use_mz='mz'):
+        return atlas_df.apply(
+            lambda x: get_data_for_mzrt(x, pos_df, neg_df, extra_time, use_mz, extra_mz), axis=1
+        )
+    ms1_features = get_feature_data(atlas_df, filtered_ms1_pos, filtered_ms1_neg)
+    if ms1_features.shape[1] == 0:
+        return None, None, None
+    ms2_features = get_feature_data(atlas_df, filtered_ms2_pos, filtered_ms2_neg, use_mz='precursor_MZ')
+    return get_ms1_summary_data(ms1_features), get_eic_data(ms1_features), get_ms2_dict(ms2_features)
+
+
+def get_ms2_dict(ms2_feature_data_df):
+    """ extract a dict of ms2 data from the ms2 dataframe """
+    ms2_data = ms2_feature_data_df.apply(get_ms2_data, axis=1)
+    return [row.ms2_datapoints.T.to_dict(orient='list') if 'ms2_datapoints' in list(row.keys()) else []
+            for _, row in ms2_data.iterrows()]
+
+
+def get_ms1_summary_data(ms1_feature_data_df):
+    """ extract a list of ms1 data from the ms1 dataframe """
+    ms1_summary = ms1_feature_data_df.apply(get_ms1_summary, axis=1)
+    return [dict(row) for _, row in ms1_summary.iterrows()]
+
+
+def get_eic_data(ms1_feature_data_df):
+    """ extract a list of eic data from the ms1 dataframe """
+    ms1_eic = ms1_feature_data_df.apply(get_ms1_eic, axis=1)
+    dict_eic = [row.eic.T.to_dict(orient='list') for _, row in ms1_eic.iterrows()]
+    for value in dict_eic:
+        value['intensity'] = value.pop('i')  # rename the "i" to "intensity"
+    return dict_eic
 
 def get_unique_scan_data(data):
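The shared rt/mz boundary arguments are now packed once per polarity and splatted into each prefilter call, which removes four near-identical call blocks. The same pattern in isolation (toy stand-in function; the positional order is an assumption that must match prefilter_ms1_dataframe_with_boundaries exactly, since the old keyword-style calls were immune to reordering):

    def prefilter(frame, rt_max, rt_min, mz_min, mz_max, extra_time, extra_mz):
        # stand-in with the positional signature the refactor relies on
        return {"frame": frame, "rt": (rt_min, rt_max), "mz": (mz_min, mz_max + extra_mz)}

    # one packing per polarity, reused across the ms1 and ms2 frames
    pos_params = [10.0, 0.5, 0, 253.1, 0.75, 0]  # rt_max, rt_min, mz_min, mz_max, extra_time, extra_mz
    ms1_pos = prefilter("ms1_pos_df", *pos_params)
    ms2_pos = prefilter("ms2_pos_df", *pos_params)

A comment at the packing site recording that order would make the splatted calls harder to break silently.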
diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py
index 20d62778..3f5f2729 100644
--- a/tests/unit/test_metatlas_get_data_helper_fun.py
+++ 
b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -111,3 +111,261 @@ def test_extract_metatlas_dataset02(metatlas_dataset): def test_extract_metatlas_dataset03(metatlas_dataset): assert gdhf.extract(metatlas_dataset, ["foo"], "zoop") == "zoop" + + +def test_get_data_for_atlas_and_lcmsrun(atlas_df, df_container): + result = gdhf.get_data_for_atlas_and_lcmsrun(atlas_df, df_container, 0.75, 0) + assert result[0] == [ + { + "num_ms1_datapoints": 74.0, + "mz_peak": 252.1090087891, + "rt_peak": 2.2922415733, + "mz_centroid": 252.10896296693303, + "rt_centroid": 2.2720730579808084, + "peak_height": 2359861.25, + "peak_area": 57016800.755859375, + } + ] + assert result[1] == [ + { + "mz": [ + 252.1089324951, + 252.1090087891, + 252.1088104248, + 252.1090087891, + 252.10887146, + 252.1089324951, + 252.1089324951, + 252.1088256836, + 252.1088867188, + 252.1090393066, + 252.1089782715, + 252.1089630127, + 252.1089630127, + 252.1089782715, + 252.1090240479, + 252.1089782715, + 252.1090240479, + 252.1089324951, + 252.1090393066, + 252.1088867188, + 252.10887146, + 252.1089324951, + 252.1089630127, + 252.1089935303, + 252.1089172363, + 252.1089477539, + 252.1090545654, + 252.1089630127, + 252.1090240479, + 252.1090087891, + 252.1090393066, + 252.1090240479, + 252.1089935303, + 252.1090240479, + 252.1089630127, + 252.1090087891, + 252.1090240479, + 252.1089172363, + 252.1089019775, + 252.1089477539, + 252.1089324951, + 252.1089477539, + 252.1089477539, + 252.1089477539, + 252.1089782715, + 252.1088867188, + 252.1089172363, + 252.1089324951, + 252.1089782715, + 252.1089477539, + 252.1089172363, + 252.1089324951, + 252.1089630127, + 252.1088867188, + 252.1089630127, + 252.1085205078, + 252.1090545654, + 252.1089935303, + 252.1088104248, + 252.1086578369, + 252.1089935303, + 252.1085510254, + 252.1082763672, + 252.1082458496, + 252.1084136963, + 252.1092224121, + 252.1091766357, + 252.1092834473, + 252.1087493896, + 252.1112518311, + 252.1088409424, + 252.1086425781, + 252.1091766357, + 252.1094055176, + ], + "rt": [ + 2.1030805111, + 2.1084616184, + 2.1139531136, + 2.1193552017, + 2.1248509884, + 2.1302509308, + 2.135682106, + 2.1411821842, + 2.1459801197, + 2.1513926983, + 2.1568279266, + 2.1622362137, + 2.1676549911, + 2.1730883121, + 2.179015398, + 2.1845297813, + 2.1900422573, + 2.1949694157, + 2.20002985, + 2.2055358887, + 2.2110378742, + 2.2165191174, + 2.2219588757, + 2.2273921967, + 2.2328462601, + 2.2382712364, + 2.2437169552, + 2.2492566109, + 2.2547125816, + 2.2601687908, + 2.2656960487, + 2.2704958916, + 2.2758042812, + 2.2813498974, + 2.2868082523, + 2.2922415733, + 2.2976748943, + 2.3031060696, + 2.308131218, + 2.313628912, + 2.3185498714, + 2.3239560127, + 2.3293914795, + 2.3349123001, + 2.3403663635, + 2.346799612, + 2.3522267342, + 2.3576600552, + 2.3631224632, + 2.3685662746, + 2.3740911484, + 2.3794057369, + 2.3848536015, + 2.3903660774, + 2.3953785896, + 2.4006638527, + 2.4062638283, + 2.411709547, + 2.4171659946, + 2.4226117134, + 2.4302260876, + 2.4357616901, + 2.4407405853, + 2.4461927414, + 2.451615572, + 2.4571509361, + 2.4627010822, + 2.4681572914, + 2.4735822678, + 2.4735822678, + 2.4787945747, + 2.4842174053, + 2.4896612167, + 2.495146513, + ], + "intensity": [ + 312203.5, + 387914.59375, + 308308.5, + 334653.59375, + 339521.625, + 345527.21875, + 292437.34375, + 413614.53125, + 300285.28125, + 383848.71875, + 404313.21875, + 377231.34375, + 453965.5625, + 431327.0, + 523180.0625, + 510239.8125, + 631459.1875, + 807419.5, + 842647.5625, + 1053031.625, + 1082361.625, + 
1198966.625, + 1109162.375, + 1126347.125, + 1373071.5, + 1589018.375, + 1281309.875, + 1660166.75, + 1492912.25, + 2029801.5, + 2029874.125, + 2035966.625, + 2010867.875, + 2036981.375, + 2148879.25, + 2359861.25, + 2054066.125, + 1691976.0, + 1778159.125, + 1776166.125, + 1752154.125, + 1575676.875, + 1199910.625, + 1259708.25, + 1087384.375, + 826077.125, + 802296.875, + 547785.125, + 545340.0625, + 584624.4375, + 468524.8125, + 305931.1875, + 330310.34375, + 309740.625, + 289212.71875, + 230440.9375, + 210549.390625, + 169972.390625, + 140521.234375, + 116637.953125, + 117197.625, + 84652.1171875, + 117615.578125, + 103500.921875, + 89320.9453125, + 76313.9296875, + 55575.00390625, + 76784.6796875, + 28829.162109375, + 26051.6171875, + 42957.18359375, + 50342.6953125, + 37611.33984375, + 38202.83203125, + ], + } + ] + assert result[2] == [ + { + "mz": [252.1087036133, 252.1572875977, 252.1090698242, 252.1557617188], + "i": [93112.0859375, 7624.11328125, 76976.7265625, 6090.6440429688], + "rt": [2.2203779221, 2.2203779221, 2.3452186584, 2.3452186584], + "polarity": [1.0, 1.0, 1.0, 1.0], + "precursor_MZ": [252.10887146, 252.10887146, 252.1089477539, 252.1089477539], + "precursor_intensity": [2872807.5, 2872807.5, 3046732.75, 3046732.75], + "collision_energy": [23.3333339691, 23.3333339691, 23.3333339691, 23.3333339691], + } + ] From b5b21a0bbeface18f530d7abdbfc7d4676cd6b45 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 16 Aug 2021 15:26:46 -0700 Subject: [PATCH 083/177] remove dummy file --- metatlas/io/hello.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 metatlas/io/hello.py diff --git a/metatlas/io/hello.py b/metatlas/io/hello.py deleted file mode 100644 index 82aa5cda..00000000 --- a/metatlas/io/hello.py +++ /dev/null @@ -1,2 +0,0 @@ -from __future__ import print_function -print('hello') From 24a4287df7b07eda7a0d2dbfd7b606284eb71d3d Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 16 Aug 2021 16:32:34 -0700 Subject: [PATCH 084/177] Refactor get_data_for_atlas_df_and_file() --- metatlas/io/metatlas_get_data_helper_fun.py | 101 ++--- .../unit/test_metatlas_get_data_helper_fun.py | 401 ++++++++++++++++++ 2 files changed, 430 insertions(+), 72 deletions(-) diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index a9ef7187..b9a7204e 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function import logging import numpy as np import os.path @@ -60,57 +58,29 @@ def compare_EIC_to_BPC_for_file(metatlas_dataset,file_index,yscale = 'linear'): plt.close(fig) return fig + def get_data_for_atlas_df_and_file(input_tuple): - my_file = input_tuple[0] - my_group = input_tuple[1] - atlas_df = input_tuple[2] - myAtlas = input_tuple[3] - extra_time = 0.5 - extra_mz = 0.0 - if len(input_tuple) == 6: - extra_time = input_tuple[4] - extra_mz = input_tuple[5] - elif len(input_tuple) == 5: - extra_time = input_tuple[4] - - df_container = df_container_from_metatlas_file(my_file) - - df_container = remove_ms1_data_not_in_atlas(atlas_df,df_container) - dict_ms1_summary,dict_eic,dict_ms2 = get_data_for_atlas_and_lcmsrun(atlas_df,df_container,extra_time, extra_mz) + my_file, group, atlas_df, atlas = input_tuple[:4] + extra_time = input_tuple[4] if len(input_tuple) >= 5 else 0.5 + extra_mz = input_tuple[5] if len(input_tuple) == 6 else 0.0 + df_container = remove_ms1_data_not_in_atlas(atlas_df, 
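+    # Note: the single-tuple signature keeps this function compatible with
+    # multiprocessing map-style fan-out, where each work item is one argument.
+    # A hypothetical work item (illustrative values only, not from this patch):
+    #     item = (h5_file, group, atlas_df, atlas, 0.75, 0.0)
+    # The two trailing entries are optional; the unpacking above falls back to
+    # extra_time=0.5 and extra_mz=0.0 when they are absent. The call below to
+    # get_data_for_atlas_and_lcmsrun yields three parallel lists (ms1 summary
+    # dicts, eic dicts, ms2 dicts), one entry per atlas row.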
df_container_from_metatlas_file(my_file)) + dict_ms1_summary, dict_eic, dict_ms2 = get_data_for_atlas_and_lcmsrun(atlas_df, df_container, + extra_time, extra_mz) row = [] for i in range(atlas_df.shape[0]): - result = {} - result['atlas_name'] = myAtlas.name - result['atlas_unique_id'] = myAtlas.unique_id - result['lcmsrun'] = my_file - result['group'] = my_group - temp_compound = copy.deepcopy(myAtlas.compound_identifications[i]) - result['identification'] = temp_compound - result['data'] = {} - if dict_eic: - result['data']['eic'] = dict_eic[i] - else: - result['data']['eic'] = None - if dict_ms1_summary: - result['data']['ms1_summary'] = dict_ms1_summary[i] - else: - result['data']['ms1_summary'] = None - - result['data']['msms'] = {} + result = {'atlas_name': atlas.name, 'atlas_unique_id': atlas.unique_id, 'lcmsrun': my_file, + 'group': group, 'identification': copy.deepcopy(atlas.compound_identifications[i])} + result['data'] = {'msms': {}, 'eic': dict_eic[i] if dict_eic else None} + result['data']['ms1_summary'] = dict_ms1_summary[i] if dict_ms1_summary else None if dict_ms2: - if len(dict_ms2[i])>0:#dict_ms2[i]['mz']: - for k in dict_ms2[0].keys(): - dict_ms2[i][k] = np.asarray(dict_ms2[i][k]) - # if temp_compound.mz_references[0].observed_polarity == 'positive': - # dict_ms2[i]['polarity'] = dict_ms2[i]['mz'] * 0.0 + 1.0 - # else: - # dict_ms2[i]['polarity'] = dict_ms2[i]['mz'] * 0.0 - result['data']['msms']['data'] = dict_ms2[i] + if len(dict_ms2[i]) > 0: + result['data']['msms']['data'] = {key: np.asarray(val) for key, val in dict_ms2[i].items()} else: result['data']['msms']['data'] = [] row.append(result) return tuple(row) + def get_bpc(filename,dataset='ms1_pos',integration='bpc'): """ Gets the basepeak chromatogram for a file. @@ -318,40 +288,27 @@ def get_data_for_mzrt(row,data_df_pos,data_df_neg,extra_time = 0.5,use_mz = 'mz' return_df = pd.Series({'padded_feature_data':all_df,'in_feature':(all_df.rt >= row.rt_min) & (all_df.rt <= row.rt_max)}) return return_df + def get_ms1_summary(row): - #A DataFrame of all points typically padded by "extra time" + # A DataFrame of all points typically padded by "extra time" all_df = row.padded_feature_data - - #slice out ms1 data that is NOT padded by extra_time - ms1_df = all_df[(row.in_feature == True)]#[['i','mz','polarity','rt']] - + # slice out ms1 data that is NOT padded by extra_time + ms1_df = all_df[(row.in_feature)] num_ms1_datapoints = ms1_df.shape[0] - if num_ms1_datapoints > 0: - idx = ms1_df.i.idxmax() + has_data = num_ms1_datapoints > 0 + if has_data: ms1_peak_df = ms1_df.loc[ms1_df['i'].idxmax()] - mz_peak = ms1_peak_df.mz - rt_peak = ms1_peak_df.rt - mz_centroid = sum(ms1_df.mz * ms1_df.i) / sum(ms1_df.i) - rt_centroid = sum(ms1_df.rt * ms1_df.i) / sum(ms1_df.i) - peak_height = ms1_peak_df.i peak_area = sum(ms1_df.i) - else: - mz_peak = np.nan - rt_peak = np.nan - mz_centroid = np.nan - rt_centroid = np.nan - peak_height = np.nan - peak_area = np.nan - - return_df = pd.Series({ 'num_ms1_datapoints':num_ms1_datapoints, - 'mz_peak':mz_peak, - 'rt_peak':rt_peak, - 'mz_centroid':mz_centroid, - 'rt_centroid':rt_centroid, - 'peak_height':peak_height, - 'peak_area':peak_area}) + return pd.Series({ + 'num_ms1_datapoints': num_ms1_datapoints, + 'mz_peak': ms1_peak_df.mz if has_data else np.nan, + 'rt_peak': ms1_peak_df.rt if has_data else np.nan, + 'mz_centroid': sum(ms1_df.mz * ms1_df.i) / peak_area if has_data else np.nan, + 'rt_centroid': sum(ms1_df.rt * ms1_df.i) / peak_area if has_data else np.nan, + 'peak_height': 
ms1_peak_df.i if has_data else np.nan, + 'peak_area': peak_area if has_data else np.nan + }) - return return_df def get_ms2_data(row): #A DataFrame of all points typically padded by "extra time" diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index 3f5f2729..65c5baf4 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -1,6 +1,8 @@ """ unit testing of helper functions """ # pylint: disable=missing-function-docstring import pytest +import numpy as np + from metatlas.io import metatlas_get_data_helper_fun as gdhf @@ -369,3 +371,402 @@ def test_get_data_for_atlas_and_lcmsrun(atlas_df, df_container): "collision_energy": [23.3333339691, 23.3333339691, 23.3333339691, 23.3333339691], } ] + + +def test_get_data_for_atlas_df_and_file(lcmsrun, group, atlas_df, atlas, username): + result = gdhf.get_data_for_atlas_df_and_file((lcmsrun.hdf5_file, group, atlas_df, atlas)) + expected = ( + { + "atlas_name": f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", + "atlas_unique_id": "749354f7ad974b288624dad533dcbeec", + "lcmsrun": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5", + "group": { + "creation_time": "2021-05-04T09:41:17", + "description": "No description", + "head_id": "61041d07b5a24ca5b88efbda8f319654", + "items": [ + { + "acquisition_time": 1604770080, + "creation_time": "2020-11-13T15:58:43", + "description": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 " + "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML", + "experiment": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", + "hdf5_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5", + "head_id": "7ce51039cfca4426b4e51999ac45d018", + "injection_volume": 0.0, + "injection_volume_units": "uL", + "last_modified": "2021-08-16T12:04:52", + "method": None, + "mzml_file": "/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML", + "name": "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML", + "pass_qc": False, + "prev_uid": "beec4ed4d4b94190aa0776718b5f4fcb", + "sample": None, + "unique_id": "7ce51039cfca4426b4e51999ac45d018", + "username": username, + } + ], + "last_modified": "2021-05-04T09:41:17", + "name": f"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_{username}0_Cone-S1", + "prev_uid": "origin", + "short_name": "POS_Cone-S1", + "unique_id": "61041d07b5a24ca5b88efbda8f319654", + "username": username, + }, + "identification": { + "compound": [ + { + "chebi_id": "CHEBI:17256", + "chebi_url": "http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:17256", + "creation_time": "2016-06-17T18:13:15", + "description": "A purine 2'-deoxyribonucleoside having adenine as the " 
"nucleobase.", + "formula": "C10H13N5O3", + "head_id": "60cd6743e56545c6a6cb066ec3553450", + "hmdb_id": "HMDB00101", + "hmdb_url": "http://www.hmdb.ca/metabolites/HMDB00101", + "img_abc_id": "", + "inchi": "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + "inchi_key": "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + "iupac_name": "", + "kegg_id": "C00559", + "kegg_url": "http://www.genome.jp/dbget-bin/www_bget?C00559", + "last_modified": "2021-08-16T12:04:52", + "lipidmaps_id": "", + "lipidmaps_url": "", + "metacyc_id": "DEOXYADENOSINE", + "mono_isotopic_molecular_weight": 251.101839276, + "name": "2'-deoxyadenosine", + "neutralized_2d_inchi": "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)", + "neutralized_2d_inchi_key": "OLXZPDWKRNYJJZ-UHFFFAOYSA-N", + "neutralized_inchi": "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + "neutralized_inchi_key": "OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + "num_free_radicals": 0, + "number_components": 1, + "permanent_charge": 0, + "prev_uid": "35b9603309da430da318a0ab8c180457", + "pubchem_compound_id": "13730", + "pubchem_url": "http://pubchem.ncbi.nlm.nih.gov/compound/13730", + "source": "gnps///chebi///metacyc///hmdb", + "synonyms": "2'-deoxyadenosine", + "unique_id": "60cd6743e56545c6a6cb066ec3553450", + "username": username, + "wikipedia_url": "", + } + ], + "creation_time": "2021-02-10T16:20:49", + "description": "No description", + "do_normalization": False, + "frag_references": [], + "head_id": "18737c7141cc4efaa4545bead13ac751", + "identification_grade": None, + "identification_notes": "my id note", + "intensity_references": [], + "internal_standard_id": "", + "internal_standard_to_use": "", + "last_modified": "2021-08-16T12:04:52", + "ms1_notes": "keep", + "ms2_notes": "bad match to ref", + "mz_references": [ + { + "adduct": "[M+H]+", + "creation_time": "2021-02-10T16:20:50", + "description": "No description", + "detected_polarity": "positive", + "enabled": True, + "head_id": "eb6d03c9ef574051b92dad7b2fc259a2", + "last_modified": "2021-08-16T12:04:52", + "lcms_run": None, + "modification": "", + "mz": 252.1091393, + "mz_tolerance": 20.0, + "mz_tolerance_units": "ppm", + "name": "Untitled", + "observed_formula": "", + "prev_uid": "18c3020ee0a8422cac1664e3b7d564c6", + "ref_type": "", + "unique_id": "eb6d03c9ef574051b92dad7b2fc259a2", + "username": username, + } + ], + "name": "2'-deoxyadenosine", + "prev_uid": "6d0d6618769b4efea1d1457fae2b3358", + "rt_references": [ + { + "creation_time": "2021-02-10T16:20:50", + "description": "No description", + "enabled": True, + "head_id": "a845ddfdf8ef4713bcef3bdb84999030", + "last_modified": "2021-08-16T12:04:52", + "lcms_run": None, + "name": "Untitled", + "prev_uid": "194498f6d0ac475c943130d51c3c036a", + "ref_type": "", + "rt_max": 2.6964640053707174, + "rt_min": 1.6964640053707174, + "rt_peak": 2.1964640053707174, + "rt_units": "min", + "unique_id": "a845ddfdf8ef4713bcef3bdb84999030", + "username": username, + } + ], + "unique_id": "18737c7141cc4efaa4545bead13ac751", + "username": username, + }, + "data": { + "msms": { + "data": { + "mz": np.array([252.10870361, 252.1572876, 252.10906982, 252.15576172]), + "i": np.array([93112.0859375, 7624.11328125, 76976.7265625, 6090.64404297]), + "rt": np.array([2.22037792, 2.22037792, 2.34521866, 2.34521866]), + "polarity": np.array([1.0, 1.0, 1.0, 1.0]), + "precursor_MZ": 
np.array([252.10887146, 252.10887146, 252.10894775, 252.10894775]), + "precursor_intensity": np.array([2872807.5, 2872807.5, 3046732.75, 3046732.75]), + "collision_energy": np.array([23.33333397, 23.33333397, 23.33333397, 23.33333397]), + } + }, + "eic": { + "mz": [ + 252.1089324951, + 252.1090087891, + 252.1088104248, + 252.1090087891, + 252.10887146, + 252.1089324951, + 252.1089324951, + 252.1088256836, + 252.1088867188, + 252.1090393066, + 252.1089782715, + 252.1089630127, + 252.1089630127, + 252.1089782715, + 252.1090240479, + 252.1089782715, + 252.1090240479, + 252.1089324951, + 252.1090393066, + 252.1088867188, + 252.10887146, + 252.1089324951, + 252.1089630127, + 252.1089935303, + 252.1089172363, + 252.1089477539, + 252.1090545654, + 252.1089630127, + 252.1090240479, + 252.1090087891, + 252.1090393066, + 252.1090240479, + 252.1089935303, + 252.1090240479, + 252.1089630127, + 252.1090087891, + 252.1090240479, + 252.1089172363, + 252.1089019775, + 252.1089477539, + 252.1089324951, + 252.1089477539, + 252.1089477539, + 252.1089477539, + 252.1089782715, + 252.1088867188, + 252.1089172363, + 252.1089324951, + 252.1089782715, + 252.1089477539, + 252.1089172363, + 252.1089324951, + 252.1089630127, + 252.1088867188, + 252.1089630127, + 252.1085205078, + 252.1090545654, + 252.1089935303, + 252.1088104248, + 252.1086578369, + 252.1089935303, + 252.1085510254, + 252.1082763672, + 252.1082458496, + 252.1084136963, + 252.1092224121, + 252.1091766357, + 252.1092834473, + 252.1087493896, + 252.1112518311, + 252.1088409424, + 252.1086425781, + 252.1091766357, + 252.1094055176, + ], + "rt": [ + 2.1030805111, + 2.1084616184, + 2.1139531136, + 2.1193552017, + 2.1248509884, + 2.1302509308, + 2.135682106, + 2.1411821842, + 2.1459801197, + 2.1513926983, + 2.1568279266, + 2.1622362137, + 2.1676549911, + 2.1730883121, + 2.179015398, + 2.1845297813, + 2.1900422573, + 2.1949694157, + 2.20002985, + 2.2055358887, + 2.2110378742, + 2.2165191174, + 2.2219588757, + 2.2273921967, + 2.2328462601, + 2.2382712364, + 2.2437169552, + 2.2492566109, + 2.2547125816, + 2.2601687908, + 2.2656960487, + 2.2704958916, + 2.2758042812, + 2.2813498974, + 2.2868082523, + 2.2922415733, + 2.2976748943, + 2.3031060696, + 2.308131218, + 2.313628912, + 2.3185498714, + 2.3239560127, + 2.3293914795, + 2.3349123001, + 2.3403663635, + 2.346799612, + 2.3522267342, + 2.3576600552, + 2.3631224632, + 2.3685662746, + 2.3740911484, + 2.3794057369, + 2.3848536015, + 2.3903660774, + 2.3953785896, + 2.4006638527, + 2.4062638283, + 2.411709547, + 2.4171659946, + 2.4226117134, + 2.4302260876, + 2.4357616901, + 2.4407405853, + 2.4461927414, + 2.451615572, + 2.4571509361, + 2.4627010822, + 2.4681572914, + 2.4735822678, + 2.4735822678, + 2.4787945747, + 2.4842174053, + 2.4896612167, + 2.495146513, + ], + "intensity": [ + 312203.5, + 387914.59375, + 308308.5, + 334653.59375, + 339521.625, + 345527.21875, + 292437.34375, + 413614.53125, + 300285.28125, + 383848.71875, + 404313.21875, + 377231.34375, + 453965.5625, + 431327.0, + 523180.0625, + 510239.8125, + 631459.1875, + 807419.5, + 842647.5625, + 1053031.625, + 1082361.625, + 1198966.625, + 1109162.375, + 1126347.125, + 1373071.5, + 1589018.375, + 1281309.875, + 1660166.75, + 1492912.25, + 2029801.5, + 2029874.125, + 2035966.625, + 2010867.875, + 2036981.375, + 2148879.25, + 2359861.25, + 2054066.125, + 1691976.0, + 1778159.125, + 1776166.125, + 1752154.125, + 1575676.875, + 1199910.625, + 1259708.25, + 1087384.375, + 826077.125, + 802296.875, + 547785.125, + 545340.0625, + 584624.4375, + 468524.8125, + 
305931.1875, + 330310.34375, + 309740.625, + 289212.71875, + 230440.9375, + 210549.390625, + 169972.390625, + 140521.234375, + 116637.953125, + 117197.625, + 84652.1171875, + 117615.578125, + 103500.921875, + 89320.9453125, + 76313.9296875, + 55575.00390625, + 76784.6796875, + 28829.162109375, + 26051.6171875, + 42957.18359375, + 50342.6953125, + 37611.33984375, + 38202.83203125, + ], + }, + "ms1_summary": { + "num_ms1_datapoints": 74.0, + "mz_peak": 252.1090087891, + "rt_peak": 2.2922415733, + "mz_centroid": 252.10896296693303, + "rt_centroid": 2.2720730579808084, + "peak_height": 2359861.25, + "peak_area": 57016800.755859375, + }, + }, + }, + ) + assert len(result) == len(expected) + assert result[0].keys() == expected[0].keys() + for key in ["atlas_name", "lcmsrun", "data"]: + assert str(result[0][key]) == str(expected[0][key]) From e1e66a9cfa0e952ee198642b6a6d5f00c8bd47b1 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 16 Aug 2021 16:47:17 -0700 Subject: [PATCH 085/177] clean up imports and refactor get_dill_data() --- metatlas/io/metatlas_get_data_helper_fun.py | 49 ++++++++------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index b9a7204e..193c9b74 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -1,18 +1,23 @@ +import copy import logging -import numpy as np +import math import os.path +import re import sys -import copy -import tables -from metatlas.datastructures import metatlas_objects as metob -from metatlas.io import write_utils -import pandas as pd + +from collections import defaultdict from textwrap import wrap + +import dill import matplotlib.pyplot as plt +import numpy as np +import pandas as pd import six -from six.moves import map -from six.moves import range -from six.moves import zip +import tables + +from metatlas.datastructures import metatlas_objects as metob +from metatlas.io import h5_query as h5q +from metatlas.io import write_utils logger = logging.getLogger(__name__) @@ -328,7 +333,6 @@ def get_ms2_data(row): def prefilter_ms1_dataframe_with_boundaries(data_df, rt_max, rt_min, mz_min, mz_max, extra_time = 0.5, extra_mz = 0.01): - import math if (data_df.shape[0]==0) | (math.isnan(rt_max)): return [] prefilter_query_str = 'rt <= %5.4f & rt >= %5.4f & mz >= %5.4f & mz <= %5.4f'%(rt_max+extra_time, rt_min-extra_time, mz_min-extra_mz, mz_max+extra_mz) @@ -346,7 +350,6 @@ def get_ms1_eic(row): def retrieve_most_intense_msms_scan(data): - import numpy as np urt,idx = np.unique(data['rt'],return_index=True) sx = np.argsort(data['precursor_intensity'][idx])[::-1] prt = data['rt'][idx[sx]] @@ -487,7 +490,6 @@ def organize_msms_scan_data(data,list_of_prt,list_of_pmz,list_of_pintensity): return msms_data def retrieve_most_intense_msms_scan(data): - import numpy as np urt,idx = np.unique(data['rt'],return_index=True) sx = np.argsort(data['precursor_intensity'][idx])[::-1] prt = data['rt'][idx[sx]] @@ -528,9 +530,6 @@ def get_data_for_a_compound(mz_ref,rt_ref,what_to_get,h5file,extra_time): ------- """ #TODO : polarity should be handled in the experiment and not a loose parameter - import numpy as np - from metatlas.io import h5_query as h5q - import tables #get a pointer to the hdf5 file fid = tables.open_file(h5file) #TODO: should be a "with open:" @@ -653,7 +652,6 @@ def get_data_for_a_compound(mz_ref,rt_ref,what_to_get,h5file,extra_time): return return_data - def get_dill_data(fname): """ 
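    Load a dill-serialized metatlas dataset list from disk.
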
    Parameters
    ----------
    fname: dill file name

    Returns a list containing the data present in the dill file
    -------
    """
-    import dill
-
-    data = list()
-    if os.path.exists(fname):
-        with open(fname,'r') as f:
+    with open(fname, 'rb') as handle:  # dill payloads are binary; text mode fails under Python 3
         try:
-            data = dill.load(f)
-        except IOError as e:
-            print(("I/O error({0}): {1}".format(e.errno, e.strerror)))
+            return dill.load(handle)
+        except IOError as err:
+            print("I/O error({0}): {1}".format(err.errno, err.strerror))
         except:  # handle other exceptions such as attribute errors
             print(("Unexpected error:", sys.exc_info()[0]))
-
-
-    return data
+    return list()
 
 
 def get_group_names(data):
@@ -730,7 +722,6 @@ def get_file_names(data,full_path=False):
     Returns list containing the hdf file names present in the dill file
     -------
     """
-    import os.path
 
     # if data is a string then it's a file name - get its data
     if isinstance(data, six.string_types):
@@ -755,8 +746,6 @@ def get_compound_names(data,use_labels=False):
     Returns a tuple of lists containing the compound names and compound objects present in the dill file
     -------
     """
-    from collections import defaultdict
-    import re
 
     # if data is a string then it's a file name - get its data
     if isinstance(data, six.string_types):

From bd10947d1ae571a4c87aa22ca458e5255dc58eca Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Mon, 16 Aug 2021 17:31:51 -0700
Subject: [PATCH 086/177] refactoring in metatlas_get_data_helper_fun

---
 metatlas/io/metatlas_get_data_helper_fun.py | 31 ++++++++++-----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py
index 193c9b74..fab02711 100644
--- a/metatlas/io/metatlas_get_data_helper_fun.py
+++ b/metatlas/io/metatlas_get_data_helper_fun.py
@@ -332,12 +332,12 @@ def get_ms2_data(row):
     return return_df
 
 
-def prefilter_ms1_dataframe_with_boundaries(data_df, rt_max, rt_min, mz_min, mz_max, extra_time = 0.5, extra_mz = 0.01):
-    if (data_df.shape[0]==0) | (math.isnan(rt_max)):
+def prefilter_ms1_dataframe_with_boundaries(data_df, rt_max, rt_min, mz_min, mz_max, extra_time=0.5, extra_mz=0.01):
+    if (data_df.shape[0] == 0) | (math.isnan(rt_max)):
         return []
-    prefilter_query_str = 'rt <= %5.4f & rt >= %5.4f & mz >= %5.4f & mz <= %5.4f'%(rt_max+extra_time, rt_min-extra_time, mz_min-extra_mz, mz_max+extra_mz)
-    new_df = data_df.query(prefilter_query_str)
-    return new_df
+    return data_df.query(f"rt <= {rt_max+extra_time:5.4f} & rt >= {rt_min-extra_time:5.4f} "
+                         f"& mz >= {mz_min-extra_mz:5.4f} & mz <= {mz_max+extra_mz:5.4f}")
+
 
 def get_ms1_eic(row):
     #A DataFrame of all points typically padded by "extra time"
@@ -377,25 +377,22 @@ def get_data_for_atlas_and_lcmsrun(atlas_df, df_container, extra_time, extra_mz)
     Returns python dictionaries of ms1, eic, and ms2 results for each compound in the atlas dataframe.
     '''
     # filter the ms1 and ms2 pos and neg frames in the container by rt and mz extreme points.
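The f-string rewrite of prefilter_ms1_dataframe_with_boundaries above keeps the exact semantics of the old percent-formatted query: a row survives only when its rt lies inside the atlas rt window padded by extra_time and its mz lies inside the mz bounds padded by extra_mz. A minimal, self-contained sketch of the same DataFrame.query pattern (toy values; only the rt and mz column names are taken from the real ms1/ms2 frames):

    import pandas as pd

    frame = pd.DataFrame({"rt": [0.5, 2.0, 9.0], "mz": [100.0, 252.1, 900.0]})
    # padded boundaries, e.g. rt_min=1.5, rt_max=2.5, extra_time=0.5, mz_max=252.1, extra_mz=0.01
    kept = frame.query("rt <= 3.0000 & rt >= 1.0000 & mz >= -0.0100 & mz <= 252.1100")
    assert list(kept.index) == [1]  # only the middle row is inside the padded window

The hunk continues below, collapsing the four near-identical prefilter calls (ms1/ms2 crossed with positive/negative) into a single loop that fills a filtered dict keyed by mode.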
- is_pos = atlas_df.detected_polarity == 'positive' - is_neg = atlas_df.detected_polarity == 'negative' - pos_filter_params = [atlas_df[is_pos].rt_max.max(), atlas_df[is_pos].rt_min.min(), 0, - atlas_df[is_pos].mz.max()+1, extra_time, extra_mz] - neg_filter_params = [atlas_df[is_neg].rt_max.max(), atlas_df[is_neg].rt_min.min(), 0, - atlas_df[is_neg].mz.max()+1, extra_time, extra_mz] - filtered_ms1_pos = prefilter_ms1_dataframe_with_boundaries(df_container['ms1_pos'], *pos_filter_params) - filtered_ms1_neg = prefilter_ms1_dataframe_with_boundaries(df_container['ms1_neg'], *neg_filter_params) - filtered_ms2_pos = prefilter_ms1_dataframe_with_boundaries(df_container['ms2_pos'], *pos_filter_params) - filtered_ms2_neg = prefilter_ms1_dataframe_with_boundaries(df_container['ms2_neg'], *neg_filter_params) + filtered = {} + for level in ['ms1', 'ms2']: + for polarity in ['positive', 'negative']: + mode = f'{level}_{polarity[:3]}' + pol = atlas_df[atlas_df.detected_polarity == polarity] + params = [pol.rt_max.max(), pol.rt_min.min(), 0, pol.mz.max()+1, extra_time, extra_mz] + filtered[mode] = prefilter_ms1_dataframe_with_boundaries(df_container[mode], *params) def get_feature_data(atlas_df, pos_df, neg_df, use_mz='mz'): return atlas_df.apply( lambda x: get_data_for_mzrt(x, pos_df, neg_df, extra_time, use_mz, extra_mz), axis=1 ) - ms1_features = get_feature_data(atlas_df, filtered_ms1_pos, filtered_ms1_neg) + ms1_features = get_feature_data(atlas_df, filtered['ms1_pos'], filtered['ms1_neg']) if ms1_features.shape[1] == 0: return None, None, None - ms2_features = get_feature_data(atlas_df, filtered_ms2_pos, filtered_ms2_neg, use_mz='precursor_MZ') + ms2_features = get_feature_data(atlas_df, filtered['ms2_pos'], filtered['ms2_neg'], use_mz='precursor_MZ') return get_ms1_summary_data(ms1_features), get_eic_data(ms1_features), get_ms2_dict(ms2_features) From 9eee2a6452be79f4a86278da3a6877cbfe54a1c6 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 16 Aug 2021 17:49:02 -0700 Subject: [PATCH 087/177] refactor get_data_for_mzrt() --- metatlas/io/metatlas_get_data_helper_fun.py | 31 +++++++++------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index fab02711..4d48ec7c 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -274,24 +274,19 @@ def transfer_identification_data_to_atlas(data, atlas, ids_list=None): set_nested(aci, ids, from_data) return out -def get_data_for_mzrt(row,data_df_pos,data_df_neg,extra_time = 0.5,use_mz = 'mz',extra_mz = 0.0): - min_mz = '(%s >= %5.4f & '%(use_mz,row.mz - row.mz*row.mz_tolerance / 1e6 - extra_mz) - rt_min = 'rt >= %5.4f & '%(row.rt_min - extra_time) - rt_max = 'rt <= %5.4f & '%(row.rt_max + extra_time) - max_mz = '%s <= %5.4f)'%(use_mz,row.mz + row.mz*row.mz_tolerance / 1e6 + extra_mz) - ms1_query_str = '%s%s%s%s'%(min_mz,rt_min,rt_max,max_mz) - if row.detected_polarity == 'positive': - if len(data_df_pos)>0: - all_df = data_df_pos.query(ms1_query_str) - else: - return pd.Series(dtype=np.float64) - else: - if len(data_df_neg)>0: - all_df = data_df_neg.query(ms1_query_str) - else: - return pd.Series() - return_df = pd.Series({'padded_feature_data':all_df,'in_feature':(all_df.rt >= row.rt_min) & (all_df.rt <= row.rt_max)}) - return return_df + +def get_data_for_mzrt(row, data_df_pos, data_df_neg, extra_time=0.5, use_mz='mz', extra_mz=0.0): + mz_min = '%s >= %5.4f' % (use_mz, row.mz - 
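+    # mz_tolerance is expressed in ppm, so the half-window in this expression is
+    # mz * mz_tolerance / 1e6, widened by extra_mz. Worked example: 20 ppm at
+    # mz = 252.1091 gives 252.1091 * 20 / 1e6, about 0.0050, so with extra_mz=0
+    # the query spans roughly 252.1041 <= mz <= 252.1141.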
row.mz*row.mz_tolerance / 1e6 - extra_mz)
+    rt_min = 'rt >= %5.4f' % (row.rt_min - extra_time)
+    rt_max = 'rt <= %5.4f' % (row.rt_max + extra_time)
+    mz_max = '%s <= %5.4f' % (use_mz, row.mz + row.mz*row.mz_tolerance / 1e6 + extra_mz)
+    ms1_query_str = f"({mz_min} & {rt_min} & {rt_max} & {mz_max})"
+    data_df = data_df_pos if row.detected_polarity == 'positive' else data_df_neg
+    if len(data_df) == 0:
+        return pd.Series(dtype=np.float64)
+    all_df = data_df.query(ms1_query_str)
+    return pd.Series({'padded_feature_data': all_df,
+                      'in_feature': (all_df.rt >= row.rt_min) & (all_df.rt <= row.rt_max)})

From da60509e7f068d5f58a3371153e7030bd9b5f4c2 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 19 Aug 2021 16:06:43 -0700
Subject: [PATCH 088/177] Add system test of RT_Prediction notebook

---
 noxfile.py                      |   7 +-
 tests/system/test_rt_predict.py | 225 ++++++++++++++++++++++++++++++++
 2 files changed, 230 insertions(+), 2 deletions(-)
 create mode 100644 tests/system/test_rt_predict.py

diff --git a/noxfile.py b/noxfile.py
index 019624f3..91c3360a 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -79,6 +79,7 @@
 REUSE_LARGE_VENV = True
 NB_LINE_LEN = 140
 
+
 @nox.session(python=py_versions[0])
 def flake8_diff(session):
     session.install(*flake8_deps)
@@ -128,7 +129,7 @@ def pylint_nb(session):
     # duplicate code cannot be disabled on per-cell level https://github.com/PyCQA/pylint/issues/214
     # Some duplicate code is required to setup the notebook and do error handling.
     # So turn off duplicate code for whole session -- not ideal.
-    session.run("nbqa", "pylint", "--disable=duplicate-code", f"--max-line-length={NB_LINE_LEN}", *notebooks)
+    session.run("nbqa", "pylint", "--disable=duplicate-code", f"--max-line-length={NB_LINE_LEN}", *notebooks)
 
 
 @nox.session(python=py_versions[0])
@@ -153,7 +154,9 @@ def blacken_nb(session):
 @nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV)
 def unit_tests(session):
     session.install("-r", "docker/requirements.txt", *pytest_deps)
-    session.run("pytest", "-vv", *session.posargs, "--cov", "metatlas", "tests/unit/", env={"METATLAS_LOCAL": "TRUE"})
+    session.run(
+        "pytest", "-vv", *session.posargs, "--cov", "metatlas", "tests/unit/", env={"METATLAS_LOCAL": "TRUE"}
+    )
 
 
 @nox.session(python=py_versions[0], reuse_venv=REUSE_LARGE_VENV)
diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py
new file mode 100644
index 00000000..f0bdd3be
--- /dev/null
+++ b/tests/system/test_rt_predict.py
@@ -0,0 +1,225 @@
+# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long
+
+import os
+import subprocess
+
+
+def test_rt_predict_by_line01_with_remove(tmp_path):
+    image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci02:v1.3.3"
+    experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583"
+    out_files = {}
+    expected = {}
+    out_files["rt_model"] = tmp_path / experiment / "root0/data_QC/rt_model.txt"
+    expected[
+        "rt_model"
+    ] = """RANSACRegressor(random_state=42)
+Linear model with intercept=-0.004 and slope=0.99798
+groups = 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_root0_QC, 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_QC
+atlas = HILICz150_ANT20190824_TPL_QCv3_Unlab_POS
+
+LinearRegression()
+Polynomial model with intercept=0.097 and coefficents=[0.00000, 0.96116, 0.00213]
+groups = 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_root0_QC,
20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_QC +atlas = HILICz150_ANT20190824_TPL_QCv3_Unlab_POS +""" + + out_files["RT_Predicted_Model_Comparison"] = ( + tmp_path / experiment / "root0/data_QC/RT_Predicted_Model_Comparison.csv" + ) + expected[ + "RT_Predicted_Model_Comparison" + ] = """,RT Measured,RT Reference,RT Linear Pred,RT Polynomial Pred,RT Diff Linear,RT Diff Polynomial +0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,0.7757497429847717,1.068941733,1.062903572946303,1.1269157193507062,-0.28715382996153127,-0.3511659763659345 +0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.2491384744644165,1.224396021,1.2180440647072988,1.277093230335882,0.031094409757117747,-0.027954755871465453 +0002_thymine_unlabeled_positive_M+H127p0502_1p26,1.6553537845611572,1.255231064,1.248816864840618,1.3068938537656585,0.4065369197205393,0.34845993079549875 +0003_benzoic_acid_unlabeled_positive_M+H123p0441_1p27,1.2450578808784485,1.272043637,1.265595500390302,1.323144126044538,-0.020537619511853622,-0.07808624516608953 +0004_2_4-dihydroxypteridine_unlabeled_positive_M+H165p0407_1p27,,1.272194658,1.2657462165429905,1.32329010157373,, +0005_uracil_unlabeled_positive_M+H113p0346_1p39,1.8842174410820007,1.393699506,1.3870057978165864,1.4407671152594659,0.49721164326541434,0.44345032582253485 +0006_thymidine_unlabeled_positive_M+H243p0975_1p60,1.6583104729652405,1.603927488,1.596809418733121,1.6441748822425457,0.06150105423211949,0.014135590722694769 +0007_2-hydroxyphenylacetic_acid_unlabeled_positive_M+H153p0546_1p62,1.3172867894172668,1.616534167,1.6093906501979809,1.656378567437273,-0.29210386078071404,-0.33909177802000623 +0008_deoxyuridine_unlabeled_positive_M+H229p0819_1p88,1.9361981749534607,1.876662419,1.8689938136450273,1.9083427073174741,0.06720436130843344,0.027855467635986564 +0009_acetylcholine_unlabeled_positive_M146p1176_1p96,1.8230673670768738,1.9552864,1.9474590861922034,1.9845558887665662,-0.12439171911532965,-0.1614885216896924 +0010_pyridoxine_unlabeled_positive_M+H170p0812_2p16,2.1844738721847534,2.158057096,2.149820474204603,2.181230854992953,0.03465339798015021,0.0032430171918003126 +0011_salicylic_acid_unlabeled_positive_M+H139p0390_2p20,1.8282268047332764,2.196354854,2.1880409252832096,2.2183969913536457,-0.3598141205499332,-0.3901701866203693 +0012_2deoxyadenosine_unlabeled_positive_M+H252p1091_2p23,2.293710947036743,2.234006981,2.2256170486168254,2.254942673466113,0.06809389841991775,0.038768273570630285 +0013_adenine_unlabeled_positive_M+H136p0618_2p56,2.6646790504455566,2.557601998,2.548558864598047,2.56927787344364,0.11612018584750983,0.09540117700191653 +0014_xanthine_unlabeled_positive_M+H153p0407_2p73,2.7709169387817383,2.725344,2.7159622666788823,2.732395658981744,0.054954672102855984,0.03852127979999409 +0015_ribose_unlabeled_positive_M+H151p0601_2p75,2.793001413345337,2.750702982,2.741270059655543,2.757066003852173,0.05173135368979409,0.035935409493164094 +0016_rhamnose_unlabeled_positive_M+H165p0757_2p80,3.048208475112915,2.796993087,2.787466724577183,2.802106149909774,0.2607417505357321,0.2461023252031409 +0017_uridine_unlabeled_positive_M+H245p0768_2p89,2.920204997062683,2.888931519,2.8792195718979,2.8915891009061414,0.040985425164782985,0.028615896156541698 +0018_adenosine_unlabeled_positive_M+H268p1040_3p09,3.1090638637542725,3.091018609,3.0808987338207197,3.088405611875559,0.0281651299335528,0.020658251878713507 
+0019_hypoxanthine_unlabeled_positive_M+H137p0458_3p10,3.1437186002731323,3.102967341,3.092823346401367,3.1000481672384095,0.05089525387176552,0.043670433034722844 +0020_5-methylcytosine_unlabeled_positive_M+H126p0662_4p42,4.466769218444824,4.418371688,4.4055724502287985,4.385470549114132,0.06119676821602571,0.08129866933069252 +0021_2-oxovaleric_acid_unlabeled_positive_M+H117p0546_4p45,3.9647421836853027,4.448129315,4.435270009207811,4.414635292629683,-0.4705278255225078,-0.44989310894438006 +0022_cytosine_unlabeled_positive_M+H112p0505_4p83,4.878586053848267,4.833663875,4.8200263385363336,4.792830070747619,0.05855971531193305,0.0857559831006478 +0023_lactic_acid_unlabeled_positive_M+H91p0390_5p06,5.1236891746521,5.064398962,5.050295669310254,5.019475738648949,0.07339350534184597,0.10421343600315058 +0024_inosine_unlabeled_positive_M+H269p0880_5p43,5.435921669006348,5.434235961,5.4193861243530295,5.383231720701441,0.01653554465331819,0.052689948304906586 +0025_deoxycytidine_unlabeled_positive_M+H228p0979_5p59,5.63983416557312,5.594117397,5.578944827580915,5.540664965694513,0.060889337992205306,0.09916919987860684 +0026_nicotinic_acid_unlabeled_positive_M+H124p0393_5p63,5.612481355667114,5.631626786,5.61637850104198,5.577615783318941,-0.0038971453748661844,0.034865572348173224 +0027_phenylacetic_acid_unlabeled_positive_M+H137p0597_5p88,6.1643900871276855,5.878913512,5.8631660600806,5.8213702555437035,0.30122402704708584,0.3430198315839821 +0028_2_deoxyguanosine_unlabeled_positive_M+H268p1040_6p87,6.9536755084991455,6.87418691,6.856430423443528,6.8050649676035455,0.09724508505561769,0.1486105408956 +0029_cytidine_unlabeled_positive_M+H244p0928_6p93,6.825943946838379,6.933566273,6.915689924707814,6.863887108093793,-0.08974597786943495,-0.03794316125541375 +0030_N-acetyl-mannosamine_unlabeled_positive_M+Na244p0792_7p15,6.6624157428741455,7.153497474,7.135177177942838,7.081885478010634,-0.4727614350686924,-0.4194697351364889 +0031_betaine_unlabeled_positive_M118p0863_7p91,8.0335111618042,7.905109179,7.8852716978638355,7.828449318591656,0.1482394639403637,0.20506184321254306 +0032_guanosine_unlabeled_positive_M+H284p0989_8p57,8.48360538482666,8.570944541,8.549763020821363,8.49182598096076,-0.0661576359947027,-0.008220596134099978 +0033_phenylalanine_unlabeled_positive_M+H166p0863_8p98,9.09137773513794,8.979305704,8.95729987592511,8.899614864870298,0.13407785921283022,0.19176287026764172 +0034_leucine_unlabeled_positive_M+H132p1019_9p32,9.326712608337402,9.319656306,9.296963454490605,9.240032036011817,0.029749153846797327,0.08668057232558546 +0035_urocanic_acid_unlabeled_positive_M+H139p0502_9p35,8.878215789794922,9.351932178,9.3291741752016,9.272339873977243,-0.450958385406679,-0.39412408418232125 +0036_mannitol_unlabeled_positive_M+H183p0863_9p53,9.116773128509521,9.534507075,9.511380530961908,9.455179262226952,-0.3946074024523867,-0.3384061337174309 +0037_isoleucine_unlabeled_positive_M+H132p1019_9p71,9.326712608337402,9.70543744,9.681965860090859,9.626486134780636,-0.35525325175345657,-0.29977352644323396 +0038_xanthosine_unlabeled_positive_M+H285p0830_9p78,9.507513999938965,9.782678891,9.759051393379414,9.703938612775616,-0.251537393440449,-0.19642461283665114 +0039_tryptophan_unlabeled_positive_M+H205p0972_10p16,10.337260723114014,10.15664925,10.132266864922379,10.079290595303439,0.20499385819163507,0.2579701278105748 +0040_methionine_unlabeled_positive_M+H150p0583_10p44,10.456631660461426,10.4409554,10.41599912145348,10.365046300146744,0.04063253900794628,0.09158536031468145 
+0041_1-methyladenosine_unlabeled_positive_M+H282p1197_10p78,11.042783260345459,10.78124768,10.755604495746342,10.707526942600817,0.2871787645991173,0.3352563177446424 +0042_proline_unlabeled_positive_M+H116p0706_10p92,10.849864959716797,10.91977168,10.893848874574285,10.847083243109427,-0.043983914857488315,0.0027817166073695887 +0043_pipecolic_acid_unlabeled_positive_M+H130p0863_10p97,10.991784572601318,10.97482181,10.948787881722849,10.902566344225118,0.04299669087846958,0.08921822837620041 +0044_valine_unlabeled_positive_M+H118p0863_11p12,11.041275024414062,11.11600911,11.089690184478288,11.044923184166127,-0.048415160064225304,-0.0036481597520641174 +0045_5-oxo-proline_unlabeled_positive_M+H130p0499_11p65,11.50427532196045,11.65330736,11.625903857319027,11.58744978041825,-0.1216285353585782,-0.08317445845780114 +0046_taurine_unlabeled_positive_M+H126p0219_12p16,12.075395107269287,12.15812344,12.129700927876891,12.098300243547898,-0.05430582060760436,-0.022905136278611238 +0047_ectoine_unlabeled_positive_M+H143p0815_12p50,12.36878776550293,12.50349732,12.474377644575776,12.448428911008959,-0.10558987907284667,-0.07964114550602908 +0048_carnitine_unlabeled_positive_M+H161p1046_13p29,13.466909408569336,13.28582682,13.25512795330405,13.243410597608959,0.21178145526528525,0.22349881096037727 +0049_alanine_unlabeled_positive_M+H90p0550_13p41,13.68701457977295,13.40509074,13.37415113006032,13.364832662828071,0.3128634497126299,0.32218191694487786 +0050_sucrose_unlabeled_positive_M+H343p1235_13p45,13.328831672668457,13.44515078,13.414130305839409,13.405631224802201,-0.08529863317095199,-0.07679955213374434 +0051_threonine_unlabeled_positive_M+H120p0655_13p49,13.459657192230225,13.48957226,13.458462117721956,13.45087963624058,0.0011950745082689451,0.008777555989643915 +0052_cis-4-hydroxy-proline_unlabeled_positive_M+H132p0655_13p67,13.243738651275635,13.67383331,13.642351222854888,13.638660690201752,-0.3986125715792532,-0.39492203892611677 +0053_4-guanidinobutanoic_acid_unlabeled_positive_M+H146p0924_13p86,13.88631010055542,13.86132281,13.829462261117298,13.829880536300239,0.056847839438121994,0.056429564255180864 +0054_maltose_unlabeled_positive_M+Na365p1054_14p07,13.663294792175293,14.0677773,14.035500007112862,14.040616184637264,-0.3722052149375692,-0.3773213924619707 +0055_serine_unlabeled_positive_M+H106p0499_14p31,14.328335285186768,14.31261357,14.279842056582272,14.290765198503442,0.0484932286044959,0.037570086683325954 +0056_glutamine_unlabeled_positive_M+H147p0764_14p31,14.320549488067627,14.31275825,14.27998644453475,14.290913093540711,0.04056304353287743,0.02963639452691602 +0057_asparagine_unlabeled_positive_M+H133p0608_14p37,14.360477924346924,14.36808894,14.335205445351729,14.347479873380236,0.025272478995194803,0.012998050966688268 +0058_gamma-Aminobutyric_acid_unlabeled_positive_M+H104p0706_14p39,14.392436504364014,14.38565257,14.352733621836048,14.365438606423183,0.0397028825279655,0.02699789794083074 +0059_alpha-ketoglutaric_acid_unlabeled_positive_M+H147p0288_14p51,,14.50646265,14.473299837551405,14.4890020294111,, +0060_mannosamine_unlabeled_positive_M+H180p0866_14p52,14.69622278213501,14.52081396,14.487622178346628,14.503684552870762,0.20860060378838163,0.19253822926424746 +0061_cysteic_acid_unlabeled_positive_M+H170p0118_14p54,14.559170722961426,14.53906337,14.505834750532143,14.522356409474206,0.05333597242928256,0.03681431348721986 
+0062_N-acetyl-aspartic_acid_unlabeled_positive_M+H176p0553_14p82,14.634858131408691,14.82464623,14.790841139927538,14.81473516440485,-0.15598300851884694,-0.1798770329961581 +0063_citrulline_unlabeled_positive_M+H176p1030_15p09,15.141581535339355,15.08943009,15.055090513677682,15.086130811048331,0.08649102166167388,0.055450724291024045 +0064_N-alpha-acetyl-lysine_unlabeled_positive_M+H189p1234_15p13,15.190487384796143,15.12986101,15.095439820807439,15.127597632861947,0.09504756398870384,0.0628897519341951 +0065_N-acetyl-glutamic_acid_unlabeled_positive_M+H190p0710_15p16,15.118823528289795,15.15757256,15.12309543294764,15.156023222818638,-0.004271904657844772,-0.03719969452884264 +0066_raffinose_unlabeled_positive_M+H505p1763_15p53,15.543988227844238,15.53249857,15.497264626436776,15.540931897821977,0.04672360140746257,0.0030563300222610223 +0067_glutamic_acid_unlabeled_positive_M+H148p0604_15p94,16.006930351257324,15.93538957,15.899342360478306,15.955218576615934,0.10758799077901848,0.05171177464139021 +0068_Aspartic_acid_unlabeled_positive_M+H134p0448_16p13,16.24086856842041,16.13036002,16.093919247877274,16.15595235329566,0.14694932054313625,0.08491621512474978 +0069_arginine_unlabeled_positive_M+H175p1190_16p94,16.976414680480957,16.93991539,16.901840469127567,16.991172768800162,0.07457421135339004,-0.014758088319204887 +0070_lysine_unlabeled_positive_M+H147p1128_17p01,17.048407554626465,17.01131041,16.973091372879324,17.06496535505383,0.07531618174714083,-0.0165578004273641 +0071_ornithine_unlabeled_positive_M+H133p0972_17p04,17.070573806762695,17.03725065,16.998979250542746,17.09178209803637,0.07159455621994937,-0.021208291273673296""" + + out_files["QC_Measured_RTs"] = tmp_path / experiment / "root0/data_QC/QC_Measured_RTs.csv" + expected[ + "QC_Measured_RTs" + ] = """,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5,mean,median,min,max,standard deviation,standard error,#NaNs +0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,0.760883629322052,0.7883098125457764,0.7816191911697388,0.7698802947998047,0.775173231959343,0.7757497429847717,0.760883629322052,0.7883098125457764,0.012197377682005251,0.006098688841002626,0 +0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.2340805530548096,1.2472213506698608,1.2510555982589722,1.2544312477111816,1.246697187423706,1.2491384744644165,1.2340805530548096,1.2544312477111816,0.008911895485740983,0.004455947742870492,0 +0002_thymine_unlabeled_positive_M+H127p0502_1p26,1.6460527181625366,1.656285285949707,1.6621986627578735,1.6544222831726074,1.6547397375106812,1.6553537845611572,1.6460527181625366,1.6621986627578735,0.0066730644692519535,0.0033365322346259768,0 +0003_benzoic_acid_unlabeled_positive_M+H123p0441_1p27,1.6460527181625366,0.772739052772522,1.7125025987625122,0.8440630435943604,1.2438393533229828,1.2450578808784485,0.772739052772522,1.7125025987625122,0.5043733469961971,0.25218667349809853,0 +0004_2_4-dihydroxypteridine_unlabeled_positive_M+H165p0407_1p27,,,,,,,,,,,0 
+0005_uracil_unlabeled_positive_M+H113p0346_1p39,1.8791532516479492,1.8892816305160522,1.878132700920105,1.8925062417984009,1.8847684562206268,1.8842174410820007,1.878132700920105,1.8925062417984009,0.00720661359839974,0.00360330679919987,0 +0006_thymidine_unlabeled_positive_M+H243p0975_1p60,1.6460527181625366,1.6672121286392212,1.6621986627578735,1.6544222831726074,1.6574714481830597,1.6583104729652405,1.6460527181625366,1.6672121286392212,0.009254048049870576,0.004627024024935288,0 +0007_2-hydroxyphenylacetic_acid_unlabeled_positive_M+H153p0546_1p62,1.4245437383651733,1.2100298404693604,1.6621986627578735,1.1872817277908325,1.37101349234581,1.3172867894172668,1.1872817277908325,1.6621986627578735,0.22160579729537735,0.11080289864768868,0 +0008_deoxyuridine_unlabeled_positive_M+H229p0819_1p88,1.940125823020935,1.9492311477661133,1.910657525062561,1.9322705268859863,1.933071255683899,1.9361981749534607,1.910657525062561,1.9492311477661133,0.016471445184473014,0.008235722592236507,0 +0009_acetylcholine_unlabeled_positive_M146p1176_1p96,1.8015246391296387,1.7716032266616821,1.8446100950241089,1.8571667671203613,1.8187261819839478,1.8230673670768738,1.7716032266616821,1.8571667671203613,0.03942977527526356,0.01971488763763178,0 +0010_pyridoxine_unlabeled_positive_M+H170p0812_2p16,2.140230655670166,2.1597485542297363,2.2091991901397705,2.2232186794281006,2.1830992698669434,2.1844738721847534,2.140230655670166,2.2232186794281006,0.03947043984935922,0.01973521992467961,0 +0011_salicylic_acid_unlabeled_positive_M+H139p0390_2p20,,1.8414007425308228,1.8282268047332764,1.78067147731781,1.816766341527303,1.8282268047332764,1.78067147731781,1.8414007425308228,0.03194554078427394,0.01844376657120873,0 +0012_2deoxyadenosine_unlabeled_positive_M+H252p1091_2p23,2.2475903034210205,2.2757790088653564,2.322835922241211,2.31164288520813,2.2894620299339294,2.293710947036743,2.2475903034210205,2.322835922241211,0.03438155307316169,0.017190776536580844,0 +0013_adenine_unlabeled_positive_M+H136p0618_2p56,2.593029737472534,2.6252453327178955,2.7041127681732178,2.7198987007141113,2.6605716347694397,2.6646790504455566,2.593029737472534,2.7198987007141113,0.06117021758664283,0.030585108793321415,0 +0014_xanthine_unlabeled_positive_M+H153p0407_2p73,2.747974395751953,2.775257110595703,2.7665767669677734,2.788285493850708,2.7695234417915344,2.7709169387817383,2.747974395751953,2.788285493850708,0.01691088242483735,0.008455441212418676,0 +0015_ribose_unlabeled_positive_M+H151p0601_2p75,2.763174295425415,2.769817352294922,2.835042953491211,2.816185474395752,2.796055018901825,2.793001413345337,2.763174295425415,2.835042953491211,0.03509440319693131,0.017547201598465654,0 +0016_rhamnose_unlabeled_positive_M+H165p0757_2p80,3.136334180831909,2.960082769393921,,,3.048208475112915,3.048208475112915,2.960082769393921,3.136334180831909,0.12462856822150174,0.08812570571899414,0 +0017_uridine_unlabeled_positive_M+H245p0768_2p89,2.919912815093994,2.943791389465332,2.917609214782715,2.920497179031372,2.9254526495933533,2.920204997062683,2.917609214782715,2.943791389465332,0.012289227050439883,0.0061446135252199415,0 +0018_adenosine_unlabeled_positive_M+H268p1040_3p09,3.075451612472534,3.078127861022949,3.1469616889953613,3.1399998664855957,3.11013525724411,3.1090638637542725,3.075451612472534,3.1469616889953613,0.03862429972687937,0.019312149863439685,0 
+0019_hypoxanthine_unlabeled_positive_M+H137p0458_3p10,3.136334180831909,3.1404755115509033,3.1469616889953613,3.161953926086426,3.14643132686615,3.1437186002731323,3.136334180831909,3.161953926086426,0.011234715018600752,0.005617357509300376,0 +0020_5-methylcytosine_unlabeled_positive_M+H126p0662_4p42,4.396528244018555,4.399042129516602,4.534496307373047,4.545421600341797,4.4688720703125,4.466769218444824,4.396528244018555,4.545421600341797,0.08221155813284872,0.04110577906642436,0 +0021_2-oxovaleric_acid_unlabeled_positive_M+H117p0546_4p45,,,,3.9647421836853027,3.9647421836853027,3.9647421836853027,3.9647421836853027,3.9647421836853027,,,0 +0022_cytosine_unlabeled_positive_M+H112p0505_4p83,4.836014747619629,4.8464555740356445,4.910716533660889,4.921308994293213,4.878623962402344,4.878586053848267,4.836014747619629,4.921308994293213,0.04359776550856559,0.021798882754282795,0 +0023_lactic_acid_unlabeled_positive_M+H91p0390_5p06,,5.1236891746521,,,5.1236891746521,5.1236891746521,5.1236891746521,5.1236891746521,,,0 +0024_inosine_unlabeled_positive_M+H269p0880_5p43,5.432004451751709,5.439838886260986,5.39183235168457,5.441697120666504,5.426343202590942,5.435921669006348,5.39183235168457,5.441697120666504,0.023387495592980652,0.011693747796490326,0 +0025_deoxycytidine_unlabeled_positive_M+H228p0979_5p59,5.601799011230469,5.607846260070801,5.6718220710754395,5.671968460083008,5.638358950614929,5.63983416557312,5.601799011230469,5.671968460083008,0.03880306263547012,0.01940153131773506,0 +0026_nicotinic_acid_unlabeled_positive_M+H124p0393_5p63,5.478054046630859,5.553140640258789,5.6718220710754395,5.700730800628662,5.6009368896484375,5.612481355667114,5.478054046630859,5.700730800628662,0.10387204978316382,0.05193602489158191,0 +0027_phenylacetic_acid_unlabeled_positive_M+H137p0597_5p88,,6.089038848876953,6.223822593688965,6.1643900871276855,6.159083843231201,6.1643900871276855,6.089038848876953,6.223822593688965,0.06754836515123151,0.03899906680338265,0 +0028_2_deoxyguanosine_unlabeled_positive_M+H268p1040_6p87,6.925775527954102,6.942814826965332,6.964536190032959,6.967518329620361,6.9501612186431885,6.9536755084991455,6.925775527954102,6.967518329620361,0.019634497574413202,0.009817248787206601,0 +0029_cytidine_unlabeled_positive_M+H244p0928_6p93,6.8023223876953125,6.832950592041016,6.818937301635742,6.858341693878174,6.828137993812561,6.825943946838379,6.8023223876953125,6.858341693878174,0.023710214782490673,0.011855107391245337,0 +0030_N-acetyl-mannosamine_unlabeled_positive_M+Na244p0792_7p15,6.656971454620361,6.657993316650391,6.6668381690979,6.696530342102051,6.669583320617676,6.6624157428741455,6.656971454620361,6.696530342102051,0.018502839238360106,0.009251419619180053,0 +0031_betaine_unlabeled_positive_M118p0863_7p91,7.914710998535156,7.898166656494141,8.152311325073242,8.210945129394531,8.044033527374268,8.0335111618042,7.898166656494141,8.210945129394531,0.1608156551761606,0.0804078275880803,0 +0032_guanosine_unlabeled_positive_M+H284p0989_8p57,8.49014663696289,8.50051498413086,8.451272010803223,8.47706413269043,8.47974944114685,8.48360538482666,8.451272010803223,8.50051498413086,0.021271924977460343,0.010635962488730171,0 +0033_phenylalanine_unlabeled_positive_M+H166p0863_8p98,9.065049171447754,9.064212799072266,9.132002830505371,9.117706298828125,9.094742774963379,9.09137773513794,9.064212799072266,9.132002830505371,0.03525821268363917,0.017629106341819585,0 
+0034_leucine_unlabeled_positive_M+H132p1019_9p32,9.267672538757324,9.295421600341797,9.358003616333008,9.375864028930664,9.324240446090698,9.326712608337402,9.267672538757324,9.375864028930664,0.051105772604218114,0.025552886302109057,0 +0035_urocanic_acid_unlabeled_positive_M+H139p0502_9p35,8.894339561462402,8.865837097167969,8.890594482421875,8.856619834899902,8.876847743988037,8.878215789794922,8.856619834899902,8.894339561462402,0.018487285230656875,0.009243642615328437,0 +0036_mannitol_unlabeled_positive_M+H183p0863_9p53,9.065049171447754,9.164083480834961,9.148656845092773,9.08488941192627,9.11566972732544,9.116773128509521,9.065049171447754,9.164083480834961,0.0481037419227615,0.02405187096138075,0 +0037_isoleucine_unlabeled_positive_M+H132p1019_9p71,9.267672538757324,9.295421600341797,9.358003616333008,9.375864028930664,9.324240446090698,9.326712608337402,9.267672538757324,9.375864028930664,0.051105772604218114,0.025552886302109057,0 +0038_xanthosine_unlabeled_positive_M+H285p0830_9p78,9.526045799255371,9.48938274383545,9.510781288146973,9.504246711730957,9.507614135742188,9.507513999938965,9.48938274383545,9.526045799255371,0.015203949810787694,0.007601974905393847,0 +0039_tryptophan_unlabeled_positive_M+H205p0972_10p16,10.33144474029541,10.327781677246094,10.347268104553223,10.343076705932617,10.337392807006836,10.337260723114014,10.327781677246094,10.347268104553223,0.009266094493898432,0.004633047246949216,0 +0040_methionine_unlabeled_positive_M+H150p0583_10p44,10.443852424621582,10.439347267150879,10.46941089630127,10.474287986755371,10.456724643707275,10.456631660461426,10.439347267150879,10.474287986755371,0.01767370234818388,0.00883685117409194,0 +0041_1-methyladenosine_unlabeled_positive_M+H282p1197_10p78,11.050666809082031,11.034899711608887,,,11.042783260345459,11.042783260345459,11.034899711608887,11.050666809082031,0.011149021542889777,0.007883548736572266,0 +0042_proline_unlabeled_positive_M+H116p0706_10p92,10.827990531921387,10.806012153625488,10.871739387512207,10.887017250061035,10.84818983078003,10.849864959716797,10.806012153625488,10.887017250061035,0.03763472791114847,0.018817363955574234,0 +0043_pipecolic_acid_unlabeled_positive_M+H130p0863_10p97,10.955867767333984,10.943641662597656,11.027701377868652,11.047686576843262,10.993724346160889,10.991784572601318,10.943641662597656,11.047686576843262,0.05166480677489114,0.02583240338744557,0 +0044_valine_unlabeled_positive_M+H118p0863_11p12,11.034863471984863,11.012262344360352,11.061145782470703,11.047686576843262,11.038989543914795,11.041275024414062,11.012262344360352,11.061145782470703,0.020799879441150262,0.010399939720575131,0 +0045_5-oxo-proline_unlabeled_positive_M+H130p0499_11p65,11.499093055725098,11.508075714111328,11.50047492980957,11.510473251342773,11.504529237747192,11.50427532196045,11.499093055725098,11.510473251342773,0.00559458905420339,0.002797294527101695,0 +0046_taurine_unlabeled_positive_M+H126p0219_12p16,12.104499816894531,12.101568222045898,12.045284271240234,12.049221992492676,12.075143575668335,12.075395107269287,12.045284271240234,12.104499816894531,0.03226741065106046,0.01613370532553023,0 +0047_ectoine_unlabeled_positive_M+H143p0815_12p50,12.293835639953613,12.278909683227539,12.455086708068848,12.443739891052246,12.367892980575562,12.36878776550293,12.278909683227539,12.455086708068848,0.09444225193198486,0.04722112596599243,0 
+0048_carnitine_unlabeled_positive_M+H161p1046_13p29,,13.355096817016602,,13.57872200012207,13.466909408569336,13.466909408569336,13.355096817016602,13.57872200012207,0.15812688341796033,0.11181259155273438,0 +0049_alanine_unlabeled_positive_M+H90p0550_13p41,,13.68701457977295,,,13.68701457977295,13.68701457977295,13.68701457977295,13.68701457977295,,,0 +0050_sucrose_unlabeled_positive_M+H343p1235_13p45,,,13.34303092956543,13.314632415771484,13.328831672668457,13.328831672668457,13.314632415771484,13.34303092956543,0.02008078167931844,0.014199256896972656,0 +0051_threonine_unlabeled_positive_M+H120p0655_13p49,13.462236404418945,13.455792427062988,13.457077980041504,13.46524429321289,13.460087776184082,13.459657192230225,13.455792427062988,13.46524429321289,0.004423993974227501,0.0022119969871137505,0 +0052_cis-4-hydroxy-proline_unlabeled_positive_M+H132p0655_13p67,13.227526664733887,13.222009658813477,13.259950637817383,13.277363777160645,13.246712684631348,13.243738651275635,13.222009658813477,13.277363777160645,0.026413858243097502,0.013206929121548751,0 +0053_4-guanidinobutanoic_acid_unlabeled_positive_M+H146p0924_13p86,13.855559349060059,13.86020565032959,13.917991638183594,13.91241455078125,13.886542797088623,13.88631010055542,13.855559349060059,13.917991638183594,0.03322647837942044,0.01661323918971022,0 +0054_maltose_unlabeled_positive_M+Na365p1054_14p07,13.650632858276367,13.675956726074219,13.638050079345703,13.978111267089844,13.735687732696533,13.663294792175293,13.638050079345703,13.978111267089844,0.16238268195258865,0.08119134097629432,0 +0055_serine_unlabeled_positive_M+H106p0499_14p31,14.32981014251709,14.33325481414795,14.326860427856445,14.324366569519043,14.328572988510132,14.328335285186768,14.324366569519043,14.33325481414795,0.0038330521372662756,0.0019165260686331378,0 +0056_glutamine_unlabeled_positive_M+H147p0764_14p31,14.314238548278809,14.306297302246094,14.326860427856445,14.336118698120117,14.320878744125366,14.320549488067627,14.306297302246094,14.336118698120117,0.013225573537406289,0.006612786768703144,0 +0057_asparagine_unlabeled_positive_M+H133p0608_14p37,14.36136245727539,14.355669021606445,14.359593391418457,14.37166690826416,14.362072944641113,14.360477924346924,14.355669021606445,14.37166690826416,0.006824156717503251,0.0034120783587516254,0 +0058_gamma-Aminobutyric_acid_unlabeled_positive_M+H104p0706_14p39,14.37686824798584,14.366856575012207,14.408004760742188,14.413251876831055,14.391245365142822,14.392436504364014,14.366856575012207,14.413251876831055,0.022852268153811535,0.011426134076905767,0 +0059_alpha-ketoglutaric_acid_unlabeled_positive_M+H147p0288_14p51,,,,,,,,,,,0 +0060_mannosamine_unlabeled_positive_M+H180p0866_14p52,14.704928398132324,14.712645530700684,14.6831636428833,14.687517166137695,14.697063684463501,14.69622278213501,14.6831636428833,14.712645530700684,0.014011838106020863,0.007005919053010431,0 +0061_cysteic_acid_unlabeled_positive_M+H170p0118_14p54,14.579874992370605,14.584083557128906,14.537906646728516,14.538466453552246,14.560082912445068,14.559170722961426,14.537906646728516,14.584083557128906,0.025343082088378755,0.012671541044189378,0 +0062_N-acetyl-aspartic_acid_unlabeled_positive_M+H176p0553_14p82,,14.561990737915039,14.634858131408691,14.699416160583496,14.632088343302408,14.634858131408691,14.561990737915039,14.699416160583496,0.06875456707387731,0.03969546780811925,0 
+0063_citrulline_unlabeled_positive_M+H176p1030_15p09,15.128581047058105,15.128291130065918,15.154582023620605,15.162788391113281,15.143560647964478,15.141581535339355,15.128291130065918,15.162788391113281,0.017783170397114904,0.008891585198557452,0 +0064_N-alpha-acetyl-lysine_unlabeled_positive_M+H189p1234_15p13,15.159500122070312,15.161605834960938,15.219368934631348,15.223294258117676,15.190942287445068,15.190487384796143,15.159500122070312,15.223294258117676,0.03513764118653772,0.01756882059326886,0 +0065_N-acetyl-glutamic_acid_unlabeled_positive_M+H190p0710_15p16,15.144072532653809,15.128291130065918,15.105835914611816,15.109355926513672,15.121888875961304,15.118823528289795,15.105835914611816,15.144072532653809,0.017775225003882934,0.008887612501941467,0 +0066_raffinose_unlabeled_positive_M+H505p1763_15p53,15.555009841918945,15.555731773376465,15.516404151916504,15.532966613769531,15.540028095245361,15.543988227844238,15.516404151916504,15.555731773376465,0.018964998104185705,0.009482499052092853,0 +0067_glutamic_acid_unlabeled_positive_M+H148p0604_15p94,16.00358009338379,16.012876510620117,16.00017547607422,16.01028060913086,16.006728172302246,16.006930351257324,16.00017547607422,16.012876510620117,0.005867142979506786,0.002933571489753393,0 +0068_Aspartic_acid_unlabeled_positive_M+H134p0448_16p13,16.245359420776367,16.244237899780273,16.233556747436523,16.237499237060547,16.240163326263428,16.24086856842041,16.233556747436523,16.245359420776367,0.00560790521224537,0.002803952606122685,0 +0069_arginine_unlabeled_positive_M+H175p1190_16p94,16.963918685913086,16.961685180664062,16.988910675048828,16.9918212890625,16.97658395767212,16.976414680480957,16.961685180664062,16.9918212890625,0.01598443924833885,0.007992219624169425,0 +0070_lysine_unlabeled_positive_M+H147p1128_17p01,17.043212890625,17.035064697265625,17.05360221862793,17.05995750427246,17.047959327697754,17.048407554626465,17.035064697265625,17.05995750427246,0.011024194876641502,0.005512097438320751,0 +0071_ornithine_unlabeled_positive_M+H133p0972_17p04,17.058874130249023,17.06319236755371,17.085508346557617,17.07795524597168,17.071382522583008,17.070573806762695,17.058874130249023,17.085508346557617,0.0124669979531353,0.00623349897656765,0""" + + subprocess.run( + [ + "docker", + "run", + "--rm", + "-v", + f"{os.getcwd()}:/src", + "-v", + f"{tmp_path}:/out", + image, + "/bin/bash", + "-c", + """\ + jq -M '(.cells[] | select(.source[] | contains("predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path)")).source) \ + = ["predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path, model_only=True)"]' \ + /src/notebooks/reference/RT_Prediction.ipynb > /out/Remove.ipynb && \ + papermill \ + -p source_atlas HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0 \ + -p experiment 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583 \ + -p metatlas_repo_path /src \ + -p project_directory /out \ + -p max_cpus 2 \ + /out/Remove.ipynb \ + /out/Remove-done.ipynb + """, + ], + check=True, + ) + files = subprocess.check_output(f"find {str(tmp_path)} -type f", shell=True, text=True).strip() + print(files) + num_files_created = int( + subprocess.check_output(f"find {str(tmp_path)} -type f | wc -l", shell=True, text=True).strip() + ) + for _, path in out_files.items(): + os.system(f"cat {path}") + assert num_files_created == 8 + for metric_name, path in out_files.items(): + with open(path, "r") as handle: + expected_lines = expected[metric_name].split("\n") + num = None + for num, 
line in enumerate(handle.readlines()):
                clean_line = line.rstrip("\n")
                assert expected_lines[num] == clean_line
        assert len(expected_lines) == num + 1

From 4d69b2a9f8499ac8e4bfa7de0c17fbe9d1aeefec Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 19 Aug 2021 16:50:32 -0700
Subject: [PATCH 089/177] fix rclone shell calls

---
 metatlas/io/rclone.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/metatlas/io/rclone.py b/metatlas/io/rclone.py
index 65385945..639c0f8d 100644
--- a/metatlas/io/rclone.py
+++ b/metatlas/io/rclone.py
@@ -17,7 +17,7 @@ def __init__(self, rclone_path):
     def config_file(self):
         """Returns path to config file or None"""
         try:
-            result = subprocess.check_output(f"{self.rclone_path} config file", text=True)
+            result = subprocess.check_output([self.rclone_path, "config", "file"], text=True)
         except (subprocess.CalledProcessError, FileNotFoundError):
             return None
         return result.split("\n")[1]
@@ -50,7 +50,7 @@ def copy_to_drive(self, source, drive, dest_path=None):
         """
         dest = f"{drive}:" if dest_path is None else f"{drive}:{dest_path}"
         try:
-            subprocess.check_output(f"{self.rclone_path} copy {source} {dest}", text=True)
+            subprocess.check_output([self.rclone_path, "copy", source, dest], text=True)
         except subprocess.CalledProcessError as err:
             logger.exception(err)
             raise err

From 63e93bf5588063a92b6747b661ceb74778fb200f Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 19 Aug 2021 20:54:01 -0700
Subject: [PATCH 090/177] add google_folder to notebook parameters

---
 notebooks/reference/RT_Prediction.ipynb | 4 ++++
 notebooks/reference/Targeted.ipynb      | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb
index 812da8eb..202291d6 100644
--- a/notebooks/reference/RT_Prediction.ipynb
+++ b/notebooks/reference/RT_Prediction.ipynb
@@ -54,6 +54,10 @@
     "# your data being purged. Each project will take on the order of 100 MB.\n",
     "project_directory = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects\"\n",
     "\n",
+    "# ID from Google Drive URL for base output folder.\n",
+    "# The default value is the ID that corresponds to 'JGI_Metabolomics_Projects'.\n",
+    "google_folder = \"0B-ZDcHbPi-aqZzE5V3hOZFc0dms\"\n",
+    "\n",
     "# maximum number of CPUs to use\n",
     "# when running on jupyter.nersc.gov, you are not allowed to set this above 4\n",
     "max_cpus = 4\n",
diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb
index 2929e227..b6b86699 100644
--- a/notebooks/reference/Targeted.ipynb
+++ b/notebooks/reference/Targeted.ipynb
@@ -88,6 +88,10 @@
     "# your data being purged. 
Each project will take on the order of 100 MB.\n",
     "project_directory = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects\"\n",
     "\n",
+    "# ID from Google Drive URL for base output folder.\n",
+    "# The default value is the ID that corresponds to 'JGI_Metabolomics_Projects'.\n",
+    "google_folder = \"0B-ZDcHbPi-aqZzE5V3hOZFc0dms\"\n",
+    "\n",
     "# maximum number of CPUs to use\n",
     "# when running on jupyter.nersc.gov, you are not allowed to set this above 4\n",
     "max_cpus = 4\n",

From 3d0d14bdbfc86a749aad14112642c160f233fe63 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Fri, 20 Aug 2021 20:02:10 -0700
Subject: [PATCH 091/177] Add google_folder parameter

---
 metatlas/datastructures/metatlas_dataset.py | 9 +++++----
 metatlas/tools/predict_rt.py                | 7 ++++---
 notebooks/reference/RT_Prediction.ipynb     | 8 ++++----
 notebooks/reference/Targeted.ipynb          | 7 ++++---
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 0ff66e31..a7bf783b 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -21,7 +21,7 @@
 from metatlas.tools import parallel

 MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab"
-DEFAULT_GROUPS_CONTROLLED_VOCAB = ["QC", "InjBl", "InjBL", "ISTD"]
+DEFAULT_GROUPS_CONTROLLED_VOCAB = ["QC", "InjBl", "ISTD"]  # these are case insensitive
 OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"]
 POLARITIES = ["positive", "negative", "fast-polarity-switching"]
 SHORT_POLARITIES = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"}
@@ -39,6 +39,7 @@ class AnalysisIdentifiers(HasTraits):
     analysis_number = Int(default_value=0)
     username = Unicode(default_value=getpass.getuser())
     project_directory = Unicode()
+    google_folder = Unicode()
     exclude_files = List(trait=Unicode(), allow_none=True, default_value=[])
     groups_controlled_vocab = List(
         trait=Unicode(), allow_none=True, default_value=DEFAULT_GROUPS_CONTROLLED_VOCAB
@@ -293,11 +294,11 @@ def existing_groups(self):

     def group_name(self, base_filename):
         """Returns dict with keys group and short_name corresponding to base_filename"""
-        indices = [
-            i for i, s in enumerate(self.groups_controlled_vocab) if s.lower() in base_filename.lower()
-        ]
         tokens = base_filename.split("_")
         prefix = "_".join(tokens[:11])
+        indices = [
+            i for i, s in enumerate(self.groups_controlled_vocab) if s.lower() in tokens[12].lower()
+        ]
         suffix = self.groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12]
         group_name = f"{prefix}_{self.analysis}_{suffix}"
         short_name = f"{tokens[9]}_{suffix}"  # Prepending POL to short_name
diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index 02b3d058..44cc2346 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -488,21 +488,22 @@ def write_notebooks(ids, atlases, repo_dir, use_poly_model):
         notebook.create_notebook(source, dest, parameters)


-def get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_number=0, polarity="positive"):
+def get_analysis_ids_for_rt_prediction(experiment, project_directory, google_folder, analysis_number=0, polarity="positive"):
     """
     Simplified interface for generating an AnalysisIds instance for use in rt prediction
     inputs:
        experiment: name of experiment as given in LCMS run names
        project_directory: directory where per-experiment output directory will be created
+       google_folder: id 
from URL of base export folder on Google Drive analysis_number: integer, defaults to 0, increment if redoing analysis polarity: defaults to 'positive', set to 'negative' if you only have neg mode data Returns an AnalysisIds instance """ - ids = mads.AnalysisIdentifiers( + return mads.AnalysisIdentifiers( experiment=experiment, output_type="data_QC", polarity=polarity, analysis_number=analysis_number, project_directory=project_directory, + google_folder=google_folder, ) - return ids diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb index 202291d6..3184e04e 100644 --- a/notebooks/reference/RT_Prediction.ipynb +++ b/notebooks/reference/RT_Prediction.ipynb @@ -116,7 +116,7 @@ "metadata": {}, "outputs": [], "source": [ - "ids = predict_rt.get_analysis_ids_for_rt_prediction(experiment, project_directory, analysis_number)" + "ids = predict_rt.get_analysis_ids_for_rt_prediction(experiment, project_directory, google_folder, analysis_number)" ] }, { @@ -132,9 +132,9 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "display_name": "Metatlas Targeted", + "language": "python", + "name": "metatlas-targeted" }, "language_info": { "codemirror_mode": { diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index b6b86699..8676fdcb 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -167,6 +167,7 @@ " polarity=polarity,\n", " analysis_number=analysis_number,\n", " project_directory=project_directory,\n", + " google_folder=google_folder,\n", " groups_controlled_vocab=groups_controlled_vocab,\n", " exclude_files=exclude_files,\n", ")" @@ -231,9 +232,9 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "display_name": "Metatlas Targeted", + "language": "python", + "name": "metatlas-targeted" }, "language_info": { "codemirror_mode": { From 07acce982bb848cc6aa4e799e5aa95abcee2cfa3 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 23 Aug 2021 10:38:51 -0700 Subject: [PATCH 092/177] improved group handling revert group_controlled_vocab to search on whole file name --- metatlas/datastructures/metatlas_dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index a7bf783b..4a518a34 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -21,7 +21,8 @@ from metatlas.tools import parallel MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" -DEFAULT_GROUPS_CONTROLLED_VOCAB = ["QC", "InjBl", "ISTD"] # these are case insensitive +DEFAULT_GROUPS_CONTROLLED_VOCAB = ["QC", "InjBl", "ISTD"] +DEFAULT_EXCLUDE_GROUPS = ["InjBl", "InjBL", "QC"] OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"] POLARITIES = ["positive", "negative", "fast-polarity-switching"] SHORT_POLARITIES = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"} @@ -41,11 +42,11 @@ class AnalysisIdentifiers(HasTraits): project_directory = Unicode() google_folder = Unicode() exclude_files = List(trait=Unicode(), allow_none=True, default_value=[]) + include_groups = List(allow_none=True, default_value=None) + exclude_groups = List(allow_none=True, default_value=DEFAULT_EXCLUDE_GROUPS) groups_controlled_vocab = List( trait=Unicode(), 
allow_none=True, default_value=DEFAULT_GROUPS_CONTROLLED_VOCAB
     )
-    include_groups = List(allow_none=True, default_value=None)
-    exclude_groups = List(allow_none=True, default_value=["InjBl", "InjBL"])
     _lcmsruns = List(allow_none=True, default_value=None)
     _all_groups = List(allow_none=True, default_value=None)
     _groups = List(allow_none=True, default_value=None)
@@ -297,7 +298,7 @@ def group_name(self, base_filename):
         tokens = base_filename.split("_")
         prefix = "_".join(tokens[:11])
         indices = [
-            i for i, s in enumerate(self.groups_controlled_vocab) if s.lower() in tokens[12].lower()
+            i for i, s in enumerate(self.groups_controlled_vocab) if s.lower() in base_filename.lower()
         ]
         suffix = self.groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12]
         group_name = f"{prefix}_{self.analysis}_{suffix}"

From 089460c05dd8c8731e452c9a21f7329dd0b86ee5 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Mon, 23 Aug 2021 11:42:40 -0700
Subject: [PATCH 093/177] fix cloning of source_atlas

---
 metatlas/datastructures/metatlas_dataset.py | 22 ++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 4a518a34..38cdaea1 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -429,7 +429,12 @@ def write_data_source_files(self):
         )

     def _get_atlas(self):
-        """Copy source atlas from database into current analysis atlas"""
+        """
+        Copy source atlas from database into current analysis atlas
+        If the atlas does not yet exist, it will be copied from source_atlas and there will be
+        an additional side effect that all mz_tolerances in the resulting atlas
+        get their value from source_atlas' atlas.compound_identifications[0].mz_references[0].mz_tolerance
+        """
         atlases = metob.retrieve("Atlas", name=self.ids.atlas, username=self.ids.username)
         if len(atlases) == 1:
             logger.warning(
@@ -454,11 +459,18 @@ def _get_atlas(self):
             raise err
         else:
             logger.info("Retrieving source atlas: %s", self.ids.source_atlas)
-            source = get_atlas(self.ids.source_atlas, self.ids.username)
+            source_atlas = get_atlas(self.ids.source_atlas, self.ids.username)
+            source_atlas_df = ma_data.make_atlas_df(source_atlas)
             logger.info("Cloning source atlas")
-            self.atlas = source.clone()
-            self.atlas.name = self.ids.atlas
-            self.store_atlas()
+            self.atlas = dp.make_atlas_from_spreadsheet(
+                source_atlas_df,
+                self.ids.atlas,
+                filetype="dataframe",
+                sheetname="",
+                polarity=self.ids.polarity,
+                store=True,
+                mz_tolerance=source_atlas.compound_identifications[0].mz_references[0].mz_tolerance,
+            )

From ff9b4c0d4b9e2189f2371a188bec4af9070714cb Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Mon, 23 Aug 2021 11:43:10 -0700
Subject: [PATCH 094/177] format with blacken

---
 metatlas/tools/predict_rt.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index 44cc2346..10db8949 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -488,7 +488,9 @@ def write_notebooks(ids, atlases, repo_dir, use_poly_model):
         notebook.create_notebook(source, dest, parameters)


-def get_analysis_ids_for_rt_prediction(experiment, project_directory, google_folder, analysis_number=0, polarity="positive"):
+def get_analysis_ids_for_rt_prediction(
+    experiment, project_directory, google_folder, analysis_number=0, 
polarity="positive" +): """ Simplified interface for generating an AnalysisIds instance for use in rt prediction inputs: From 4f386306ca67d1c95989cc4f259a1626bce1fa1d Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 11:47:19 -0700 Subject: [PATCH 095/177] Add Metatlas Targeted kernel to ci images --- docker/Dockerfile.ci01 | 3 +++ docker/Dockerfile.ci02 | 7 +++++-- docker/kernel.json | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 docker/kernel.json diff --git a/docker/Dockerfile.ci01 b/docker/Dockerfile.ci01 index d42d2450..a14598f8 100644 --- a/docker/Dockerfile.ci01 +++ b/docker/Dockerfile.ci01 @@ -29,6 +29,9 @@ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG ADD $BASE_DATA_URL/meta_atlas.sqlite3 /work/root_workspace.db +RUN mkdir -p /root/.local/share/jupyter/kernels/metatlas-targeted +COPY kernel.json /root/.local/share/jupyter/kernels/metatlas-targeted/kernel.json + WORKDIR /work RUN apt-get update && apt-get install -y libxrender1 && \ diff --git a/docker/Dockerfile.ci02 b/docker/Dockerfile.ci02 index 387b4a08..c3c3ed52 100644 --- a/docker/Dockerfile.ci02 +++ b/docker/Dockerfile.ci02 @@ -13,8 +13,8 @@ EXPOSE 8888 RUN apt-get update && apt-get install -y libxrender1 && \ rm -rf /var/lib/apt/lists/* - ADD https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 /usr/local/bin/jq - RUN chmod +x /usr/local/bin/jq +ADD https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 /usr/local/bin/jq +RUN chmod +x /usr/local/bin/jq RUN mkdir -p /io /src /work $REFS_DIR $H5_DIR ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5 $H5_DIR/ @@ -29,6 +29,9 @@ RUN pip install --quiet -r requirements.txt ADD $BASE_DATA_URL/meta_atlas_rt_predict.sqlite3 /work/root_workspace.db +RUN mkdir -p /root/.local/share/jupyter/kernels/metatlas-targeted +COPY kernel.json /root/.local/share/jupyter/kernels/metatlas-targeted/kernel.json + WORKDIR /work CMD ["/usr/local/bin/jupyter", "nbclassic", "--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] diff --git a/docker/kernel.json b/docker/kernel.json new file mode 100644 index 00000000..fbc60568 --- /dev/null +++ b/docker/kernel.json @@ -0,0 +1,14 @@ +{ + "argv": [ + "/usr/local/bin/python", + "-m", + "IPython.kernel", + "-f", + "{connection_file}" + ], + "env": { + "PATH": "/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin" + }, + "display_name": "Metatlas Targeted", + "language": "python" +} From 9b549ce986e397a673764ca9fa95df00923f17be Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 11:53:45 -0700 Subject: [PATCH 096/177] disable duplicate code checks --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3bc7895d..8793bcf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,10 @@ line-length = 110 [tool.pylint.messages_control] -disable = "C0330, C0326" +# first two are for black compatibility +# duplicate-code cannot be disabled on per file/block/line +# https://github.com/PyCQA/pylint/issues/214 +disable = "bad-continuation, bad-whitespace, duplicate-code" [tool.pylint.format] max-line-length = "110" From b16e2fca91ac786b77189680be3ed2dbf415cc32 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 11:57:35 -0700 Subject: [PATCH 097/177] terminate RT predict if no files match --- metatlas/tools/predict_rt.py | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index 10db8949..90a797b7 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -110,6 +110,9 @@ def generate_rt_correction_models( files_df = get_files_df(groups) qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids) metatlas_dataset = load_runs(files_df, qc_atlas_df, qc_atlas, cpus) + if len(metatlas_dataset) == 0: + logger.error("No matching LCMS runs, terminating without generating outputs.") + return save_measured_rts(metatlas_dataset, os.path.join(ids.output_dir, "QC_Measured_RTs.csv")) rts_df = get_rts(metatlas_dataset) compound_atlas_rts_file_name = os.path.join(ids.output_dir, "Compound_Atlas_RTs.pdf") From b7147f63b7eaf4d62d4ad02e9e594bba44cff4e7 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 12:05:19 -0700 Subject: [PATCH 098/177] add unique variables for easier debugging --- metatlas/plots/dill2plots.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 916b292a..84058542 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -1453,12 +1453,8 @@ def make_output_dataframe(input_fname='', input_dataset=None, include_lcmsruns=N """ fieldname can be: peak_height, peak_area, mz_centroid, rt_centroid, mz_peak, rt_peak """ - if not input_dataset: - data = ma_data.get_dill_data(os.path.expandvars(input_fname)) - else: - data = input_dataset - data = filter_runs(data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) - + full_data = or_default(input_dataset, ma_data.get_dill_data(os.path.expandvars(input_fname))) + data = filter_runs(full_data, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) compound_names = ma_data.get_compound_names(data, use_labels=use_labels)[0] file_names = ma_data.get_file_names(data) group_names = ma_data.get_group_names(data) @@ -3314,3 +3310,12 @@ def rt_range_overlaps(rt1, rt2): """ return ((rt2.rt_min <= rt1.rt_min <= rt2.rt_max) or (rt2.rt_min <= rt1.rt_max <= rt2.rt_max) or (rt1.rt_min <= rt2.rt_min <= rt1.rt_max) or (rt1.rt_min <= rt2.rt_max <= rt1.rt_max)) + + +def or_default(none_or_value, default): + """ + inputs: + none_or_value: variable to test + default: value to return if none_or_value is None + """ + return none_or_value if none_or_value is not None else default From 12418cbfd70de449119305acb34fd9fd494046ef Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 12:06:36 -0700 Subject: [PATCH 099/177] gracefully handle empty metatlas_dataset --- metatlas/io/metatlas_get_data_helper_fun.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index 4d48ec7c..06d575ae 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -738,14 +738,13 @@ def get_compound_names(data,use_labels=False): Returns a tuple of lists containing the compound names and compound objects present in the dill file ------- """ - # if data is a string then it's a file name - get its data if isinstance(data, six.string_types): data = get_dill_data(data) - compound_names = list() compound_objects = list() - + if len(data) == 0: + return (compound_names, compound_objects) for i,d in enumerate(data[0]): compound_objects.append(d['identification']) if use_labels: @@ -760,20 +759,15 @@ def 
get_compound_names(data,use_labels=False):
                               d['identification'].mz_references[0].adduct,d['identification'].mz_references[0].mz,
                               d['identification'].rt_references[0].rt_peak)
         newstr = re.sub(r'\.', 'p', newstr)  # 2 or more in regexp
-        newstr = re.sub(r'[\[\]]', '', newstr)
         newstr = re.sub('[^A-Za-z0-9+-]+', '_', newstr)
         newstr = re.sub('i_[A-Za-z]+_i_', '', newstr)
-        if newstr[0] == '_':
-            newstr = newstr[1:]
-        if newstr[0] == '-':
+        if newstr[0] in ['_', '-']:
             newstr = newstr[1:]
         if newstr[-1] == '_':
             newstr = newstr[:-1]
-
         newstr = re.sub('[^A-Za-z0-9]{2,}', '', newstr) #2 or more in regexp
         compound_names.append(newstr)
-
     # If duplicate compound names exist, then append them with a number
     D = defaultdict(list)
     for i,item in enumerate(compound_names):
@@ -782,7 +776,6 @@ def get_compound_names(data,use_labels=False):
         for k in D.keys():
             for i,f in enumerate(D[k]):
                 compound_names[f] = '%s%d'%(compound_names[f],i)
-
     return (compound_names, compound_objects)

From 9878187dfc66d835b61996c22d1f80552155c717 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Tue, 24 Aug 2021 12:16:39 -0700
Subject: [PATCH 100/177] refactor system tests

---
 tests/system/test_targeted.py | 94 +++++++++++++---------------------
 tests/system/utils.py         | 48 ++++++++++++++++++
 2 files changed, 82 insertions(+), 60 deletions(-)
 create mode 100644 tests/system/utils.py

diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py
index cc9045a2..c9892908 100644
--- a/tests/system/test_targeted.py
+++ b/tests/system/test_targeted.py
@@ -1,54 +1,41 @@
-# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long
+# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long, duplicate-code

-import os
-import subprocess
+from . 
import utils def test_targeted_by_line01_with_remove(tmp_path): - image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.0" + image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.2" experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" - out_files = {} expected = {} - out_files["peak_height"] = ( - tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab" + expected[ + str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab") + ] = "\n".join( + [ + f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 + f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", # noqa: E501 + "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", + "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", + "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 + "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 + "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t304761.90625\t416788.03125\t837662.0625\t2359861.25", + "0001_adenine_positive_M+H136p0618_2p52\t1594753.875\t12096485.0\t51774956.0\t91955488.0", + "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", + ] ) - expected["peak_height"] = [ - f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 - f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", # noqa: E501 - "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", - "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", - "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t304761.90625\t416788.03125\t837662.0625\t2359861.25", - 
"0001_adenine_positive_M+H136p0618_2p52\t1594753.875\t12096485.0\t51774956.0\t91955488.0", - "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", - "", - ] - out_files["rt_peak"] = tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_rt_peak.tab" - expected["rt_peak"] = [ - f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", - f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", - "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", - "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", - "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t2.2775044441223145\t2.2806363105773926\t2.2833268642425537\t2.2922415733337402", - "0001_adenine_positive_M+H136p0618_2p52\t2.6164748668670654\t2.639369249343872\t2.6182913780212402\t2.657374620437622", - "0002_adenosine_positive_M+H268p1041_3p02\t3.098848819732666\t3.1250929832458496\t3.1176068782806396\t3.139331817626953", - ] - subprocess.run( + expected[str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_rt_peak.tab")] = "\n".join( [ - "docker", - "run", - "--rm", - "-v", - f"{os.getcwd()}:/src", - "-v", - f"{tmp_path}:/out", - image, - "/bin/bash", - "-c", - """\ + f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", + f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", + "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", + "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", + "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 + "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 + "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t2.2775044441223145\t2.2806363105773926\t2.2833268642425537\t2.2922415733337402", + 
"0001_adenine_positive_M+H136p0618_2p52\t2.6164748668670654\t2.639369249343872\t2.6182913780212402\t2.657374620437622", + "0002_adenosine_positive_M+H268p1041_3p02\t3.098848819732666\t3.1250929832458496\t3.1176068782806396\t3.139331817626953", + ] + ) + command = """\ jq -M '(.cells[] | select(.source[] | contains("compound_idx=0")).source) \ += ["\\n", \ "agui.compound_idx = 0\\n", \ @@ -74,20 +61,7 @@ def test_targeted_by_line01_with_remove(tmp_path): -p max_cpus 2 \ /out/Remove.ipynb \ /out/Remove-done.ipynb - """, - ], - check=True, - ) - files = subprocess.check_output(f"find {str(tmp_path)} -type f", shell=True, text=True).strip() - print(files) - num_files_created = int( - subprocess.check_output(f"find {str(tmp_path)} -type f | wc -l", shell=True, text=True).strip() - ) - for _, path in out_files.items(): - os.system(f"cat {path}") - assert num_files_created == 39 - for metric_name, path in out_files.items(): - with open(path, "r") as handle: - for num, line in enumerate(handle.readlines()): - clean_line = line.rstrip("\n") - assert expected[metric_name][num] == clean_line + """ + utils.exec_docker(image, command, tmp_path) + assert utils.num_files_in(tmp_path) == 39 + utils.assert_files_match(expected) diff --git a/tests/system/utils.py b/tests/system/utils.py new file mode 100644 index 00000000..e66d2a9c --- /dev/null +++ b/tests/system/utils.py @@ -0,0 +1,48 @@ +# pylint: disable=missing-function-docstring, missing-module-docstring, duplicate-code + +import os +import subprocess + + +def num_files_in(path): + """Returns number of files in path. Does not count directories""" + return int(subprocess.check_output(f"find {str(path)} -type f | wc -l", shell=True, text=True).strip()) + + +def assert_files_match(expected): + """ + Throw assertion error if expected does not contain the same data as files on disk + inputs: + expected: dict with Path objects as keys and strings representing file contents as values + returns None + """ + for path, contents in expected.items(): + with open(path, "r") as handle: + expected_lines = contents.split("\n") + num = None + for num, line in enumerate(handle.readlines()): + clean_line = line.rstrip("\n") + assert expected_lines[num] == clean_line + if num is None and contents == "": + continue + assert len(expected_lines) == num + 1 + + +def exec_docker(image, command, out_path): + """execute command in image with out_path mounted at /out""" + subprocess.run( + [ + "docker", + "run", + "--rm", + "-v", + f"{os.getcwd()}:/src", + "-v", + f"{out_path}:/out", + image, + "/bin/bash", + "-c", + command, + ], + check=True, + ) From 5c524b820894486932dbe8cc04fdc9a72cc2982c Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 12:17:40 -0700 Subject: [PATCH 101/177] add pylint disable block --- tests/unit/test_metatlas_get_data_helper_fun.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index 65c5baf4..e5d5000b 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -374,6 +374,7 @@ def test_get_data_for_atlas_and_lcmsrun(atlas_df, df_container): def test_get_data_for_atlas_df_and_file(lcmsrun, group, atlas_df, atlas, username): + # pylint: disable=line-too-long result = gdhf.get_data_for_atlas_df_and_file((lcmsrun.hdf5_file, group, atlas_df, atlas)) expected = ( { From 37f768454b55b6b6d68cc3ec6933439472c60116 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 12:18:57 
-0700 Subject: [PATCH 102/177] fixes to group handling --- metatlas/datastructures/metatlas_dataset.py | 14 ++++- tests/system/test_rt_predict.py | 59 ++++----------------- tests/unit/test_predict_rt.py | 26 +-------- 3 files changed, 25 insertions(+), 74 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 38cdaea1..3f90d3b8 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -22,7 +22,6 @@ MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" DEFAULT_GROUPS_CONTROLLED_VOCAB = ["QC", "InjBl", "ISTD"] -DEFAULT_EXCLUDE_GROUPS = ["InjBl", "InjBL", "QC"] OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"] POLARITIES = ["positive", "negative", "fast-polarity-switching"] SHORT_POLARITIES = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"} @@ -43,7 +42,7 @@ class AnalysisIdentifiers(HasTraits): google_folder = Unicode() exclude_files = List(trait=Unicode(), allow_none=True, default_value=[]) include_groups = List(allow_none=True, default_value=None) - exclude_groups = List(allow_none=True, default_value=DEFAULT_EXCLUDE_GROUPS) + exclude_groups = List(allow_none=True, default_value=None) groups_controlled_vocab = List( trait=Unicode(), allow_none=True, default_value=DEFAULT_GROUPS_CONTROLLED_VOCAB ) @@ -73,6 +72,17 @@ def _default_include_groups(self): return ["QC"] return [] + @default("exclude_groups") + def _default_exclude_groups(self): + out = ["InjBl", "InjBL"] + if self.output_type != "data_QC": + out.append("QC") + if self.polarity == "positive": + out.append("NEG") + elif self.polarity == "negative": + out.append("POS") + return out + @validate("polarity") def _valid_polarity(self, proposal): if proposal["value"] not in POLARITIES: diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py index f0bdd3be..b4b9cc9f 100644 --- a/tests/system/test_rt_predict.py +++ b/tests/system/test_rt_predict.py @@ -1,17 +1,14 @@ -# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long +# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long, duplicate-code -import os -import subprocess +from . 
import utils def test_targeted_by_line01_with_remove(tmp_path): - image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci02:v1.3.3" + image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci02:v1.3.5" experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" - out_files = {} expected = {} - out_files["rt_model"] = tmp_path / experiment / "root0/data_QC/rt_model.txt" expected[ - "rt_model" + str(tmp_path / experiment / "root0/data_QC/rt_model.txt") ] = """RANSACRegressor(random_state=42) Linear model with intercept=-0.004 and slope=0.99798 groups = 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_root0_QC, 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_QC @@ -22,12 +19,8 @@ def test_targeted_by_line01_with_remove(tmp_path): groups = 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_root0_QC, 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_QC atlas = HILICz150_ANT20190824_TPL_QCv3_Unlab_POS """ - - out_files["RT_Predicted_Model_Comparison"] = ( - tmp_path / experiment / "root0/data_QC/RT_Predicted_Model_Comparison.csv" - ) expected[ - "RT_Predicted_Model_Comparison" + str(tmp_path / experiment / "root0/data_QC/RT_Predicted_Model_Comparison.csv") ] = """,RT Measured,RT Reference,RT Linear Pred,RT Polynomial Pred,RT Diff Linear,RT Diff Polynomial 0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,0.7757497429847717,1.068941733,1.062903572946303,1.1269157193507062,-0.28715382996153127,-0.3511659763659345 0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.2491384744644165,1.224396021,1.2180440647072988,1.277093230335882,0.031094409757117747,-0.027954755871465453 @@ -102,9 +95,8 @@ def test_targeted_by_line01_with_remove(tmp_path): 0070_lysine_unlabeled_positive_M+H147p1128_17p01,17.048407554626465,17.01131041,16.973091372879324,17.06496535505383,0.07531618174714083,-0.0165578004273641 0071_ornithine_unlabeled_positive_M+H133p0972_17p04,17.070573806762695,17.03725065,16.998979250542746,17.09178209803637,0.07159455621994937,-0.021208291273673296""" - out_files["QC_Measured_RTs"] = tmp_path / experiment / "root0/data_QC/QC_Measured_RTs.csv" expected[ - "QC_Measured_RTs" + str(tmp_path / experiment / "root0/data_QC/QC_Measured_RTs.csv") ] = """,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5,mean,median,min,max,standard deviation,standard error,#NaNs 0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,0.760883629322052,0.7883098125457764,0.7816191911697388,0.7698802947998047,0.775173231959343,0.7757497429847717,0.760883629322052,0.7883098125457764,0.012197377682005251,0.006098688841002626,0 0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.2340805530548096,1.2472213506698608,1.2510555982589722,1.2544312477111816,1.246697187423706,1.2491384744644165,1.2340805530548096,1.2544312477111816,0.008911895485740983,0.004455947742870492,0 @@ -178,20 +170,7 @@ def test_targeted_by_line01_with_remove(tmp_path): 
0069_arginine_unlabeled_positive_M+H175p1190_16p94,16.963918685913086,16.961685180664062,16.988910675048828,16.9918212890625,16.97658395767212,16.976414680480957,16.961685180664062,16.9918212890625,0.01598443924833885,0.007992219624169425,0 0070_lysine_unlabeled_positive_M+H147p1128_17p01,17.043212890625,17.035064697265625,17.05360221862793,17.05995750427246,17.047959327697754,17.048407554626465,17.035064697265625,17.05995750427246,0.011024194876641502,0.005512097438320751,0 0071_ornithine_unlabeled_positive_M+H133p0972_17p04,17.058874130249023,17.06319236755371,17.085508346557617,17.07795524597168,17.071382522583008,17.070573806762695,17.058874130249023,17.085508346557617,0.0124669979531353,0.00623349897656765,0""" - - subprocess.run( - [ - "docker", - "run", - "--rm", - "-v", - f"{os.getcwd()}:/src", - "-v", - f"{tmp_path}:/out", - image, - "/bin/bash", - "-c", - """\ + command = """\ jq -M '(.cells[] | select(.source[] | contains("predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path)")).source) \ = ["predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path, model_only=True)"]' \ /src/notebooks/reference/RT_Prediction.ipynb > /out/Remove.ipynb && \ @@ -203,23 +182,7 @@ def test_targeted_by_line01_with_remove(tmp_path): -p max_cpus 2 \ /out/Remove.ipynb \ /out/Remove-done.ipynb - """, - ], - check=True, - ) - files = subprocess.check_output(f"find {str(tmp_path)} -type f", shell=True, text=True).strip() - print(files) - num_files_created = int( - subprocess.check_output(f"find {str(tmp_path)} -type f | wc -l", shell=True, text=True).strip() - ) - for _, path in out_files.items(): - os.system(f"cat {path}") - assert num_files_created == 8 - for metric_name, path in out_files.items(): - with open(path, "r") as handle: - expected_lines = expected[metric_name].split("\n") - num = None - for num, line in enumerate(handle.readlines()): - clean_line = line.rstrip("\n") - assert expected_lines[num] == clean_line - assert len(expected_lines) == num + 1 + """ + utils.exec_docker(image, command, tmp_path) + assert utils.num_files_in(tmp_path) == 8 + utils.assert_files_match(expected) diff --git a/tests/unit/test_predict_rt.py b/tests/unit/test_predict_rt.py index 9052ab6b..e5f3ecd1 100644 --- a/tests/unit/test_predict_rt.py +++ b/tests/unit/test_predict_rt.py @@ -1,36 +1,14 @@ """ unit testing of predict_rt functions """ # pylint: disable=missing-function-docstring -import os - -from metatlas.datastructures import metatlas_dataset as mads from metatlas.tools import predict_rt def test_get_rts01(metatlas_dataset): + # pylint: disable=line-too-long rts_df = predict_rt.get_rts(metatlas_dataset, include_atlas_rt_peak=False) assert f"{rts_df.iloc[0]['min']:0.5f}" == "2.29224" - - -def test_get_rts02( - mocker, df_container, analysis_ids, qc_lcmsruns, sqlite_with_atlas, username, groups_controlled_vocab -): - mocker.patch( - "metatlas.io.metatlas_get_data_helper_fun.df_container_from_metatlas_file", return_value=df_container - ) - mocker.patch("metatlas.plots.dill2plots.get_metatlas_files", return_value=qc_lcmsruns) - ids = mads.AnalysisIdentifiers( - source_atlas=f"HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_{username}0", - experiment="20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583", - output_type="FinalEMA-HILIC", - polarity="positive", - analysis_number=0, - project_directory=str(os.getcwd()), - groups_controlled_vocab=groups_controlled_vocab, - ) - metatlas_dataset = mads.MetatlasDataset(ids=ids, save_metadata=False) - rts_df = 
predict_rt.get_rts(metatlas_dataset, include_atlas_rt_peak=False) assert ( rts_df.to_json() - == """{"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"mean":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"median":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"min":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"max":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"standard deviation":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":0.0},"standard error":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":0.0},"#NaNs":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":0}}""" + == """{"20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"mean":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"median":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"min":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"max":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":2.2922415733},"standard deviation":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":null},"standard error":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":null},"#NaNs":{"0000_2deoxyadenosine_positive_M+H252p1091_2p20":0}}""" ) From 9e8505a49dc001ba01b722a763f970d80dab69fe Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 15:59:54 -0700 Subject: [PATCH 103/177] fix naming of RT pred notebook from slurm --- papermill/launch_rt_prediction.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh index 95b35ee5..5a0b4644 100755 --- a/papermill/launch_rt_prediction.sh +++ b/papermill/launch_rt_prediction.sh @@ -16,7 +16,7 @@ EXP_DIR="${PROJECT_DIR}/$EXP" ANALYSIS_DIR="${EXP_DIR}/${USER}${ANALYSIS_NUM}" IFS='_' read -ra TOKENS <<< "$EXP" -PROPOSAL="${TOKENS[0]}" +PROPOSAL="${TOKENS[3]}" export IN_FILE="${REPO_DIR}/notebooks/reference/RT_Prediction.ipynb" export OUT_FILE="${ANALYSIS_DIR}/${PROPOSAL}_RT_Prediction_papermill.ipynb" From 269b351df9c4a21ad302c8b5fdd083eda1297863 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 24 Aug 2021 16:54:03 -0700 Subject: [PATCH 104/177] Add peak_rt.tab generation to RT prediction --- metatlas/tools/predict_rt.py | 16 +++++++++++++++- tests/system/test_rt_predict.py | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index 90a797b7..d629597d 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -6,6 +6,7 @@ import math import os +from datetime import datetime from pathlib import Path import matplotlib.pyplot as plt @@ -113,6 +114,7 @@ def generate_rt_correction_models( if 
len(metatlas_dataset) == 0:
         logger.error("No matching LCMS runs, terminating without generating outputs.")
         return
+    save_rt_peak(metatlas_dataset, os.path.join(ids.output_dir, "rt_peak.tab"))
     save_measured_rts(metatlas_dataset, os.path.join(ids.output_dir, "QC_Measured_RTs.csv"))
     rts_df = get_rts(metatlas_dataset)
     compound_atlas_rts_file_name = os.path.join(ids.output_dir, "Compound_Atlas_RTs.pdf")
@@ -144,10 +146,16 @@ def get_groups(metatlas_dataset):
                to this list of strings
     """
     ordered_groups = sorted(metatlas_dataset.ids.groups, key=lambda x: x.name)
-    _ = [logger.info("Selected group: %s, %s", grp.name, grp.last_modified) for grp in ordered_groups]
+    for grp in ordered_groups:
+        logger.info("Selected group: %s, %s", grp.name, int_to_date_str(grp.last_modified))
     return ordered_groups


+def int_to_date_str(i_time):
+    """ Unix epoch time in seconds to YYYY-MM-DD hh:mm:ss """
+    return str(datetime.fromtimestamp(i_time))
+
+
 def get_files_df(groups):
     """Pandas DataFrame with one row per file plus columns for acquisition_time and group name"""
     files_df = pd.DataFrame(columns=["file", "time", "group"])
@@ -193,6 +201,12 @@ def save_measured_rts(metatlas_dataset, file_name):
     write_utils.export_dataframe_die_on_diff(rts_df, file_name, "measured RT values")


+def save_rt_peak(metatlas_dataset, file_name):
+    """Save peak RT values in tsv format file"""
+    rts_df = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname='rt_peak', use_labels=True)
+    write_utils.export_dataframe_die_on_diff(rts_df, file_name, "peak RT values", sep='\t')
+
+
 def get_rts(metatlas_dataset, include_atlas_rt_peak=True):
     """Returns RT values in DataFrame format"""
     rts_df = dp.make_output_dataframe(
diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py
index b4b9cc9f..ec39b703 100644
--- a/tests/system/test_rt_predict.py
+++ b/tests/system/test_rt_predict.py
@@ -184,5 +184,5 @@ def test_targeted_by_line01_with_remove(tmp_path):
         /out/Remove-done.ipynb
     """
     utils.exec_docker(image, command, tmp_path)
-    assert utils.num_files_in(tmp_path) == 8
+    assert utils.num_files_in(tmp_path) == 9
     utils.assert_files_match(expected)

From 90bec1d03028c59eaa0dba2b26d43ff420cf331b Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Tue, 24 Aug 2021 17:30:55 -0700
Subject: [PATCH 105/177] Fix typo in log message

---
 metatlas/io/targeted_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py
index 2d87714e..ba1d15f9 100644
--- a/metatlas/io/targeted_output.py
+++ b/metatlas/io/targeted_output.py
@@ -309,7 +309,7 @@ def copy_outputs_to_google_drive(ids):
     Inputs:
         ids: an AnalysisIds object
     """
-    logger.info("Coping output files to Google Drive")
+    logger.info("Copying output files to Google Drive")
     rci = rclone.RClone(RCLONE_PATH)
     fail_suffix = "not copying files to Google Drive"
     if rci.config_file() is None:

From 06e682246f60992008e7950a4cbfb684e2e4dd11 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 25 Aug 2021 10:18:59 -0700
Subject: [PATCH 106/177] GDrive auto upload folder set to 'analysis_uploads'

---
 metatlas/io/targeted_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py
index ba1d15f9..37ea9ba4 100644
--- a/metatlas/io/targeted_output.py
+++ b/metatlas/io/targeted_output.py
@@ -319,6 +319,6 @@ def copy_outputs_to_google_drive(ids):
     if drive is None:
         logger.warning("RClone config file missing JGI_Metabolomics_Projects -- %s.", 
fail_suffix) return - sub_folder = os.path.join(ids.experiment, ids.analysis, ids.output_type) + sub_folder = os.path.join('analysis_uploads', ids.experiment, ids.analysis, ids.output_type) rci.copy_to_drive(ids.output_dir, drive, sub_folder) logger.info("Done copying output files to Google Drive") From 60e81adc3b8ff4aaf675bd2011d4098d24ff720e Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 25 Aug 2021 12:25:38 -0700 Subject: [PATCH 107/177] refactor of make_chromatograms, could do more --- metatlas/plots/dill2plots.py | 50 ++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 84058542..dd42a21b 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -2186,42 +2186,36 @@ def get_msms_hits_with_warnings(metatlas_dataset, extra_time=False, keep_nonmatc return msms_hits -def make_chromatograms(input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], include_groups=[], exclude_groups=[], group='index', share_y=True, save=True, output_loc=[], short_names_df=pd.DataFrame(), short_names_header=None, polarity='', overwrite=False, max_cpus=1): - input_dataset = filter_runs(input_dataset, include_lcmsruns, include_groups, - exclude_lcmsruns, exclude_groups) - file_names = ma_data.get_file_names(input_dataset) - - if short_names_df.empty: - if short_names_header != None: - sys.stdout.write('short_names_df not provided. Using full_filename for the plots!') - short_names_df = pd.DataFrame() - elif short_names_header == None: - sys.stdout.write('short_names_header not provided. Using full_filename for the plots!') +def make_chromatograms(input_dataset, include_lcmsruns=None, exclude_lcmsruns=None, include_groups=None, exclude_groups=None, group='index', share_y=True, save=True, output_loc=None, short_names_df=None, short_names_header=None, polarity='', overwrite=False, max_cpus=1): + data = filter_runs(input_dataset, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups) + file_names = ma_data.get_file_names(data) + if short_names_df is None: + if short_names_header is not None: + logger.info('short_names_df not provided. Using full_filename for the plots!') short_names_df = pd.DataFrame() + elif short_names_header is None: + logger.info('short_names_header not provided. Using full_filename for the plots!') + short_names_df = pd.DataFrame() elif short_names_header not in short_names_df.columns: - sys.stdout.write('short_names_header not found in short_names_df. Using full_filename for the plots!') - short_names_df = pd.DataFrame() + logger.info('short_names_header not found in short_names_df. 
From 60e81adc3b8ff4aaf675bd2011d4098d24ff720e Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 25 Aug 2021 12:25:38 -0700
Subject: [PATCH 107/177] refactor of make_chromatograms, could do more

---
 metatlas/plots/dill2plots.py | 50 ++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py
index 84058542..dd42a21b 100644
--- a/metatlas/plots/dill2plots.py
+++ b/metatlas/plots/dill2plots.py
@@ -2186,42 +2186,36 @@ def get_msms_hits_with_warnings(metatlas_dataset, extra_time=False, keep_nonmatc
     return msms_hits
 
 
-def make_chromatograms(input_dataset=[], include_lcmsruns=[], exclude_lcmsruns=[], include_groups=[], exclude_groups=[], group='index', share_y=True, save=True, output_loc=[], short_names_df=pd.DataFrame(), short_names_header=None, polarity='', overwrite=False, max_cpus=1):
-    input_dataset = filter_runs(input_dataset, include_lcmsruns, include_groups,
-                                exclude_lcmsruns, exclude_groups)
-    file_names = ma_data.get_file_names(input_dataset)
-
-    if short_names_df.empty:
-        if short_names_header != None:
-            sys.stdout.write('short_names_df not provided. Using full_filename for the plots!')
-            short_names_df = pd.DataFrame()
-    elif short_names_header == None:
-        sys.stdout.write('short_names_header not provided. Using full_filename for the plots!')
+def make_chromatograms(input_dataset, include_lcmsruns=None, exclude_lcmsruns=None, include_groups=None, exclude_groups=None, group='index', share_y=True, save=True, output_loc=None, short_names_df=None, short_names_header=None, polarity='', overwrite=False, max_cpus=1):
+    data = filter_runs(input_dataset, include_lcmsruns, include_groups, exclude_lcmsruns, exclude_groups)
+    file_names = ma_data.get_file_names(data)
+    if short_names_df is None:
+        if short_names_header is not None:
+            logger.info('short_names_df not provided. Using full_filename for the plots!')
         short_names_df = pd.DataFrame()
+    elif short_names_header is None:
+        logger.info('short_names_header not provided. Using full_filename for the plots!')
         short_names_df = pd.DataFrame()
     elif short_names_header not in short_names_df.columns:
-        sys.stdout.write('short_names_header not found in short_names_df. Using full_filename for the plots!')
-        short_names_df = pd.DataFrame()
+        logger.info('short_names_header not found in short_names_df. Using full_filename for the plots!')
     else:
         short_names_df = short_names_df[[short_names_header]]
         short_names_df.columns = ['shortname']
     os.makedirs(output_loc, exist_ok=True)
+    compound_names = ma_data.get_compound_names(data, use_labels=True)[0]
     prefix = f"{polarity}_" if polarity != '' else ''
     chromatogram_dir = os.path.join(output_loc, f"{prefix}compound_EIC_chromatograms")
+    args_list = []
     for compound_idx, my_compound in enumerate(compound_names):
+        my_data = [data[file_idx][compound_idx] for file_idx, _ in enumerate(file_names)]
+        args_list.append({'data': my_data,
+                          'shortname': short_names_df,
+                          'group': group,
+                          'file_name': os.path.join(chromatogram_dir, my_compound+'.pdf'),
+                          'save': save,
+                          'share_y': share_y,
+                          'names': file_names,
+                          'overwrite': overwrite})
     parallel.parallel_process(cpp.chromplotplus, args_list, max_cpus, unit='plot', spread_args=False)

From df883e349c6724397354987d7f577a26484fbaba Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 25 Aug 2021 12:26:14 -0700
Subject: [PATCH 108/177] fix EIC plot grouping in outputs

---
 metatlas/datastructures/metatlas_dataset.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py
index 3f90d3b8..9e98fda3 100644
--- a/metatlas/datastructures/metatlas_dataset.py
+++ b/metatlas/datastructures/metatlas_dataset.py
@@ -848,13 +848,13 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False):
         self._hits = None  # force hits to be regenerated
         self.extra_time = 0.5
         logger.info("extra_time set to 0.5 minutes for output generation.")
-        targeted_output.write_atlas_to_spreadsheet(self, overwrite)
-        targeted_output.write_stats_table(self, overwrite)
-        targeted_output.write_chromatograms(self, overwrite, max_cpus=self.max_cpus)
-        targeted_output.write_identification_figure(self, overwrite)
-        targeted_output.write_metrics_and_boxplots(self, overwrite, max_cpus=self.max_cpus)
+        targeted_output.write_atlas_to_spreadsheet(self, overwrite=overwrite)
+        targeted_output.write_stats_table(self, overwrite=overwrite)
+        targeted_output.write_chromatograms(self, overwrite=overwrite, max_cpus=self.max_cpus)
+        targeted_output.write_identification_figure(self, overwrite=overwrite)
+        targeted_output.write_metrics_and_boxplots(self, overwrite=overwrite, max_cpus=self.max_cpus)
         if msms_fragment_ions:
-            targeted_output.write_msms_fragment_ions(self, overwrite)
+            targeted_output.write_msms_fragment_ions(self, overwrite=overwrite)
         logger.info("Generation of output files completed successfully.")
         targeted_output.archive_outputs(self.ids)
         targeted_output.copy_outputs_to_google_drive(self.ids)

From 430d4fc1ad934dfe1158923673c7fc8cf742a646 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 25 Aug 2021 17:58:26 -0700
Subject: [PATCH 109/177] Add type checking with mypy

---
 metatlas/datastructures/metatlas_dataset.py   | 61 +++----
 metatlas/datastructures/metatlas_objects.py   | 12 +-
 metatlas/io/directory_watcher.py              |  5 +-
 metatlas/io/h5_query.py                       |  2 
+- metatlas/io/integrity_monitor.py | 14 +- metatlas/io/metatlas_get_data_helper_fun.py | 18 --- metatlas/io/targeted_output.py | 2 +- metatlas/plots/dill2plots.py | 29 ---- metatlas/tools/predict_rt.py | 6 +- metatlas/untargeted/mzmine_batch_tools.py | 92 +++++------ .../untargeted/mzmine_batch_tools_adap.py | 150 +++++++++--------- metatlas/untargeted/mzmine_helpers.py | 138 ++++++++-------- noxfile.py | 16 ++ pyproject.toml | 40 +++++ 14 files changed, 283 insertions(+), 302 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 9e98fda3..a86ba65e 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -6,11 +6,14 @@ import os import shutil +from typing import Any, Dict, List + import humanize import pandas as pd +import traitlets from traitlets import HasTraits, TraitError, default, observe, validate -from traitlets import Bool, Float, Instance, Int, List, Tuple, Unicode +from traitlets import Bool, Float, Instance, Int, Tuple, Unicode from metatlas.datastructures import metatlas_objects as metob from metatlas.datastructures import object_helpers as metoh @@ -20,11 +23,11 @@ from metatlas.plots import dill2plots as dp from metatlas.tools import parallel -MSMS_REFS_PATH = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" -DEFAULT_GROUPS_CONTROLLED_VOCAB = ["QC", "InjBl", "ISTD"] -OUTPUT_TYPES = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"] -POLARITIES = ["positive", "negative", "fast-polarity-switching"] -SHORT_POLARITIES = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"} +MSMS_REFS_PATH: str = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" +DEFAULT_GROUPS_CONTROLLED_VOCAB: List[str] = ["QC", "InjBl", "ISTD"] +OUTPUT_TYPES: List[str] = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"] +POLARITIES: List[str] = ["positive", "negative", "fast-polarity-switching"] +SHORT_POLARITIES: Dict[str, str] = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"} logger = logging.getLogger(__name__) @@ -32,26 +35,26 @@ class AnalysisIdentifiers(HasTraits): """Names used in generating an analysis""" - source_atlas = Unicode(allow_none=True, default_value=None) - experiment = Unicode() - output_type = Unicode() - polarity = Unicode(default_value="positive") - analysis_number = Int(default_value=0) - username = Unicode(default_value=getpass.getuser()) - project_directory = Unicode() - google_folder = Unicode() - exclude_files = List(trait=Unicode(), allow_none=True, default_value=[]) - include_groups = List(allow_none=True, default_value=None) - exclude_groups = List(allow_none=True, default_value=None) - groups_controlled_vocab = List( + source_atlas: str = Unicode(allow_none=True, default_value=None) + experiment: str = Unicode() + output_type: str = Unicode() + polarity: str = Unicode(default_value="positive") + analysis_number: int = Int(default_value=0) + username: str = Unicode(default_value=getpass.getuser()) + project_directory: str = Unicode() + google_folder: str = Unicode() + exclude_files: List[str] = traitlets.List(trait=Unicode(), allow_none=True, default_value=[]) + include_groups: List[str] = traitlets.List(allow_none=True, default_value=None) + exclude_groups: List[str] = traitlets.List(allow_none=True, default_value=None) + groups_controlled_vocab: List[str] = traitlets.List( trait=Unicode(), allow_none=True, default_value=DEFAULT_GROUPS_CONTROLLED_VOCAB ) - _lcmsruns 
= List(allow_none=True, default_value=None) - _all_groups = List(allow_none=True, default_value=None) - _groups = List(allow_none=True, default_value=None) + _lcmsruns: List[metob.LcmsRun] = traitlets.List(allow_none=True, default_value=None) + _all_groups: List[metob.Group] = traitlets.List(allow_none=True, default_value=None) + _groups: List[metob.Group] = traitlets.List(allow_none=True, default_value=None) # pylint: disable=no-self-use - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: super().__init__(**kwargs) if self.polarity == "positive": self.exclude_groups.append("NEG") @@ -67,14 +70,14 @@ def __init__(self, **kwargs): self.store_all_groups(exist_ok=True) @default("include_groups") - def _default_include_groups(self): + def _default_include_groups(self) -> List[str]: if self.output_type == "data_QC": return ["QC"] return [] @default("exclude_groups") - def _default_exclude_groups(self): - out = ["InjBl", "InjBL"] + def _default_exclude_groups(self) -> List[str]: + out: List[str] = ["InjBl", "InjBL"] if self.output_type != "data_QC": out.append("QC") if self.polarity == "positive": @@ -84,19 +87,19 @@ def _default_exclude_groups(self): return out @validate("polarity") - def _valid_polarity(self, proposal): + def _valid_polarity(self, proposal: Dict[str, Any]) -> str: if proposal["value"] not in POLARITIES: raise TraitError(f"Parameter polarity must be one of {', '.join(POLARITIES)}") return proposal["value"] @validate("output_type") - def _valid_output_type(self, proposal): + def _valid_output_type(self, proposal: Dict[str, Any]) -> str: if proposal["value"] not in OUTPUT_TYPES: raise TraitError(f"Parameter output_type must be one of {', '.join(OUTPUT_TYPES)}") return proposal["value"] @validate("source_atlas") - def _valid_source_atlas(self, proposal): + def _valid_source_atlas(self, proposal: Dict[str, Any]) -> str: if proposal["value"] is not None: try: get_atlas(proposal["value"], self.username) # raises error if not found or matches multiple @@ -105,7 +108,7 @@ def _valid_source_atlas(self, proposal): return proposal["value"] @validate("analysis_number") - def _valid_analysis_number(self, proposal): + def _valid_analysis_number(self, proposal: Dict[str, Any]) -> int: if proposal["value"] < 0: raise TraitError("Parameter analysis_number cannot be negative.") return proposal["value"] diff --git a/metatlas/datastructures/metatlas_objects.py b/metatlas/datastructures/metatlas_objects.py index e84e3a72..4da3b048 100644 --- a/metatlas/datastructures/metatlas_objects.py +++ b/metatlas/datastructures/metatlas_objects.py @@ -7,6 +7,8 @@ import time import uuid +from typing import Dict + from pwd import getpwuid from tabulate import tabulate import pandas as pd @@ -479,7 +481,7 @@ class IdentificationGrade(MetatlasObject): pass -ID_GRADES = dict() +ID_GRADES: Dict[str, IdentificationGrade] = dict() class _IdGradeTrait(MetInstance): @@ -713,11 +715,3 @@ def to_dataframe(objects): for col in ['last_modified', 'creation_time']: dataframe[col] = pd.to_datetime(dataframe[col], unit='s') return dataframe - - -if __name__ == '__main__': - m1 = Group(name='spam') - store(m1) - m1.description = 'baz' - store(m1) - print((retrieve('group', name='spam'))) diff --git a/metatlas/io/directory_watcher.py b/metatlas/io/directory_watcher.py index f9e16cf9..34b37c71 100644 --- a/metatlas/io/directory_watcher.py +++ b/metatlas/io/directory_watcher.py @@ -16,7 +16,8 @@ from datetime import datetime, time as dtime from metatlas.mzml_loader import VERSION_TIMESTAMP -from metatlas import 
LcmsRun, mzml_to_hdf, store, retrieve +from metatlas.datastructures.metatlas_objects import LcmsRun, retrieve, store +from metatlas.io.mzml_loader import mzml_to_hdf ADMIN = 'bpb' @@ -142,7 +143,7 @@ def update_metatlas(directory): # Convert to HDF and store the entry in the database. try: hdf5_file = fname.replace('mzML', 'h5') - + #Get Acquisition Time Here acquisition_time = get_acqtime_from_mzml(fname) mzml_to_hdf(fname, hdf5_file, True) diff --git a/metatlas/io/h5_query.py b/metatlas/io/h5_query.py index 9a2255e1..ea33a827 100644 --- a/metatlas/io/h5_query.py +++ b/metatlas/io/h5_query.py @@ -261,7 +261,7 @@ def get_info(h5file): import argparse import os import matplotlib.pyplot as plt - from metatlas import plot_chromatogram, plot_spectrogram, plot_heatmap + from metatlas.plots.plotting import plot_chromatogram, plot_spectrogram, plot_heatmap desc = "Query and plot MZML data from HDF files" parser = argparse.ArgumentParser(description=desc) diff --git a/metatlas/io/integrity_monitor.py b/metatlas/io/integrity_monitor.py index c49d15de..60a4f8e8 100644 --- a/metatlas/io/integrity_monitor.py +++ b/metatlas/io/integrity_monitor.py @@ -1,6 +1,3 @@ -# import argparse -from __future__ import absolute_import -from __future__ import print_function import smtplib import mimetypes import itertools @@ -13,6 +10,7 @@ import multiprocessing as mp import numpy as np +from typing import Any, Dict from datetime import datetime, time as dtime from collections import defaultdict from metatlas.io.mzml_loader import VERSION_TIMESTAMP @@ -24,8 +22,6 @@ from metatlas.io import metatlas_get_data_helper_fun as ma_data from metatlas.metatlas_objects.metatlas_objects import find_invalid_runs from metatlas.io.system_utils import send_mail -from six.moves import range -from six.moves import zip # TO-DO: have these vars be defined from external YML (probably nersc.yml?) 
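The typed traits that patch 109 adds to AnalysisIdentifiers follow the stock traitlets pattern: class-level trait declarations plus @default and @validate hooks. A minimal self-contained sketch, assuming only the public traitlets API (the ExampleIds class and its fields are invented for illustration):

# Minimal traitlets sketch -- illustrative, not code from this patch.
from traitlets import HasTraits, Int, TraitError, Unicode, default, validate

class ExampleIds(HasTraits):
    polarity: str = Unicode()
    analysis_number: int = Int(default_value=0)

    @default("polarity")
    def _default_polarity(self) -> str:
        # called only if polarity was never explicitly set
        return "positive"

    @validate("analysis_number")
    def _valid_analysis_number(self, proposal):
        # runs on every assignment; returning the value accepts it
        if proposal["value"] < 0:
            raise TraitError("analysis_number cannot be negative")
        return proposal["value"]

ids = ExampleIds()
assert ids.polarity == "positive"  # the @default hook fires
ids.analysis_number = 1            # passes @validate; -1 would raise TraitError

Validators can also consult other traits through self, which is how _valid_source_atlas above passes self.username into get_atlas().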
@@ -41,7 +37,7 @@ std_neg = 227.966564596 std_name = 'ABMBA' rt_max = -1 -run_times = {} +run_times: Dict[str, Any] = {} save_path = '/project/projectdirs/metatlas/projects/istd_logs/' @@ -106,7 +102,7 @@ def get_ppms(dataset, tolerance=15, std=229.981116596, std_neg=227.966564596, th if data.empty: mz_t = std_neg ms1 = 'ms1_neg' - ms2 = 'ms2_neg' + ms2 = 'ms2_neg' # Looking through ms1_pos # MS 1 @@ -177,7 +173,7 @@ def data_verify(file_name,tolerance=tolerance,std=std,std_neg=std_neg,threshold= else: #its a metatlas object samples_dict['acquisition timestamp'] = file_name.acquisition_time - + samples_dict['acquisition time'] = datetime.fromtimestamp( samples_dict['acquisition timestamp']).strftime('%Y-%m-%d %H:%M:%S') @@ -354,7 +350,7 @@ def run_checker(): # parser = argparse.ArgumentParser(description="Parameters for custom compounds") # parser.add_argument('-mz_pos', '--mz_pos', type=float, required=False) # parser.add_argument('-mz_neg', '--mz_neg', type=float, required=False) - + # args = parser.parse_args() # if args.mz_pos is not None: # std = args.mz_pos diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py index 06d575ae..426e985b 100644 --- a/metatlas/io/metatlas_get_data_helper_fun.py +++ b/metatlas/io/metatlas_get_data_helper_fun.py @@ -481,24 +481,6 @@ def organize_msms_scan_data(data,list_of_prt,list_of_pmz,list_of_pintensity): msms_data['precursor_intensity'].append(pintensity) return msms_data -def retrieve_most_intense_msms_scan(data): - urt,idx = np.unique(data['rt'],return_index=True) - sx = np.argsort(data['precursor_intensity'][idx])[::-1] - prt = data['rt'][idx[sx]] - pmz = data['precursor_MZ'][idx[sx]] - pintensity = data['precursor_intensity'][idx[sx]] - #setup data format for searching - msms_data = {} - msms_data['spectra'] = [] - msms_data['precursor_mz'] = [] - msms_data['precursor_intensity'] = [] - idx = np.argwhere((data['precursor_MZ'] == pmz[0]) & (data['rt'] == prt[0] )).flatten() - arr = np.array([data['mz'][idx], data['i'][idx]]).T - msms_data['spectra'] = arr - msms_data['precursor_mz'] = pmz - msms_data['precursor_intensity'] = pintensity - return msms_data - def get_data_for_a_compound(mz_ref,rt_ref,what_to_get,h5file,extra_time): """ diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 37ea9ba4..ddabadd3 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -319,6 +319,6 @@ def copy_outputs_to_google_drive(ids): if drive is None: logger.warning("RClone config file missing JGI_Metabolomics_Projects -- %s.", fail_suffix) return - sub_folder = os.path.join('analysis_uploads', ids.experiment, ids.analysis, ids.output_type) + sub_folder = os.path.join("analysis_uploads", ids.experiment, ids.analysis, ids.output_type) rci.copy_to_drive(ids.output_dir, drive, sub_folder) logger.info("Done copying output files to Google Drive") diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index dd42a21b..53de0b54 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -1233,35 +1233,6 @@ def plot_all_files_for_each_compound(input_dataset = [], input_fname = '', inclu plt.close(fig) - - - - -""" contribution from Hans de Winter """ -def _InitialiseNeutralisationReactions(): - patts= ( - # Imidazoles - ('[n+;H]','n'), - # Amines - ('[N+;!H0]','N'), - # Carboxylic acids and alcohols - ('[$([O-]);!$([O-][#7])]','O'), - # Thiols - ('[S-;X1]','S'), - # Sulfonamides - ('[$([N-;X2]S(=O)=O)]','N'), - # Enamines - 
('[$([N-;X2][C,N]=C)]','N'), - # Tetrazoles - ('[n-]','[nH]'), - # Sulfoxides - ('[$([S-]=O)]','S'), - # Amides - ('[$([N-]C=O)]','N'), - ) - return [(Chem.MolFromSmarts(x),Chem.MolFromSmiles(y,False)) for x,y in patts] - - def desalt(mol): #input is an rdkit mol #returns an rdkit mol keeping the biggest component diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index d629597d..08f82108 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -152,7 +152,7 @@ def get_groups(metatlas_dataset): def int_to_date_str(i_time): - """ unix epoc time in seconds to YYYY-MM-DD hh:mm:ss """ + """unix epoc time in seconds to YYYY-MM-DD hh:mm:ss""" return str(datetime.fromtimestamp(i_time)) @@ -203,8 +203,8 @@ def save_measured_rts(metatlas_dataset, file_name): def save_rt_peak(metatlas_dataset, file_name): """Save peak RT values in tsv format file""" - rts_df = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname='rt_peak', use_labels=True) - write_utils.export_dataframe_die_on_diff(rts_df, file_name, "peak RT values", sep='\t') + rts_df = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname="rt_peak", use_labels=True) + write_utils.export_dataframe_die_on_diff(rts_df, file_name, "peak RT values", sep="\t") def get_rts(metatlas_dataset, include_atlas_rt_peak=True): diff --git a/metatlas/untargeted/mzmine_batch_tools.py b/metatlas/untargeted/mzmine_batch_tools.py index 3647981f..fd50f2d9 100644 --- a/metatlas/untargeted/mzmine_batch_tools.py +++ b/metatlas/untargeted/mzmine_batch_tools.py @@ -14,12 +14,6 @@ from metatlas.helpers import metatlas_get_data_helper_fun as ma_data from metatlas.helpers import dill2plots as dp import six -from six.moves import map - -try: - six.string_types -except NameError: # python3 - six.string_types = str BATCH_FILE_PATH = '/global/common/software/m2650/mzmine_parameters/batch_files/' BINARY_PATH = '/global/common/software/m2650/mzmine_parameters/MZmine' @@ -143,7 +137,7 @@ def make_task_and_job(params):#basedir,basename,polarity,files): def create_job_script(m): """ - + This is the first function that runs when a user initializes a new untargeted workflow """ @@ -268,7 +262,7 @@ def make_targeted_mzmine_job(basedir,basename,polarity,files): project_name = '%s_%s'%(basename,task.polarity) task.output_workspace = os.path.join(basedir,project_name,'%s_%s.mzmine'%(basename,task.polarity)) task.input_xml = os.path.join(basedir,'logs','%s_%s_filtered.xml'%(basename,task.polarity)) - + task.mzmine_launcher = get_latest_mzmine_binary() # new_d = configure_crop_filter(new_d,task.polarity,files) @@ -284,25 +278,25 @@ def make_targeted_mzmine_job(basedir,basename,polarity,files): def configure_targeted_peak_detection(new_d,peak_list_filename,intensity_tolerance=1e-4,noise_level=1e4,mz_tolerance=20,rt_tolerance=0.5): """ Name suffix: Suffix to be added to the peak list name. - - Peak list file: Path of the csv file containing the list of peaks to be detected. The csv file should have three columns. + + Peak list file: Path of the csv file containing the list of peaks to be detected. The csv file should have three columns. The first column should contain the expected M/Z, the second column the expected RT and the third the peak name. Each peak should be in a different row. - + Field separator: Character(s) used to separate fields in the peak list file. - + Ignore first line: Check to ignore the first line of peak list file. 
- + Intensity tolerance: This value sets the maximum allowed deviation from expected shape of a peak in chromatographic direction. - + Noise level: The minimum intensity level for a data point to be considered part of a chromatogram. All data points below this intensity level are ignored. - + MZ Tolerance: Maximum allowed m/z difference to find the peak - + RT tolerance: Maximum allowed retention time difference to find the peak """ # Set the noise floor idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'TargetedPeakDetectionModule' in d['@method']][0] - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Peak list file' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%s'%peak_list_filename @@ -322,7 +316,7 @@ def configure_targeted_peak_detection(new_d,peak_list_filename,intensity_toleran def configure_crop_filter(new_d,polarity,files,min_rt=0.01,max_rt=100,fps_string='FPS'): """ - + """ # identify the element for this change idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'CropFilterModule' in d['@method']][0] @@ -336,44 +330,44 @@ def configure_crop_filter(new_d,polarity,files,min_rt=0.01,max_rt=100,fps_string # Set the polarity idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['polarity'] = polarity.upper() - + #set the rt min and rt max use the same idx2 as polarity new_d['batch']['batchstep'][idx]['parameter'][idx2]['retention_time'] = {'max':'%.4f'%max_rt,'min':'%.4f'%min_rt} - + # new_d['batch']['batchstep'][idx]['parameter'][idx2]['ms_level'] = '1-2' return new_d def configure_mass_detection(new_d,ms1_noise_level=1e4,ms2_noise_level=1e2): """ - + """ # Find the module idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'MassDetectionModule' in d['@method']] #The first idx will be for MS1 and the second will be for MS2 - + # Set the MS1 attributes idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter']) if 'Mass detector' in d['@name']][0] idx3 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['module']) if 'Centroid' in d['@name']][0] new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['module'][idx3]['parameter']['#text'] = '%.2f'%(ms1_noise_level) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['ms_level'] = '1' - + # Set the MS2 attributes idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter']) if 'Mass detector' in d['@name']][0] idx3 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['module']) if 'Centroid' in d['@name']][0] new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['module'][idx3]['parameter']['#text'] = '%.2f'%(ms2_noise_level) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['ms_level'] = '2' - + return new_d def configure_chromatogram_builder(new_d,min_peak_duration,min_peak_height,mz_tolerance): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'ChromatogramBuilderModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Min time span' in d['@name']][0] @@ -384,13 +378,13 @@ def 
configure_chromatogram_builder(new_d,min_peak_duration,min_peak_height,mz_to idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) - + return new_d def configure_peak_deconvolution(new_d,min_peak_height,minimum_relative_height,search_for_minimum_rt_range,chromatographic_threshold,min_sn_ratio,min_peak_duration,max_peak_duration): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'DeconvolutionModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Algorithm' in d['@name']][0] @@ -412,14 +406,14 @@ def configure_peak_deconvolution(new_d,min_peak_height,minimum_relative_height,s def configure_isotope_search(new_d,mz_tolerance,rt_tol_perfile,representative_isotope,remove_isotopes): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'Isotope' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Retention time tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%.3f'%(rt_tol_perfile) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Representative isotope' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%s'%(representative_isotope) @@ -429,7 +423,7 @@ def configure_isotope_search(new_d,mz_tolerance,rt_tol_perfile,representative_is def configure_join_aligner(new_d,mz_tolerance,rt_tol_multifile): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'JoinAlignerModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] @@ -440,7 +434,7 @@ def configure_join_aligner(new_d,mz_tolerance,rt_tol_multifile): def configure_rows_filter(new_d,min_peaks_in_row,peak_with_msms): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'RowsFilterModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Minimum peaks in a row' in d['@name']][0] @@ -453,7 +447,7 @@ def configure_rows_filter(new_d,min_peaks_in_row,peak_with_msms): def configure_duplicate_filter(new_d,mz_tolerance,rt_tol_perfile): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'DuplicateFilterModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] @@ -464,39 +458,39 @@ def configure_duplicate_filter(new_d,mz_tolerance,rt_tol_perfile): def configure_gap_filling(new_d,mz_tolerance,gapfill_intensity_tolerance,rt_tol_multifile): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'gapfilling.peakfinder.PeakFinderModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) - - + + return new_d def configure_output(new_d,output_csv_height,output_csv_area,output_workspace,output_mgf): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 
'CSVExportModule' in d['@method']] #the first will be height the second will be area idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter']) if 'Filename' in d['@name']][0] new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['#text'] = output_csv_height - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter']) if 'Filename' in d['@name']][0] new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['#text'] = output_csv_area - + idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'GNPSExportModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Filename' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = output_mgf - + idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'ProjectSaveAsModule' in d['@method']][0] new_d['batch']['batchstep'][idx]['parameter']['#text'] = output_workspace return new_d def configure_csv_output(new_d,output_csv): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'CSVExportModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Filename' in d['@name']][0] @@ -555,7 +549,7 @@ def get_latest_mzmine_binary(system='Cori',version='most_recent'): cp ../MZmine-2.24/startMZmine_NERSC_* . cd /project/projectdirs/metatlas/projects/ chgrp -R metatlas mzmine_parameters - chmod -R 770 mzmine_parameters + chmod -R 770 mzmine_parameters """ mzmine_versions = glob.glob(os.path.join(BINARY_PATH,'*' + os.path.sep)) if version == 'most_recent': @@ -571,11 +565,11 @@ def get_latest_mzmine_binary(system='Cori',version='most_recent'): def replace_files(d,file_list): """ Replace files for mzmine task - + Inputs: d: an xml derived dictionary of batch commands file_list: a list of full paths to mzML files - + Outputs: d: an xml derived dict with new files in it """ @@ -619,16 +613,16 @@ def dict_to_etree(d): def _to_etree(d, root): if not d: pass - elif isinstance(d, six.string_types): + elif isinstance(d, str): root.text = d elif isinstance(d, dict): for k,v in d.items(): - assert isinstance(k, six.string_types) + assert isinstance(k, str) if k.startswith('#'): - assert k == '#text' and isinstance(v, six.string_types) + assert k == '#text' and isinstance(v, str) root.text = v elif k.startswith('@'): - assert isinstance(v, six.string_types) + assert isinstance(v, str) root.set(k[1:], v) elif isinstance(v, list): for e in v: diff --git a/metatlas/untargeted/mzmine_batch_tools_adap.py b/metatlas/untargeted/mzmine_batch_tools_adap.py index 7fe36760..aaaf8f50 100644 --- a/metatlas/untargeted/mzmine_batch_tools_adap.py +++ b/metatlas/untargeted/mzmine_batch_tools_adap.py @@ -1,5 +1,3 @@ -from __future__ import print_function -from __future__ import absolute_import import numpy as np import sys import json @@ -18,12 +16,7 @@ from collections import Mapping import six from pathlib2 import PurePath -from six.moves import map -try: - six.string_types -except NameError: # python3 - six.string_types = str BATCH_FILE_PATH = '/global/common/software/m2650/mzmine_parameters/batch_files/' BINARY_PATH = '/global/common/software/m2650/mzmine_parameters/MZmine' @@ -32,7 +25,7 @@ # new stuff: - + # @@ -42,9 +35,9 @@ - - + + #copy files here to keep I/O off low-performance filesystems DATA_PATH = '/global/cscratch1/sd/bpb/raw_data' @@ -101,7 +94,7 @@ #Alicia Clum: The best nodes we have right now are ExVivo, they are 1.5 Tb nodes and very fast you can submit there by changing 
to --qos=jgi_shared and adding -C skylake. Prior to submitting you must type "module load esslurm" since these nodes are controlled by a different scheduler. # Set the python to this one: -#/global/common/software/m2650/mzmine_parameters/MZmine/MZmine-2.39/startMZmine_NERSC_Headless_Cori_exvivo.sh +#/global/common/software/m2650/mzmine_parameters/MZmine/MZmine-2.39/startMZmine_NERSC_Headless_Cori_exvivo.sh # ///////////////////////////////////////////////////////////////////// # /////////////////// SKYLAKE 1.5TB QUEUE SBATCH PARAMS /////////////// # ///////////////////////////////////////////////////////////////////// @@ -133,7 +126,7 @@ def calc_hit_vector(n,df): if (i>(len(m)-1)) & (len(m) >1): m_idx = len(m)-1 else: - m_idx = nf[i] + m_idx = nf[i] m[m_idx] = m[m_idx] + s[i] return m @@ -239,7 +232,7 @@ def mzmine_xml_to_csv(xml_file,csv_file=None,pop_input_files=True,return_df=True # t = dict_to_etree(d) # indent_tree(t) # s1 = tree_to_xml(t) - + # pop out the files if pop_input_files==True: raw_data_import = d['batch']['batchstep'].pop(0) @@ -252,7 +245,7 @@ def mzmine_xml_to_csv(xml_file,csv_file=None,pop_input_files=True,return_df=True df = pd.DataFrame([(k,v) for (k,v) in dflat.items()],columns=['parameter','value']).sort_values('parameter').set_index('parameter',drop=True) if csv_file is not None: df.to_csv(csv_file) - + if return_df==True: return df #return the dataframe of the steps else: @@ -307,7 +300,7 @@ def make_task_and_job(params):#basedir,basename,polarity,files): new_d = configure_mass_detection(new_d,task.ms1_noise_level,task.ms2_noise_level) new_d = configure_chromatogram_builder(new_d,task.min_num_scans,task.group_intensity_threshold,task.min_peak_height,task.mz_tolerance) - + new_d = configure_smoothing(new_d,task.smoothing_scans) new_d = configure_peak_deconvolution(new_d, @@ -348,7 +341,7 @@ def make_task_and_job(params):#basedir,basename,polarity,files): def create_job_script(m): """ - + This is the first function that runs when a user initializes a new untargeted workflow """ @@ -446,7 +439,7 @@ def get_files(groups,filename_substring,file_filters,keep_strings,is_group=False all_files.extend(new_files) if len(new_files) == 0: print('##### %s has ZERO files!'%g) - + # only keep files that don't have substrings in list if len(file_filters) > 0: for i,ff in enumerate(file_filters): @@ -456,7 +449,7 @@ def get_files(groups,filename_substring,file_filters,keep_strings,is_group=False files = [f for f in files if not ff in f.name] else: files = all_files - + # kick out any files that don't match atleast one of the keep_strings keep_this = [] filter_used = [] #good to keep track if a filter isn't used. 
likely a typo @@ -469,9 +462,9 @@ def get_files(groups,filename_substring,file_filters,keep_strings,is_group=False for i,f in enumerate(filter_used): if f==False: print('%s keep string is not used'%keep_strings[i]) - + files = [files[i] for i,j in enumerate(keep_this) if j==True] - + files = remove_duplicate_files(files) return files @@ -491,7 +484,7 @@ def make_targeted_mzmine_job(basedir,basename,polarity,files): project_name = '%s_%s'%(basename,task.polarity) task.output_workspace = os.path.join(basedir,project_name,'%s_%s.mzmine'%(basename,task.polarity)) task.input_xml = os.path.join(basedir,'logs','%s_%s_filtered.xml'%(basename,task.polarity)) - + task.mzmine_launcher = get_latest_mzmine_binary() # new_d = configure_crop_filter(new_d,task.polarity,files) @@ -507,25 +500,25 @@ def make_targeted_mzmine_job(basedir,basename,polarity,files): def configure_targeted_peak_detection(new_d,peak_list_filename,intensity_tolerance=1e-4,noise_level=1e4,mz_tolerance=20,rt_tolerance=0.5): """ Name suffix: Suffix to be added to the peak list name. - - Peak list file: Path of the csv file containing the list of peaks to be detected. The csv file should have three columns. + + Peak list file: Path of the csv file containing the list of peaks to be detected. The csv file should have three columns. The first column should contain the expected M/Z, the second column the expected RT and the third the peak name. Each peak should be in a different row. - + Field separator: Character(s) used to separate fields in the peak list file. - + Ignore first line: Check to ignore the first line of peak list file. - + Intensity tolerance: This value sets the maximum allowed deviation from expected shape of a peak in chromatographic direction. - + Noise level: The minimum intensity level for a data point to be considered part of a chromatogram. All data points below this intensity level are ignored. 
- + MZ Tolerance: Maximum allowed m/z difference to find the peak - + RT tolerance: Maximum allowed retention time difference to find the peak """ # Set the noise floor idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'TargetedPeakDetectionModule' in d['@method']][0] - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Peak list file' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%s'%peak_list_filename @@ -545,7 +538,7 @@ def configure_targeted_peak_detection(new_d,peak_list_filename,intensity_toleran def configure_crop_filter(new_d,polarity,files,min_rt=0.01,max_rt=100,fps_string='FPS'): """ - + """ # identify the element for this change idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'CropFilterModule' in d['@method']][0] @@ -559,39 +552,39 @@ def configure_crop_filter(new_d,polarity,files,min_rt=0.01,max_rt=100,fps_string # Set the polarity idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['polarity'] = polarity.upper() - + #set the rt min and rt max use the same idx2 as polarity new_d['batch']['batchstep'][idx]['parameter'][idx2]['retention_time'] = {'max':'%.4f'%max_rt,'min':'%.4f'%min_rt} - + # new_d['batch']['batchstep'][idx]['parameter'][idx2]['ms_level'] = '1-2' return new_d def configure_mass_detection(new_d,ms1_noise_level=1e4,ms2_noise_level=1e2): """ - + """ # Find the module idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'MassDetectionModule' in d['@method']] #The first idx will be for MS1 and the second will be for MS2 - + # Set the MS1 attributes idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter']) if 'Mass detector' in d['@name']][0] idx3 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['module']) if 'Centroid' in d['@name']][0] new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['module'][idx3]['parameter']['#text'] = '%.2f'%(ms1_noise_level) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['ms_level'] = '1' - + # Set the MS2 attributes idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter']) if 'Mass detector' in d['@name']][0] idx3 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['module']) if 'Centroid' in d['@name']][0] new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['module'][idx3]['parameter']['#text'] = '%.2f'%(ms2_noise_level) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['ms_level'] = '2' - + return new_d def configure_smoothing(new_d,smoothing_scans): @@ -644,13 +637,13 @@ def configure_chromatogram_builder(new_d,min_num_scans,group_intensity_threshold idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) - + return new_d def configure_adap_peak_deconvolution(new_d,min_peak_height,minimum_relative_height,search_for_minimum_rt_range,chromatographic_threshold,min_sn_ratio,min_peak_duration,max_peak_duration): """ - + 3.0 @@ -671,7 +664,7 @@ def configure_adap_peak_deconvolution(new_d,min_peak_height,minimum_relative_hei 0.1 - + """ idx 
= [i for i,d in enumerate(new_d['batch']['batchstep']) if 'DeconvolutionModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Algorithm' in d['@name']][0] @@ -705,7 +698,7 @@ def configure_lms_peak_deconvolution(new_d,min_peak_height,minimum_relative_heig 1.0 - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'DeconvolutionModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Algorithm' in d['@name']][0] @@ -727,20 +720,20 @@ def configure_lms_peak_deconvolution(new_d,min_peak_height,minimum_relative_heig def configure_isotope_search(new_d,mz_tolerance,rt_tol_perfile,representative_isotope,remove_isotopes,polarity): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'Isotope' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Retention time tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%.3f'%(rt_tol_perfile) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Representative isotope' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%s'%(representative_isotope) idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Remove original peaklist' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%s'%(str(remove_isotopes).lower()) - + idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'Adduct' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) @@ -755,7 +748,7 @@ def configure_isotope_search(new_d,mz_tolerance,rt_tol_perfile,representative_is new_d['batch']['batchstep'][idx]['parameter'][idx2]['adduct'][i]['@selected'] = 'false' else: new_d['batch']['batchstep'][idx]['parameter'][idx2]['adduct'][i]['@selected'] = 'true' - + return new_d def configure_join_aligner(new_d,mz_tolerance,rt_tol_multifile): @@ -763,26 +756,26 @@ def configure_join_aligner(new_d,mz_tolerance,rt_tol_multifile): # Join aligner has these scores: # 3000.0 # 0.6 - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'JoinAlignerModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Retention time tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%.3f'%(rt_tol_multifile) - + # idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Minimum absolute intensity' in d['@name']][0] # new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = 3000#'%.3f'%(mz_tolerance) - + # idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Minimum score' in d['@name']][0] # new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = 0.6#'%.3f'%(rt_tol_multifile) - + return new_d def 
configure_rows_filter(new_d,min_peaks_in_row,peak_with_msms): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'RowsFilterModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Minimum peaks in a row' in d['@name']][0] @@ -795,7 +788,7 @@ def configure_rows_filter(new_d,min_peaks_in_row,peak_with_msms): def configure_duplicate_filter(new_d,mz_tolerance,rt_tol_perfile): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'DuplicateFilterModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] @@ -817,46 +810,46 @@ def configure_gap_filling(new_d,mz_tolerance,gapfill_intensity_tolerance,rt_tol_ # 0.03 # false # - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'gapfilling.peakfinder' in d['@method']][0] - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Intensity tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%.3f'%(gapfill_intensity_tolerance) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Retention time tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%.3f'%(rt_tol_multifile) - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) - - + + return new_d def configure_output(new_d,output_csv_height,output_csv_area,output_workspace,output_mgf): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'CSVExportModule' in d['@method']] #the first will be height the second will be area idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[0]]['parameter']) if 'Filename' in d['@name']][0] new_d['batch']['batchstep'][idx[0]]['parameter'][idx2]['#text'] = output_csv_height - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx[1]]['parameter']) if 'Filename' in d['@name']][0] new_d['batch']['batchstep'][idx[1]]['parameter'][idx2]['#text'] = output_csv_area - + idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'GNPSExportModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Filename' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = output_mgf - + idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'ProjectSaveAsModule' in d['@method']][0] new_d['batch']['batchstep'][idx]['parameter']['#text'] = output_workspace return new_d def configure_csv_output(new_d,output_csv): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'CSVExportModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Filename' in d['@name']][0] @@ -915,7 +908,7 @@ def get_latest_mzmine_binary(system='Cori',version='most_recent'): cp ../MZmine-2.24/startMZmine_NERSC_* . 
cd /project/projectdirs/metatlas/projects/ chgrp -R metatlas mzmine_parameters - chmod -R 770 mzmine_parameters + chmod -R 770 mzmine_parameters """ mzmine_versions = glob.glob(os.path.join(BINARY_PATH,'*' + os.path.sep)) if version == 'most_recent': @@ -931,11 +924,11 @@ def get_latest_mzmine_binary(system='Cori',version='most_recent'): def replace_files(d,file_list): """ Replace files for mzmine task - + Inputs: d: an xml derived dictionary of batch commands file_list: a list of full paths to mzML files - + Outputs: d: an xml derived dict with new files in it """ @@ -986,17 +979,17 @@ def _to_etree(d, root): if type(d) is {}.values().__class__: d = list(d.values) - - if isinstance(d, six.string_types): + + if isinstance(d, str): root.text = d elif isinstance(d, dict): for k,v in d.items(): - assert isinstance(k, six.string_types) + assert isinstance(k, str) if k.startswith('#'): - assert k == '#text' and isinstance(v, six.string_types) + assert k == '#text' and isinstance(v, str) root.text = v elif k.startswith('@'): - assert isinstance(v, six.string_types) + assert isinstance(v, str) root.set(k[1:], v) elif isinstance(v, list): for e in v: @@ -1079,7 +1072,7 @@ def path_reducer(k1, k2): return k2 else: return os.path.join(k1, k2) - + def tuple_splitter(flat_key): @@ -1169,8 +1162,8 @@ def _flatten(d, parent=None): # return # d = d.setdefault(key, {}) # nested_set_dict(d, keys[1:], value) - - + + def nested_set_dict(d, keys, value): """Set a value to a sequence of nested keys @@ -1226,7 +1219,7 @@ def unflatten(d, splitter='tuple', inverse=False): {'e4':'so_nested4b!!!'}, {'e4':'so_nested4c!!!'}, {'e4':'so_nested4d!!!'}, - {'e4':'so_nested4e!!!'}]}]}} + {'e4':'so_nested4e!!!'}]}]}} Returns ------- @@ -1234,7 +1227,7 @@ def unflatten(d, splitter='tuple', inverse=False): """ if isinstance(splitter, str): splitter = SPLITTER_DICT[splitter] - + kv = sorted([(k,v) for (k,v) in d.items()]) unflattened_dict = {} for kkvv in kv: @@ -1243,4 +1236,3 @@ def unflatten(d, splitter='tuple', inverse=False): nested_set_dict(unflattened_dict, key_tuple, value) return unflattened_dict - diff --git a/metatlas/untargeted/mzmine_helpers.py b/metatlas/untargeted/mzmine_helpers.py index 0e81e431..75d8b9c9 100644 --- a/metatlas/untargeted/mzmine_helpers.py +++ b/metatlas/untargeted/mzmine_helpers.py @@ -1,5 +1,3 @@ -from __future__ import print_function -from __future__ import absolute_import import numpy as np import sys import json @@ -17,13 +15,6 @@ from metatlas.helpers import metatlas_get_data_helper_fun as ma_data from metatlas.helpers import dill2plots as dp import six -from six.moves import map -from six.moves import range - -try: - six.string_types -except NameError: # python3 - six.string_types = str # setting this very high easily causes out of memory NUM_THREADS = 12 @@ -128,7 +119,7 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False A simplified logic is: is max intensity near the rt_peak? if yes, is it decaying by 10x +/- 0.5 minutes. Obviously, peaks that are +/- 0.5 minutes appart will get removed. Thus, find local minimum +/- 0.5 minutes from rt_peak. Set those as rt_min and rt_max. Third remove no MS/MS exclude if no feature in non-blank samples has msms - + Last: only keep n_peaks by max intensity in any sample. 
""" @@ -136,7 +127,7 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False if not json_filename: json_filename = sys.argv[1] - with open(json_filename) as data_file: + with open(json_filename) as data_file: params = json.load(data_file) file_to_convert = os.path.join(params['basedir'],'intermediate_results','%s_%s.csv'%(params['basename'],params['polarity'])) print('# Working on %s %s'%(params['basename'],params['polarity'])) @@ -150,9 +141,9 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False if do_test == True: #if only a test, just do a couple of features df = df.head(10) - + df.to_csv(file_to_convert.replace('.csv','') + '_formatted.csv',index=True) #save a simplified mzmine-like csv as a backup of all features found. - + #Filter features found in the blank df = clean_up_mzmine_dataframe(df) df_blank_compare = df.transpose().groupby(['b' if any([s in g.lower() for s in params['blank_str']]) else 's' for g in df.columns]).max().transpose() @@ -165,8 +156,8 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False df_features_not_in_blank = df_blank_compare df_features_not_in_blank.reset_index(inplace=True) print('#There are now %d features not in blank'%df_features_not_in_blank.shape[0]) - - + + #Make an Atlas print('making new atlas') cids = [] @@ -180,11 +171,11 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False cids.append(my_id) my_atlas = metob.Atlas(name='untargeted atlas',compound_identifications=cids) atlas_df = ma_data.make_atlas_df(my_atlas) - + #Make Groups print('making groups') all_files = [f.replace('Peak height','').replace('filtered','').strip() for f in df.columns if '.mzML' in f] - metatlas_files = [] + metatlas_files = [] for f in all_files: f = metob.retrieve('Lcmsruns',name=f,username='*')[-1] if isinstance(f,type(metob.LcmsRun())): @@ -193,7 +184,7 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False print('%s NOT FOUND'%f) break groups = metob.Group(name='untargeted group',items=metatlas_files) - + #Get Data print('getting data') print('using',NUM_THREADS,'cores') @@ -247,7 +238,7 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False atlas_df = atlas_df[~atlas_df.index.isin(bads)] my_atlas.compound_identifications = [my_atlas.compound_identifications[idx] for idx in atlas_df.index[~atlas_df.index.isin(bads)].tolist()] - #Get Data + #Get Data print('getting data after duplicate peak removal') print('using',NUM_THREADS,'cores') all_files = [] @@ -294,17 +285,17 @@ def clean_and_filter_mzmine_output(json_filename=None,n_peaks=3000,do_test=False print('# Getting data 1') make_figures_from_filtered_data(params,all_files,my_atlas) - + def peak_height_df(metatlas_dataset,attribute='peak_height',zero_nans=True): """ - Turn a list of lists in a metatlas dataset into a + Turn a list of lists in a metatlas dataset into a peak height dataframe where rows are features and columns are samples - + Valid attributes are:'mz_centroid','mz_peak', 'num_ms1_datapoints','peak_area','peak_height', 'rt_centroid','rt_peak' - + infs, nans, and nulls are converted to zero by default. 
""" d = [] @@ -323,7 +314,7 @@ def peak_height_df(metatlas_dataset,attribute='peak_height',zero_nans=True): def peak_in_top_n(metatlas_dataset,n_peaks=1000,prior_boolean=None): """ - + """ df = peak_height_df(metatlas_dataset) if prior_boolean is not None: #make dataframe @@ -340,22 +331,22 @@ def peak_in_top_n(metatlas_dataset,n_peaks=1000,prior_boolean=None): def metatlas_formatted_atlas_from_mzmine_output(filename,polarity,make_atlas=True,atlas_name=None, do_store=False,min_rt=None,max_rt=None,min_mz=None,mz_tolerance=8, max_mz=None,remove_adducts=False,remove_fragments=False,remove_clusters=False,max_duration=1.0): - # + # ''' Turn mzmine output into conforming metatlas_atlas input - + Input: filename: csv file from mzmine output polarity: (positive,negative) atlas_name: string describing the atlas. useful incase you want to save it later. - + Output: atlas_df: dataframe of atlas content myAtlas: metatlas atlas object (if make_atlas=True) mzmine_df: dataframe of all mzmine info (if make_atlas=False) ''' - + mzmine_df = pd.read_csv(filename) if min_rt: mzmine_df = mzmine_df[mzmine_df['row retention time']>min_rt] @@ -378,7 +369,7 @@ def clean_adducts(x): new_x = ';'.join( [s.strip() for s in pd.unique(x.split(';'))] ) - return x + return x metatlas_atlas = pd.DataFrame() metatlas_atlas['label'] = mzmine_df.apply(lambda x: '%.4f@%.2f'%(x['row m/z'],x['row retention time']),axis=1) @@ -393,8 +384,8 @@ def clean_adducts(x): metatlas_atlas['rt_max'] = mzmine_df[rt_min_cols].apply(lambda x: x.max(),axis=1) metatlas_atlas['inchi_key'] = None metatlas_atlas['detected_polarity'] = polarity - - #tuplize the 'Identification method' and 'Name' from adducts and fragments + + #tuplize the 'Identification method' and 'Name' from adducts and fragments # stick on the peak height columns pk_height = [col for col in list(mzmine_df) if 'Peak height' in col] @@ -465,7 +456,7 @@ def make_task_and_job(params):#basedir,basename,polarity,files): def create_job_script(m): """ - + This is the first function that runs when a user initializes a new untargeted workflow """ @@ -514,7 +505,7 @@ def create_job_script(m): if not m['small_mzmine_done']: fid.write('%s\n'%job_cmd_filtered) # fid.write('%s\n'%second_python_string) - + bad_words = ['qos', '-p','-C','-L','-t','-N'] bad_time = '#SBATCH -t 24:00:00' @@ -594,12 +585,12 @@ def get_files(groups,filename_substring,file_filters,is_group=False): return files - + def clean_up_mzmine_dataframe(df): """ remove a few stray columns and set index to metatlas like attributes. - + this leaves a metatlas-like index and peak-height columns as the only thing remaining. """ #df['rt_min'] = df['rt_peak'] - 0.2 @@ -608,7 +599,7 @@ def clean_up_mzmine_dataframe(df): index_columns = ['mz','rt_peak','label','mz_tolerance','rt_min','rt_max','inchi_key','detected_polarity','adduct_assignments'] df.set_index(index_columns,inplace=True) df = df[sorted(df.columns)] - + return df def rt_checker(met_data,atlas_df,compound_idx,params): @@ -623,7 +614,7 @@ def rt_checker(met_data,atlas_df,compound_idx,params): def min_checker(met_data,atlas_df,compound_idx,params): """ - looks forward and backward by rt_timespan and requires that the measured peak height be + looks forward and backward by rt_timespan and requires that the measured peak height be greater than minima. 
""" try: @@ -711,7 +702,7 @@ def make_targeted_mzmine_job(basedir,basename,polarity,files): # task.output_csv = os.path.join(basedir,'intermediate_results','%s_%s_filtered.csv'%(basename,task.polarity)) task.output_workspace = os.path.join(basedir,project_name,'%s_%s.mzmine'%(basename,task.polarity)) task.input_xml = os.path.join(basedir,'logs','%s_%s_filtered.xml'%(basename,task.polarity)) - + # peak_list_filename = os.path.join(basedir,'intermediate_results','%s_%s_formatted_peakfiltered.csv'%(basename,polarity)) task.mzmine_launcher = get_latest_mzmine_binary() @@ -728,25 +719,25 @@ def make_targeted_mzmine_job(basedir,basename,polarity,files): def configure_targeted_peak_detection(new_d,peak_list_filename,intensity_tolerance=1e-4,noise_level=1e4,mz_tolerance=20,rt_tolerance=0.5): """ Name suffix: Suffix to be added to the peak list name. - - Peak list file: Path of the csv file containing the list of peaks to be detected. The csv file should have three columns. + + Peak list file: Path of the csv file containing the list of peaks to be detected. The csv file should have three columns. The first column should contain the expected M/Z, the second column the expected RT and the third the peak name. Each peak should be in a different row. - + Field separator: Character(s) used to separate fields in the peak list file. - + Ignore first line: Check to ignore the first line of peak list file. - + Intensity tolerance: This value sets the maximum allowed deviation from expected shape of a peak in chromatographic direction. - + Noise level: The minimum intensity level for a data point to be considered part of a chromatogram. All data points below this intensity level are ignored. - + MZ Tolerance: Maximum allowed m/z difference to find the peak - + RT tolerance: Maximum allowed retention time difference to find the peak """ # Set the noise floor idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'TargetedPeakDetectionModule' in d['@method']][0] - + idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Peak list file' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['#text'] = '%s'%peak_list_filename @@ -766,11 +757,11 @@ def configure_targeted_peak_detection(new_d,peak_list_filename,intensity_toleran def configure_crop_filter(new_d,polarity,files): """ - + """ # Set the noise floor idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'CropFilterModule' in d['@method']][0] - + # Set the filter string idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Raw data files' in d['@name']][0] if any(['FPS' in f for f in files]): @@ -788,25 +779,25 @@ def configure_crop_filter(new_d,polarity,files): def configure_mass_detection(new_d,noise_floor,polarity): """ - + """ # Set the noise floor idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'MassDetectionModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Mass detector' in d['@name']][0] idx3 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter'][idx2]['module']) if 'Centroid' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['module'][idx3]['parameter']['#text'] = '%.2f'%(noise_floor) - + # Set the polarity idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['polarity'] = polarity.upper() 
new_d['batch']['batchstep'][idx]['parameter'][idx2]['ms_level'] = '1' - + return new_d def configure_chromatogram_builder(new_d,min_peak_duration,min_peak_height,mz_tolerance,polarity,min_rt,max_rt): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'ChromatogramBuilderModule' in d['@method']][0] new_d['batch']['batchstep'][idx]['parameter'] @@ -819,12 +810,12 @@ def configure_chromatogram_builder(new_d,min_peak_duration,min_peak_height,mz_to idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['ppmtolerance'] = '%.3f'%(mz_tolerance) - - + + # Set the polarity idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Scans' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['polarity'] = polarity.upper() - + new_d['batch']['batchstep'][idx]['parameter'][idx2]['retention_time']['min'] = '%.3f'%min_rt new_d['batch']['batchstep'][idx]['parameter'][idx2]['retention_time']['max'] = '%.3f'%max_rt @@ -833,7 +824,7 @@ def configure_chromatogram_builder(new_d,min_peak_duration,min_peak_height,mz_to def configure_peak_deconvolution(new_d,min_peak_height,min_sn_ratio,min_peak_duration,max_peak_duration): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'DeconvolutionModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Algorithm' in d['@name']][0] @@ -851,17 +842,17 @@ def configure_peak_deconvolution(new_d,min_peak_height,min_sn_ratio,min_peak_dur idx4 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter'][idx2]['module'][idx3]['parameter']) if 'Peak duration range (min)' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['module'][idx3]['parameter'][idx4]['min'] = '%.3f'%min_peak_duration new_d['batch']['batchstep'][idx]['parameter'][idx2]['module'][idx3]['parameter'][idx4]['max'] = '%.3f'%max_peak_duration - + #following deconvolution, many small peaks are created. 
Filter them out here idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'PeakFilterModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Height' in d['@name']][0] new_d['batch']['batchstep'][idx]['parameter'][idx2]['min'] = '%.3f'%(min_peak_height) - + return new_d def configure_isotope_adduct_fragment_search(new_d,mz_tolerance,rt_tol_perfile,polarity,min_peak_height): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'Isotope' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] @@ -906,7 +897,7 @@ def configure_isotope_adduct_fragment_search(new_d,mz_tolerance,rt_tol_perfile,p def configure_join_aligner(new_d,mz_tolerance,rt_tol_multifile): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'JoinAlignerModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] @@ -917,7 +908,7 @@ def configure_join_aligner(new_d,mz_tolerance,rt_tol_multifile): def configure_duplicate_filter(new_d,mz_tolerance,rt_tol_perfile): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'DuplicateFilterModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] @@ -928,7 +919,7 @@ def configure_duplicate_filter(new_d,mz_tolerance,rt_tol_perfile): def configure_gap_filling(new_d,mz_tolerance): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'SameRangeGapFillerModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'm/z tolerance' in d['@name']][0] @@ -937,7 +928,7 @@ def configure_gap_filling(new_d,mz_tolerance): def configure_workspace_output(new_d,output_workspace): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'ProjectSaveModule' in d['@method']][0] new_d['batch']['batchstep'][idx]['parameter']['#text'] = output_workspace @@ -945,7 +936,7 @@ def configure_workspace_output(new_d,output_workspace): def configure_csv_output(new_d,output_csv): """ - + """ idx = [i for i,d in enumerate(new_d['batch']['batchstep']) if 'CSVExportModule' in d['@method']][0] idx2 = [i for i,d in enumerate(new_d['batch']['batchstep'][idx]['parameter']) if 'Filename' in d['@name']][0] @@ -1001,7 +992,7 @@ def get_latest_mzmine_binary(system='Cori',version='most_recent'): cp ../MZmine-2.24/startMZmine_NERSC_* . 
cd /project/projectdirs/metatlas/projects/ chgrp -R metatlas mzmine_parameters - chmod -R 770 mzmine_parameters + chmod -R 770 mzmine_parameters """ mzmine_versions = glob.glob(os.path.join(BINARY_PATH,'*')) if version == 'most_recent': @@ -1017,11 +1008,11 @@ def get_latest_mzmine_binary(system='Cori',version='most_recent'): def replace_files(d,file_list): """ Replace files for mzmine task - + Inputs: d: an xml derived dictionary of batch commands file_list: a list of full paths to mzML files - + Outputs: d: an xml derived dict with new files in it """ @@ -1065,23 +1056,24 @@ def dict_to_etree(d): def _to_etree(d, root): if not d: pass - elif isinstance(d, six.string_types): + elif isinstance(d, str): root.text = d elif isinstance(d, dict): for k,v in d.items(): - assert isinstance(k, six.string_types) + assert isinstance(k, str) if k.startswith('#'): - assert k == '#text' and isinstance(v, six.string_types) + assert k == '#text' and isinstance(v, str) root.text = v elif k.startswith('@'): - assert isinstance(v, six.string_types) + assert isinstance(v, str) root.set(k[1:], v) elif isinstance(v, list): for e in v: _to_etree(e, ET.SubElement(root, k)) else: _to_etree(v, ET.SubElement(root, k)) - else: assert d == 'invalid type', (type(d), d) + else: + assert d == 'invalid type', (type(d), d) assert isinstance(d, dict) and len(d) == 1 tag, body = next(iter(d.items())) node = ET.Element(tag) diff --git a/noxfile.py b/noxfile.py index 91c3360a..96001399 100644 --- a/noxfile.py +++ b/noxfile.py @@ -9,6 +9,7 @@ "flake8", "black", "pylint-3.8", + "mypy-3.8", "unit_tests-3.8", "flake8_nb", "black_nb", @@ -53,6 +54,15 @@ "toml==0.10.2", ] +mypy_deps = [ + "mypy==0.910", + "types-PyYAML", + "types-requests", + "types-simplejson", + "types-six", + "types-tabulate", +] + pylint_deps = [ "pylint==2.8.2", "pytest==6.2.4", # so "import pytest" doesn't get reported @@ -117,6 +127,12 @@ def blacken(session): session.run("black", *more_checks) +@nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV) +def mypy(session): + session.install("-r", "docker/requirements.txt", *mypy_deps) + session.run("mypy", "metatlas") + + @nox.session(python=py_versions, reuse_venv=REUSE_LARGE_VENV) def pylint(session): session.install("-r", "docker/requirements.txt", *pylint_deps) diff --git a/pyproject.toml b/pyproject.toml index 8793bcf8..c2949d43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,46 @@ [tool.black] line-length = 110 +[[tool.mypy.overrides]] +module = [ + "colorama.*", + "dataset.*", + "dill.*", + "gspread.*", + "humanize.*", + "h5py.*", + "ijson.*", + "IPython.*", + "ipywidgets.*", + "labkey.*", + "matplotlib.*", + "metatlas.h5_query.*", + "metatlas.helpers.*", + "metatlas.kbase.*", + "metatlas.metatlas_objects.*", + "metatlas.mzml_loader.*", + "metatlas.plotting.*", + "metatlas.object_helpers.*", + "networkx.*", + "numpy.fft.fftpack", + "numpy.testing.decorators", + "oauth2client.*", + "pandas.*", + "pathlib2.*", + "pexpect.*", + "PIL.*", + "pymzml.*", + "pyteomics.*", + "qgrid.*", + "rdkit.*", + "scipy.*", + "sklearn.*", + "tables.*", + "tqdm.*", + "traitlets.*" +] +ignore_missing_imports = true + [tool.pylint.messages_control] # first two are for black compatibility # duplicate-code cannot be disabled on per file/block/line From ff3aa8481223aa51e397147e3f745272f06b81c5 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 26 Aug 2021 13:24:34 -0700 Subject: [PATCH 110/177] Improve type annotations in metatlas_dataset.py also move or_default to test/utils.py --- 
metatlas/datastructures/metatlas_dataset.py | 260 +++++++++++--------- metatlas/plots/dill2plots.py | 15 +- metatlas/tools/util.py | 10 + noxfile.py | 1 + tests/unit/test_metatlas_dataset.py | 2 +- 5 files changed, 157 insertions(+), 131 deletions(-) create mode 100644 metatlas/tools/util.py diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index a86ba65e..d8779c22 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -6,14 +6,16 @@ import os import shutil -from typing import Any, Dict, List +from typing import Any, Dict, List, NewType, Optional, Tuple, TypedDict +from typing import cast import humanize import pandas as pd import traitlets -from traitlets import HasTraits, TraitError, default, observe, validate -from traitlets import Bool, Float, Instance, Int, Tuple, Unicode +from traitlets import TraitError, default, observe, validate +from traitlets import Bool, Float, HasTraits, Instance, Int, TraitType, Unicode +from traitlets.traitlets import ObserveHandler from metatlas.datastructures import metatlas_objects as metob from metatlas.datastructures import object_helpers as metoh @@ -22,44 +24,61 @@ from metatlas.io import write_utils from metatlas.plots import dill2plots as dp from metatlas.tools import parallel - -MSMS_REFS_PATH: str = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" -DEFAULT_GROUPS_CONTROLLED_VOCAB: List[str] = ["QC", "InjBl", "ISTD"] -OUTPUT_TYPES: List[str] = ["ISTDsEtc", "FinalEMA-HILIC", "data_QC"] -POLARITIES: List[str] = ["positive", "negative", "fast-polarity-switching"] -SHORT_POLARITIES: Dict[str, str] = {"positive": "POS", "negative": "NEG", "fast-polarity-switching": "FPS"} +from metatlas.tools.util import or_default + +FileMatchList = NewType('FileMatchList', List[str]) +GroupMatchList = NewType('GroupMatchList', List[str]) +GroupList = NewType('GroupList', Optional[List[metob.Group]]) +LcmsRunsList = NewType('LcmsRunsList', Optional[List[metob.LcmsRun]]) +Polarity = NewType('Polarity', str) +ShortPolarity = NewType('ShortPolarity', str) +Experiment = NewType('Experiment', str) +OutputType = NewType('OutputType', str) +AnalysisNumber = NewType('AnalysisNumber', int) +AtlasName = NewType('AtlasName', str) +PathString = NewType('PathString', str) +Username = NewType('Username', str) + +MSMS_REFS_PATH = PathString("/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab") +DEFAULT_GROUPS_CONTROLLED_VOCAB = GroupMatchList(["QC", "InjBl", "ISTD"]) +OUTPUT_TYPES = [OutputType("ISTDsEtc"), OutputType("FinalEMA-HILIC"), OutputType("data_QC")] +POLARITIES = [Polarity("positive"), Polarity("negative"), Polarity("fast-polarity-switching")] +SHORT_POLARITIES = {Polarity("positive"): ShortPolarity("POS"), Polarity("negative"): ShortPolarity("NEG"), Polarity("fast-polarity-switching"): ShortPolarity("FPS")} logger = logging.getLogger(__name__) +class Proposal(TypedDict): + """ for use with traitlets.validate """ + owner: HasTraits + value: object + trait: TraitType + + class AnalysisIdentifiers(HasTraits): """Names used in generating an analysis""" - - source_atlas: str = Unicode(allow_none=True, default_value=None) - experiment: str = Unicode() - output_type: str = Unicode() - polarity: str = Unicode(default_value="positive") - analysis_number: int = Int(default_value=0) - username: str = Unicode(default_value=getpass.getuser()) - project_directory: str = Unicode() + source_atlas: Optional[AtlasName] 
= Unicode(allow_none=True, default_value=None) + experiment: Experiment = Unicode() + output_type: OutputType = Unicode() + polarity: Polarity = Unicode(default_value="positive") + analysis_number: AnalysisNumber = Int(default_value=0) + username: Username = Unicode(default_value=getpass.getuser()) + project_directory: PathString = Unicode() google_folder: str = Unicode() - exclude_files: List[str] = traitlets.List(trait=Unicode(), allow_none=True, default_value=[]) - include_groups: List[str] = traitlets.List(allow_none=True, default_value=None) - exclude_groups: List[str] = traitlets.List(allow_none=True, default_value=None) - groups_controlled_vocab: List[str] = traitlets.List( - trait=Unicode(), allow_none=True, default_value=DEFAULT_GROUPS_CONTROLLED_VOCAB + exclude_files: FileMatchList = traitlets.List(trait=Unicode(), default_value=[]) + include_groups: GroupMatchList = traitlets.List() + exclude_groups: GroupMatchList = traitlets.List() + groups_controlled_vocab: GroupMatchList = traitlets.List( + trait=Unicode(), default_value=DEFAULT_GROUPS_CONTROLLED_VOCAB ) - _lcmsruns: List[metob.LcmsRun] = traitlets.List(allow_none=True, default_value=None) - _all_groups: List[metob.Group] = traitlets.List(allow_none=True, default_value=None) - _groups: List[metob.Group] = traitlets.List(allow_none=True, default_value=None) + _lcmsruns: LcmsRunsList = traitlets.List(allow_none=True, default_value=None) + _all_groups: GroupList = traitlets.List(allow_none=True, default_value=None) + _groups: GroupList = traitlets.List(allow_none=True, default_value=None) # pylint: disable=no-self-use def __init__(self, **kwargs) -> None: super().__init__(**kwargs) - if self.polarity == "positive": - self.exclude_groups.append("NEG") - elif self.polarity == "negative": - self.exclude_groups.append("POS") + self.exclude_groups = append_inverse(self.exclude_groups, self.polarity) logger.info( "IDs: source_atlas=%s, atlas=%s, short_experiment_analysis=%s, output_dir=%s", self.source_atlas, @@ -70,14 +89,14 @@ def __init__(self, **kwargs) -> None: self.store_all_groups(exist_ok=True) @default("include_groups") - def _default_include_groups(self) -> List[str]: + def _default_include_groups(self) -> List[OutputType]: if self.output_type == "data_QC": - return ["QC"] + return [OutputType("QC")] return [] @default("exclude_groups") - def _default_exclude_groups(self) -> List[str]: - out: List[str] = ["InjBl", "InjBL"] + def _default_exclude_groups(self) -> GroupMatchList: + out: GroupMatchList = ["InjBl", "InjBL"] if self.output_type != "data_QC": out.append("QC") if self.polarity == "positive": @@ -87,89 +106,92 @@ def _default_exclude_groups(self) -> List[str]: return out @validate("polarity") - def _valid_polarity(self, proposal: Dict[str, Any]) -> str: + def _valid_polarity(self, proposal: Proposal) -> Polarity: if proposal["value"] not in POLARITIES: raise TraitError(f"Parameter polarity must be one of {', '.join(POLARITIES)}") - return proposal["value"] + return cast(Polarity, proposal["value"]) @validate("output_type") - def _valid_output_type(self, proposal: Dict[str, Any]) -> str: + def _valid_output_type(self, proposal: Proposal) -> OutputType: if proposal["value"] not in OUTPUT_TYPES: raise TraitError(f"Parameter output_type must be one of {', '.join(OUTPUT_TYPES)}") - return proposal["value"] + return cast(OutputType, proposal["value"]) @validate("source_atlas") - def _valid_source_atlas(self, proposal: Dict[str, Any]) -> str: + def _valid_source_atlas(self, proposal: Proposal) -> AtlasName: if 
proposal["value"] is not None: try: get_atlas(proposal["value"], self.username) # raises error if not found or matches multiple except ValueError as err: raise TraitError(str(err)) from err - return proposal["value"] + return cast(AtlasName, proposal["value"]) @validate("analysis_number") - def _valid_analysis_number(self, proposal: Dict[str, Any]) -> int: - if proposal["value"] < 0: + def _valid_analysis_number(self, proposal: Proposal) -> AnalysisNumber: + value = cast(AnalysisNumber, proposal["value"]) + if value < 0: raise TraitError("Parameter analysis_number cannot be negative.") - return proposal["value"] + return value @validate("experiment") - def _valid_experiment(self, proposal): - if len(proposal["value"].split("_")) != 9: + def _valid_experiment(self, proposal: Proposal) -> Experiment: + value = cast(str, proposal["value"]) + if len(value.split("_")) != 9: raise TraitError('Parameter experiment does contain 9 fields when split on "_".') - return proposal["value"] + return cast(Experiment, value) @property - def _exp_tokens(self): + def _exp_tokens(self) -> List[str]: """Returns list of strings from the experiment name""" return self.experiment.split("_") @property - def project(self): - """Returns project number (proposal id)""" - return self._exp_tokens[3] + def project(self) -> int: + """Returns project number (proposal id) """ + return int(self._exp_tokens[3]) @property - def atlas(self): + def atlas(self) -> AtlasName: """Atlas identifier (name)""" - return f"{'_'.join(self._exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}" + return AtlasName(f"{'_'.join(self._exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}") @property - def analysis(self): + def analysis(self) -> str: """Analysis identifier""" return f"{self.username}{self.analysis_number}" @property - def short_experiment_analysis(self): + def short_experiment_analysis(self) -> str: """Short experiment analysis identifier""" return f"{self._exp_tokens[0]}_{self._exp_tokens[3]}_{self.output_type}_{self.analysis}" @property - def short_polarity(self): + def short_polarity(self) -> ShortPolarity: """Short polarity identifier: 3 letters, upper case""" return SHORT_POLARITIES[self.polarity] @property - def short_polarity_inverse(self): + def short_polarity_inverse(self) -> List[ShortPolarity]: """Returns the short_polarity values not used in this analysis""" return list(set(SHORT_POLARITIES.values()) - {self.short_polarity}) @property - def output_dir(self): + def output_dir(self) -> PathString: """Creates the output directory and returns the path as a string""" out = os.path.join(self.project_directory, self.experiment, self.analysis, self.output_type) os.makedirs(out, exist_ok=True) - return out + return PathString(out) @property - def lcmsruns(self): + def lcmsruns(self) -> List[metob.LcmsRun]: """Get LCMS runs from DB matching experiment""" if self._lcmsruns is not None: return self._lcmsruns all_lcmsruns = dp.get_metatlas_files(experiment=self.experiment, name="%") - if len(self.exclude_files) > 0: + if self.exclude_files is not None and len(self.exclude_files) > 0: self._lcmsruns = [ - r for r in all_lcmsruns if not any(map(r.name.__contains__, self.exclude_files)) + r for r in all_lcmsruns + if not any(map(r.name.__contains__, or_default(self.exclude_files, []))) ] logger.info( "Excluding %d LCMS runs containing any of: %s", @@ -184,11 +206,11 @@ def lcmsruns(self): return self._lcmsruns @property - def lcmsruns_dataframe(self): + def lcmsruns_dataframe(self) -> 
pd.DataFrame: """Returns a pandas DataFrame with lcmsrun matching self.experiment""" return metob.to_dataframe(self.lcmsruns) - def get_lcmsruns_short_names(self, fields=None): + def get_lcmsruns_short_names(self, fields: Optional[Dict[str, List[int]]] = None) -> pd.DataFrame: """ Querys DB for lcms filenames from self.experiment and returns a pandas DataFrame containing identifiers for each file @@ -198,7 +220,7 @@ def get_lcmsruns_short_names(self, fields=None): """ if fields is None: fields = { - "full_filename": range(16), + "full_filename": list(range(16)), "sample_treatment": [12], "short_filename": [0, 2, 4, 5, 7, 9, 14], "short_samplename": [9, 12, 13, 14], @@ -217,9 +239,9 @@ def get_lcmsruns_short_names(self, fields=None): out.set_index("full_filename", inplace=True) return out.sort_values(by="full_filename") - lcmsruns_short_names = property(get_lcmsruns_short_names) + lcmsruns_short_names: pd.DataFrame = property(get_lcmsruns_short_names) - def write_lcmsruns_short_names(self): + def write_lcmsruns_short_names(self) -> None: """Write short names and raise error if exists and differs from current data""" short_names = self.lcmsruns_short_names short_names["full_filename"] = short_names.index @@ -231,87 +253,85 @@ def write_lcmsruns_short_names(self): ) @property - def _files_dict(self): + def _files_dict(self) -> Dict[str, Dict[str, Any]]: """ Queries DB for all lcmsruns matching the class properties. Returns a dict of dicts where keys are filenames minus extensions and values are dicts with keys: object, group, and short_name """ - file_dict = {} + file_dict: Dict[str, Dict[str, Any]] = {} for lcms_file in self.lcmsruns: - base_name = lcms_file.name.split(".")[0] + base_name: str = lcms_file.name.split(".")[0] file_dict[base_name] = {"object": lcms_file, **self.group_name(base_name)} return file_dict @property - def groups(self): + def groups(self) -> List[metob.Group]: """Return the currently selected groups""" if self._groups is not None: return self._groups out = dp.filter_metatlas_objects_to_most_recent(self.all_groups, "name") - if len(self.include_groups) > 0: + if self.include_groups is not None and len(self.include_groups) > 0: out = dp.filter_metatlas_objects_by_list(out, "name", self.include_groups) - if len(self.exclude_groups) > 0: + if self.exclude_groups is not None and len(self.exclude_groups) > 0: out = dp.remove_metatlas_objects_by_list(out, "name", self.exclude_groups) self._groups = dp.filter_empty_metatlas_objects(out, "items") return self._groups @observe("polarity") - def _observe_polarity(self, signal): + def _observe_polarity(self, signal: ObserveHandler) -> None: if signal.type == "change": - if signal.new == "positive": - self.exclude_groups.append("NEG") - elif signal.new == "negative": - self.exclude_groups.append("POS") + self.exclude_groups = append_inverse(self.exclude_groups, signal.new) logger.debug("Change to polarity invalidates exclude_groups") @observe("_all_groups") - def _observe_all_groups(self, signal): + def _observe_all_groups(self, signal: ObserveHandler) -> None: if signal.type == "change": self._groups = None logger.debug("Change to all_groups invalidates groups") @observe("groups_controlled_vocab") - def _observe_groups_controlled_vocab(self, signal): + def _observe_groups_controlled_vocab(self, signal: ObserveHandler) -> None: if signal.type == "change": self._lcmsruns = None logger.debug("Change to groups_controlled_vocab invalidates lcmsruns") @observe("include_groups") - def _observe_include_groups(self, signal): + def 
_observe_include_groups(self, signal: ObserveHandler) -> None: if signal.type == "change": self._groups = None logger.debug("Change to include_groups invalidates groups") @observe("exclude_groups") - def _observe_exclude_groups(self, signal): + def _observe_exclude_groups(self, signal: ObserveHandler) -> None: if signal.type == "change": self._groups = None logger.debug("Change to exclude_groups invalidates groups") @observe("exclude_files") - def _observe_exclude_files(self, signal): + def _observe_exclude_files(self, signal: ObserveHandler) -> None: if signal.type == "change": self._lcmsruns = None logger.debug("Change to exclude_files invalidates lcmsruns") @observe("_lcmsruns") - def _observe_lcmsruns(self, signal): + def _observe_lcmsruns(self, signal: ObserveHandler) -> None: if signal.type == "change": self._all_groups = None logger.debug("Change to lcmsruns invalidates all_groups") @property - def existing_groups(self): + def existing_groups(self) -> List[metob.Group]: """Get your own groups that are prefixed by self.experiment""" return metob.retrieve("Groups", name=f"{self.experiment}%{self.analysis}_%", username=self.username) - def group_name(self, base_filename): + def group_name(self, base_filename: str) -> Dict[str, str]: """Returns dict with keys group and short_name corresponding to base_filename""" tokens = base_filename.split("_") prefix = "_".join(tokens[:11]) indices = [ - i for i, s in enumerate(self.groups_controlled_vocab) if s.lower() in base_filename.lower() + i for i, s in enumerate(or_default(self.groups_controlled_vocab, [])) + if s.lower() in base_filename.lower() ] suffix = self.groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12] group_name = f"{prefix}_{self.analysis}_{suffix}" @@ -319,7 +339,7 @@ def group_name(self, base_filename): return {"group": group_name, "short_name": short_name} @property - def all_groups_dataframe(self): + def all_groups_dataframe(self) -> pd.DataFrame: """Returns pandas Dataframe with one row per file""" out = pd.DataFrame(self._files_dict).T if out.empty: @@ -329,7 +349,7 @@ def all_groups_dataframe(self): return out.reset_index() @property - def all_groups(self): + def all_groups(self) -> List[metob.Group]: """Returns a list of Group objects""" if self._all_groups is not None: return self._all_groups @@ -349,7 +369,7 @@ def all_groups(self): ) return self._all_groups - def store_all_groups(self, exist_ok=False): + def store_all_groups(self, exist_ok: bool = False) -> None: """ Save self.object_list to DB inputs: @@ -373,6 +393,23 @@ def store_all_groups(self, exist_ok=False): metob.store(self.all_groups) +class MetatlasSample: + """ + Object oriented interface to second level of metatlas_dataset. Each instance is one sample (LCMS run). 
+ """ + + def __init__(self, data): + self._data = data + + def __getitem__(self, idx): + """get sample at idx""" + return self._data[idx] + + def __len__(self): + """len is from data""" + return len(self._data) + + class MetatlasDataset(HasTraits): """ Like the non-object oriented metatlas_dataset, you can index into this class by file_idx and compound_idx: @@ -398,18 +435,18 @@ class MetatlasDataset(HasTraits): save_metadata: if True, write metadata files containing data sources and LCMS runs short name """ - extra_time = Float(default_value=0.75) - extra_mz = Float(default_value=0) - frag_mz_tolerance = Float(default_value=0.01) - max_cpus = Int(default_value=1) - save_metadata = Bool(default_value=True) - keep_nonmatches = Bool(default_value=True) - msms_refs_loc = Unicode(default_value=MSMS_REFS_PATH) - ids = Instance(klass=AnalysisIdentifiers) - atlas = Instance(klass=metob.Atlas, allow_none=True, default_value=None) - _atlas_df = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) - _data = Tuple(allow_none=True, default_value=None) - _hits = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) + extra_time: float = Float(default_value=0.75) + extra_mz: float = Float(default_value=0) + frag_mz_tolerance: float = Float(default_value=0.01) + max_cpus: int = Int(default_value=1) + save_metadata: bool = Bool(default_value=True) + keep_nonmatches: bool = Bool(default_value=True) + msms_refs_loc: str = Unicode(default_value=MSMS_REFS_PATH) + ids: AnalysisIdentifiers = Instance(klass=AnalysisIdentifiers) + atlas: Optional[metob.Atlas] = Instance(klass=metob.Atlas, allow_none=True, default_value=None) + _atlas_df: Optional[pd.DataFrame] = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) + _data: Optional[Tuple[MetatlasSample, ...]] = traitlets.Tuple(allow_none=True, default_value=None) + _hits: Optional[pd.DataFrame] = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) # pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-public-methods, no-self-use def __init__(self, **kwargs): @@ -863,23 +900,6 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): targeted_output.copy_outputs_to_google_drive(self.ids) -class MetatlasSample: - """ - Object oriented interface to second level of metatlas_dataset. Each instance is one sample (LCMS run). 
- """ - - def __init__(self, data): - self._data = data - - def __getitem__(self, idx): - """get sample at idx""" - return self._data[idx] - - def __len__(self): - """len is from data""" - return len(self._data) - - def _duration_since(start): """ inputs: @@ -964,3 +984,9 @@ def get_atlas(name, username): def quoted_string_list(strings): """Adds double quotes around each string and seperates with ', '.""" return ", ".join([f'"{x}"' for x in strings]) + + +def append_inverse(in_list: List[str], polarity: str): + """ appends short version of inverse of polarity to and retuns the list """ + inverse = {'positive': 'NEG', 'negative': 'POS'} + return in_list + [inverse[polarity]] if polarity in inverse.keys() else in_list diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 53de0b54..a5fbac0e 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function import logging import sys import os @@ -50,9 +48,9 @@ # from oauth2client.client import SignedJwtAssertionCredentials from oauth2client.service_account import ServiceAccountCredentials +from metatlas.tools.util import or_default + import six -from six.moves import range -from six.moves import zip from functools import reduce from io import StringIO @@ -3275,12 +3273,3 @@ def rt_range_overlaps(rt1, rt2): """ return ((rt2.rt_min <= rt1.rt_min <= rt2.rt_max) or (rt2.rt_min <= rt1.rt_max <= rt2.rt_max) or (rt1.rt_min <= rt2.rt_min <= rt1.rt_max) or (rt1.rt_min <= rt2.rt_max <= rt1.rt_max)) - - -def or_default(none_or_value, default): - """ - inputs: - none_or_value: variable to test - default: value to return if none_or_value is None - """ - return none_or_value if none_or_value is not None else default diff --git a/metatlas/tools/util.py b/metatlas/tools/util.py new file mode 100644 index 00000000..960f5584 --- /dev/null +++ b/metatlas/tools/util.py @@ -0,0 +1,10 @@ +""" stand alone utility functions """ + + +def or_default(none_or_value, default): + """ + inputs: + none_or_value: variable to test + default: value to return if none_or_value is None + """ + return none_or_value if none_or_value is not None else default diff --git a/noxfile.py b/noxfile.py index 96001399..235530d3 100644 --- a/noxfile.py +++ b/noxfile.py @@ -30,6 +30,7 @@ "metatlas/tools/notebook.py", "metatlas/tools/predict_rt.py", "metatlas/tools/parallel.py", + "metatlas/tools/util.py", "tests", ] diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 1fb9eb64..ff9e6b8e 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -676,7 +676,7 @@ def test_include_groups01(sqlite_with_atlas, username, lcmsrun, mocker, groups_c def test_project01(analysis_ids): - assert analysis_ids.project == "505892" + assert analysis_ids.project == 505892 def test_exclude_files01(analysis_ids): From 6b8d908b1178297114b0608df35fe18bab79ac34 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 27 Aug 2021 23:08:07 -0700 Subject: [PATCH 111/177] improve group handling in output generation --- metatlas/tools/predict_rt.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index 08f82108..7156d514 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -106,10 +106,10 @@ def generate_rt_correction_models( model_only: If True, do not create atlases or notebooks, if False create them """ # 
pylint: disable=too-many-locals - metatlas_dataset = mads.MetatlasDataset(ids=ids, save_metadata=False) - groups = get_groups(metatlas_dataset) + groups = get_groups(ids) files_df = get_files_df(groups) - qc_atlas, qc_atlas_df = get_qc_atlas(metatlas_dataset.ids) + qc_atlas, qc_atlas_df = get_qc_atlas(ids) + # this metatlas_dataset is not a class instance. Only has metatlas_dataset[file_idx][compound_idx]... metatlas_dataset = load_runs(files_df, qc_atlas_df, qc_atlas, cpus) if len(metatlas_dataset) == 0: logger.error("No matching LCMS runs, terminating without generating outputs.") @@ -137,15 +137,13 @@ def generate_rt_correction_models( logger.info("RT correction notebook complete. Switch to Targeted notebook to continue.") -def get_groups(metatlas_dataset): +def get_groups(ids): """ Create all experiment groups if they don't already exist and return the subset matching include_list inputs: - metatlas_datset: instance of MetatlasDataset - include_groups: group will only be used in correction if their name has a substring match - to this list of strings + ids: instance of AnalysisIds """ - ordered_groups = sorted(metatlas_dataset.ids.groups, key=lambda x: x.name) + ordered_groups = sorted(ids.groups, key=lambda x: x.name) for grp in ordered_groups: logger.info("Selected group: %s, %s", grp.name, int_to_date_str(grp.last_modified)) return ordered_groups From edcbc281c613b61b56658bdd01dfc83bbeb3aa33 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 27 Aug 2021 23:10:30 -0700 Subject: [PATCH 112/177] add more type annotations to metatlas_dataset.py --- metatlas/datastructures/metatlas_dataset.py | 344 +++++++++++++------- tests/unit/test_metatlas_dataset.py | 6 + 2 files changed, 238 insertions(+), 112 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index d8779c22..78f37c5a 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -1,4 +1,6 @@ """ object oriented interface to metatlas_dataset """ +# pylint: disable=too-many-lines + import datetime import getpass import glob @@ -6,8 +8,7 @@ import os import shutil -from typing import Any, Dict, List, NewType, Optional, Tuple, TypedDict -from typing import cast +from typing import cast, Any, Dict, List, NewType, Optional, Tuple, TypedDict, Union import humanize import pandas as pd @@ -26,37 +27,111 @@ from metatlas.tools import parallel from metatlas.tools.util import or_default -FileMatchList = NewType('FileMatchList', List[str]) -GroupMatchList = NewType('GroupMatchList', List[str]) -GroupList = NewType('GroupList', Optional[List[metob.Group]]) -LcmsRunsList = NewType('LcmsRunsList', Optional[List[metob.LcmsRun]]) -Polarity = NewType('Polarity', str) -ShortPolarity = NewType('ShortPolarity', str) -Experiment = NewType('Experiment', str) -OutputType = NewType('OutputType', str) -AnalysisNumber = NewType('AnalysisNumber', int) -AtlasName = NewType('AtlasName', str) -PathString = NewType('PathString', str) -Username = NewType('Username', str) - -MSMS_REFS_PATH = PathString("/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab") -DEFAULT_GROUPS_CONTROLLED_VOCAB = GroupMatchList(["QC", "InjBl", "ISTD"]) +GroupList = Optional[List[metob.Group]] +LcmsRunsList = Optional[List[metob.LcmsRun]] +FileMatchList = List[str] +GroupMatchList = List[str] + +Polarity = NewType("Polarity", str) +ShortPolarity = NewType("ShortPolarity", str) +Experiment = NewType("Experiment", str) +OutputType = 
NewType("OutputType", str) +AnalysisNumber = NewType("AnalysisNumber", int) +AtlasName = NewType("AtlasName", str) +PathString = NewType("PathString", str) +Username = NewType("Username", str) + +MSMS_REFS_PATH = PathString( + "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" +) +DEFAULT_GROUPS_CONTROLLED_VOCAB = cast(GroupMatchList, ["QC", "InjBl", "ISTD"]) OUTPUT_TYPES = [OutputType("ISTDsEtc"), OutputType("FinalEMA-HILIC"), OutputType("data_QC")] POLARITIES = [Polarity("positive"), Polarity("negative"), Polarity("fast-polarity-switching")] -SHORT_POLARITIES = {Polarity("positive"): ShortPolarity("POS"), Polarity("negative"): ShortPolarity("NEG"), Polarity("fast-polarity-switching"): ShortPolarity("FPS")} +SHORT_POLARITIES = { + Polarity("positive"): ShortPolarity("POS"), + Polarity("negative"): ShortPolarity("NEG"), + Polarity("fast-polarity-switching"): ShortPolarity("FPS"), +} logger = logging.getLogger(__name__) class Proposal(TypedDict): - """ for use with traitlets.validate """ + """for use with traitlets.validate""" + owner: HasTraits value: object trait: TraitType +class _LcmsRunDict(TypedDict): + """part of return type for AnalysisIds._files_dict""" + + object: metob.LcmsRun + group: str + short_name: str + + +class MsSummary(TypedDict): + """part of MetatlasDataset._data""" + + num_ms1_datapoints: int + mz_peak: float + rt_peak: float + mz_centroid: float + rt_centroid: float + peak_height: float + peak_area: float + + +class Eic(TypedDict): + """part of MetatlasDataset._data""" + + mz: List[float] + rt: List[float] + intensity: List[float] + + +class MsmsDataDict(TypedDict): + """part of MetatlasDataset._data""" + + mz: List[float] + i: List[float] + rt: List[float] + polarity: List[float] + precursor_MZ: List[float] + precursor_intensity: List[float] + collision_energy: List[float] + + +class MsmsDict(TypedDict): + """part of MetatlasDataset._data""" + + data: MsmsDataDict + + +class MsDataDict(TypedDict): + """part of MetatlasDataset._data""" + + msms: MsmsDict + eic: Eic + ms1_summary: MsSummary + + +class CompoundDict(TypedDict): + """part of MetatlasDataset._data""" + + atlas_name: AtlasName + atlas_unique_id: str + lcmsrun: metob.LcmsRun + group: metob.Group + identification: metob.CompoundIdentification + data: MsDataDict + + class AnalysisIdentifiers(HasTraits): """Names used in generating an analysis""" + source_atlas: Optional[AtlasName] = Unicode(allow_none=True, default_value=None) experiment: Experiment = Unicode() output_type: OutputType = Unicode() @@ -99,11 +174,7 @@ def _default_exclude_groups(self) -> GroupMatchList: out: GroupMatchList = ["InjBl", "InjBL"] if self.output_type != "data_QC": out.append("QC") - if self.polarity == "positive": - out.append("NEG") - elif self.polarity == "negative": - out.append("POS") - return out + return append_inverse(out, self.polarity) @validate("polarity") def _valid_polarity(self, proposal: Proposal) -> Polarity: @@ -118,13 +189,15 @@ def _valid_output_type(self, proposal: Proposal) -> OutputType: return cast(OutputType, proposal["value"]) @validate("source_atlas") - def _valid_source_atlas(self, proposal: Proposal) -> AtlasName: + def _valid_source_atlas(self, proposal: Proposal) -> Optional[AtlasName]: if proposal["value"] is not None: + proposed_name = cast(AtlasName, proposal["value"]) try: - get_atlas(proposal["value"], self.username) # raises error if not found or matches multiple + get_atlas(proposed_name, self.username) # raises error if not found or matches multiple except 
ValueError as err: raise TraitError(str(err)) from err - return cast(AtlasName, proposal["value"]) + return proposed_name + return None @validate("analysis_number") def _valid_analysis_number(self, proposal: Proposal) -> AnalysisNumber: @@ -147,13 +220,15 @@ def _exp_tokens(self) -> List[str]: @property def project(self) -> int: - """Returns project number (proposal id) """ + """Returns project number (proposal id)""" return int(self._exp_tokens[3]) @property def atlas(self) -> AtlasName: """Atlas identifier (name)""" - return AtlasName(f"{'_'.join(self._exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}") + return AtlasName( + f"{'_'.join(self._exp_tokens[3:6])}_{self.output_type}_{self.short_polarity}_{self.analysis}" + ) @property def analysis(self) -> str: @@ -190,7 +265,8 @@ def lcmsruns(self) -> List[metob.LcmsRun]: all_lcmsruns = dp.get_metatlas_files(experiment=self.experiment, name="%") if self.exclude_files is not None and len(self.exclude_files) > 0: self._lcmsruns = [ - r for r in all_lcmsruns + r + for r in all_lcmsruns if not any(map(r.name.__contains__, or_default(self.exclude_files, []))) ] logger.info( @@ -253,16 +329,16 @@ def write_lcmsruns_short_names(self) -> None: ) @property - def _files_dict(self) -> Dict[str, Dict[str, Any]]: + def _files_dict(self) -> Dict[str, _LcmsRunDict]: """ Queries DB for all lcmsruns matching the class properties. Returns a dict of dicts where keys are filenames minus extensions and values are dicts with keys: object, group, and short_name """ - file_dict: Dict[str, Dict[str, Any]] = {} + file_dict: Dict[str, _LcmsRunDict] = {} for lcms_file in self.lcmsruns: base_name: str = lcms_file.name.split(".")[0] - file_dict[base_name] = {"object": lcms_file, **self.group_name(base_name)} + file_dict[base_name] = cast(_LcmsRunDict, {"object": lcms_file, **self.group_name(base_name)}) return file_dict @property @@ -330,7 +406,8 @@ def group_name(self, base_filename: str) -> Dict[str, str]: tokens = base_filename.split("_") prefix = "_".join(tokens[:11]) indices = [ - i for i, s in enumerate(or_default(self.groups_controlled_vocab, [])) + i + for i, s in enumerate(or_default(self.groups_controlled_vocab, [])) if s.lower() in base_filename.lower() ] suffix = self.groups_controlled_vocab[indices[0]].lstrip("_") if indices else tokens[12] @@ -398,16 +475,16 @@ class MetatlasSample: Object oriented interface to second level of metatlas_dataset. Each instance is one sample (LCMS run). """ - def __init__(self, data): - self._data = data + def __init__(self, compounds: Tuple[CompoundDict, ...]) -> None: + self.compounds: Tuple[CompoundDict, ...] 
= compounds - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> CompoundDict: """get sample at idx""" - return self._data[idx] + return self.compounds[idx] - def __len__(self): + def __len__(self) -> int: """len is from data""" - return len(self._data) + return len(self.compounds) class MetatlasDataset(HasTraits): @@ -441,15 +518,15 @@ class MetatlasDataset(HasTraits): max_cpus: int = Int(default_value=1) save_metadata: bool = Bool(default_value=True) keep_nonmatches: bool = Bool(default_value=True) - msms_refs_loc: str = Unicode(default_value=MSMS_REFS_PATH) + msms_refs_loc: PathString = Unicode(default_value=MSMS_REFS_PATH) ids: AnalysisIdentifiers = Instance(klass=AnalysisIdentifiers) - atlas: Optional[metob.Atlas] = Instance(klass=metob.Atlas, allow_none=True, default_value=None) + atlas: metob.Atlas = Instance(klass=metob.Atlas) _atlas_df: Optional[pd.DataFrame] = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) _data: Optional[Tuple[MetatlasSample, ...]] = traitlets.Tuple(allow_none=True, default_value=None) _hits: Optional[pd.DataFrame] = Instance(klass=pd.DataFrame, allow_none=True, default_value=None) # pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-public-methods, no-self-use - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: """Constructor""" super().__init__(**kwargs) logger.debug("Creating new MetatlasDataset instance...") @@ -461,7 +538,7 @@ def __init__(self, **kwargs): self.write_data_source_files() self.ids.write_lcmsruns_short_names() - def write_data_source_files(self): + def write_data_source_files(self) -> None: """Write the data source files if they don't already exist""" data_sources_dir = os.path.join(self.ids.output_dir, f"{self.ids.short_polarity}_data_sources") if len(glob.glob(os.path.join(data_sources_dir, "*"))) >= 4: @@ -478,7 +555,7 @@ def write_data_source_files(self): self.ids.groups, self.atlas, self.ids.output_dir, self.ids.short_polarity ) - def _get_atlas(self): + def _get_atlas(self) -> None: """ Copy source atlas from database into current analysis atlas If the atlas does not yet exist, it will be copied from source_atlas and there will be an @@ -507,22 +584,31 @@ def _get_atlas(self): except ValueError as err: logger.exception(err) raise err + elif self.ids.source_atlas is not None: + self.atlas = self._clone_source_atlas() else: - logger.info("Retriving source atlas: %s", self.ids.source_atlas) - source_atlas = get_atlas(self.ids.source_atlas, self.ids.username) - source_atlas_df = ma_data.make_atlas_df(source_atlas) - logger.info("Cloning source atlas") - self.atlas = dp.make_atlas_from_spreadsheet( - source_atlas_df, - self.ids.atlas, - filetype="dataframe", - sheetname="", - polarity=self.ids.polarity, - store=True, - mz_tolerance=source_atlas.compound_identifications[0].mz_references[0].mz_tolerance, - ) + try: + raise ValueError("Could not load atlas as source_atlas is None.") + except ValueError as err: + logger.exception(err) + raise err + + def _clone_source_atlas(self) -> metob.Atlas: + logger.info("Retriving source atlas: %s", self.ids.source_atlas) + source_atlas = get_atlas(cast(AtlasName, self.ids.source_atlas), self.ids.username) + source_atlas_df = ma_data.make_atlas_df(source_atlas) + logger.info("Cloning atlas %s") + return dp.make_atlas_from_spreadsheet( + source_atlas_df, + self.ids.atlas, + filetype="dataframe", + sheetname="", + polarity=self.ids.polarity, + store=True, + 
mz_tolerance=source_atlas.compound_identifications[0].mz_references[0].mz_tolerance, + ) - def _build(self): + def _build(self) -> None: """Populate self._data from database and h5 files.""" start_time = datetime.datetime.now() files = [] @@ -549,13 +635,14 @@ def _build(self): _duration_since(start_time), ) - def _remove_compound_id(self, idx): + def _remove_compound_id(self, idx: int) -> None: """ Remove compound identification at index idx from both in db and self.atlas Does not invalidate _data or _hits or _atlas_df This bypasses several ORM layers and therefore is a hack, but I couldn't get it to work with the ORM. """ cid_id = self.atlas.compound_identifications[idx].unique_id + del self.atlas.compound_identifications[idx] atlas_id = self.atlas.unique_id link_table = "atlases_compound_identifications" target = f"target_id='{cid_id}'" @@ -568,12 +655,14 @@ def _remove_compound_id(self, idx): if len(list(links)) == 0: # other atlases are not linked to this CompoundIdentification workspace.db.query(f"delete from compoundidentifications where unique_id='{cid_id}'") workspace.db.commit() - del self.atlas.compound_identifications[idx] except Exception as err: # pylint: disable=broad-except metoh.rollback_and_log(workspace.db, err) + raise Exception from err workspace.close_connection() - def filter_compounds(self, keep_idxs=None, remove_idxs=None): + def filter_compounds( + self, keep_idxs: Optional[List[int]] = None, remove_idxs: Optional[List[int]] = None + ) -> None: """ inputs: keep_idxs: the indexes of compounds to keep @@ -589,19 +678,26 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None): if (keep_idxs is None) == (remove_idxs is None): raise ValueError("Exactly one of keep_idxs and remove_idxs should be None") start_len = len(self.atlas_df) + in_idxs: List[int] + out_idxs: List[int] if remove_idxs is not None: _error_if_bad_idxs(self.atlas_df, remove_idxs) - keep_idxs = self.atlas_df.index.difference(remove_idxs) - self._atlas_df = self.atlas_df.iloc[keep_idxs].copy().reset_index(drop=True) + out_idxs = remove_idxs + in_idxs = self.atlas_df.index.difference(out_idxs) + if keep_idxs is not None: + _error_if_bad_idxs(self.atlas_df, keep_idxs) + in_idxs = keep_idxs + out_idxs = [i for i, _ in enumerate(self.atlas.compound_identifications) if i not in in_idxs] + self._atlas_df = self.atlas_df.iloc[in_idxs].copy().reset_index(drop=True) if self._data is not None: - self._data = [ - [compound for idx, compound in enumerate(sample) if idx in keep_idxs] for sample in self._data - ] - if remove_idxs is None: - remove_idxs = [ - idx for idx, _ in enumerate(self.atlas.compound_identifications) if idx not in keep_idxs - ] - _ = [self._remove_compound_id(idx) for idx in sorted(remove_idxs, reverse=True)] + self._data = tuple( + MetatlasSample( + tuple(compound for idx, compound in enumerate(sample.compounds) if idx in in_idxs) + ) + for sample in self._data + ) + for i in sorted(out_idxs, reverse=True): + self._remove_compound_id(i) logger.info( "Filtering reduced atlas from %d to %d compounds (%d removed).", start_len, @@ -611,7 +707,7 @@ def filter_compounds(self, keep_idxs=None, remove_idxs=None): if self._hits is not None: self.filter_hits_by_atlas() - def filter_hits_by_atlas(self): + def filter_hits_by_atlas(self) -> None: """Remove any hits that do not have a corresponding inchi_key-adduct pair in atlas_df""" start_len = len(self.hits) keep_adducts = self.atlas_df.loc[:, ["inchi_key", "adduct"]].drop_duplicates() @@ -630,7 +726,7 @@ def filter_hits_by_atlas(self): start_len - 
len(self.hits), ) - def filter_compounds_ms1_notes_remove(self): + def filter_compounds_ms1_notes_remove(self) -> None: """ output: updates self.atlas to contain only the compound_identifications that do not have ms1_notes @@ -641,7 +737,7 @@ def filter_compounds_ms1_notes_remove(self): logger.debug("Filtering atlas to exclude ms1_notes=='remove'.") self.filter_compounds(remove_idxs=self.compound_indices_marked_remove()) - def filter_compounds_by_signal(self, num_points, peak_height): + def filter_compounds_by_signal(self, num_points: int, peak_height: float) -> None: """ inputs: num_points: number of points in EIC that must be exceeded in one or more samples @@ -653,7 +749,7 @@ def filter_compounds_by_signal(self, num_points, peak_height): keep_idxs = dp.strong_signal_compound_idxs(self, num_points, peak_height) self.filter_compounds(keep_idxs=keep_idxs) - def store_atlas(self, even_if_exists=False): + def store_atlas(self, even_if_exists: bool = False) -> None: """ inputs: even_if_exists: if True, will save the atlas even if the atlas name already is in the database @@ -679,7 +775,7 @@ def store_atlas(self, even_if_exists=False): _duration_since(start_time), ) - def export_atlas_to_csv(self, filename=None): + def export_atlas_to_csv(self, filename: Optional[str] = None) -> None: """ save atlas, including ms1_notes, ms2_notes, identification_notes, rt_min, rt_max to filename if filename is not provided, then the export is saved to the working directory with filename @@ -688,19 +784,19 @@ def export_atlas_to_csv(self, filename=None): filename = f"{self.atlas.name}.csv" if filename is None else filename dp.export_atlas_to_spreadsheet(self, filename) - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> MetatlasSample: """get sample at idx""" return self.data[idx] @property - def data(self): + def data(self) -> Tuple[MetatlasSample, ...]: """data getter, update ._data if necessary""" if self._data is None: self._build() - return self._data + return cast(Tuple[MetatlasSample, ...], self._data) @property - def atlas_df(self): + def atlas_df(self) -> pd.DataFrame: """atlas_df getter, update ._atlas_df if necessary""" if self._atlas_df is None: start_time = datetime.datetime.now() @@ -714,20 +810,20 @@ def atlas_df(self): return self._atlas_df @observe("atlas") - def _observe_atlas(self, signal): + def _observe_atlas(self, signal: ObserveHandler) -> None: if signal.type == "change": self._atlas_df = None self._data = None logger.debug("Change to atlas invalidates atlas_df, data") @observe("_atlas_df") - def _observe_atlas_df(self, signal): + def _observe_atlas_df(self, signal: ObserveHandler) -> None: if signal.type == "change": self._data = None logger.debug("Change to atlas_df invalidates data") @property - def polarity(self): + def polarity(self) -> Polarity: """ polarity getter assumes all polarities within class are the same returns 'positive' if there are no samples or no compound identifications @@ -735,43 +831,43 @@ def polarity(self): try: cid = self.data[0][0]["identification"] except IndexError: - return "positive" - return cid.mz_references[0].detected_polarity + return Polarity("positive") + return Polarity(cid.mz_references[0].detected_polarity) @observe("extra_time") - def _observe_extra_time(self, signal): + def _observe_extra_time(self, signal: ObserveHandler) -> None: if signal.type == "change": self._hits = None self._data = None logger.debug("Change to extra_time invalidates hits, data") @observe("extra_mz") - def _observe_extra_mz(self, signal): + def 
_observe_extra_mz(self, signal: ObserveHandler) -> None: if signal.type == "change": self._hits = None self._data = None logger.debug("Change to extra_mz invalidates hits, data") @observe("keep_nonmatches") - def _observe_keep_nonmatches(self, signal): + def _observe_keep_nonmatches(self, signal: ObserveHandler) -> None: if signal.type == "change": self._hits = None logger.debug("Change to keep_nonmatches invalidates hits") @observe("frag_mz_tolerance") - def _observe_frag_mz_tolerance(self, signal): + def _observe_frag_mz_tolerance(self, signal: ObserveHandler) -> None: if signal.type == "change": self._hits = None logger.debug("Change to frag_mz_tolerance invalidates hits") @observe("msms_refs_loc") - def _observe_msms_refs_loc(self, signal): + def _observe_msms_refs_loc(self, signal: ObserveHandler) -> None: if signal.type == "change": self._hits = None logger.debug("Change to msms_refs_loc invalidates hits") @property - def hits(self): + def hits(self) -> pd.DataFrame: """get msms hits DataFrame""" _ = self.atlas_df # regenerate if needed before logging hits generation _ = self.data # regenerate if needed before logging hits generation @@ -794,19 +890,21 @@ def hits(self): self._hits_valid_for_rt_bounds = True return self._hits - def __len__(self): + def __len__(self) -> int: """len is from data""" return len(self.data) @property - def rts(self): + def rts(self) -> Tuple[metob.RtReference, ...]: """ Allow Rt_Reference objects to be accessed use set_rt() if you want to modify the RT values held by this class. """ + if self.atlas is None: + return tuple() # noqa: C408 return tuple(cid.rt_references[0] for cid in self.atlas.compound_identifications) - def set_rt(self, compound_idx, which, time): + def set_rt(self, compound_idx: int, which: str, time: float) -> None: """ inputs: compound_idx: index of of compound to update @@ -815,6 +913,12 @@ def set_rt(self, compound_idx, which, time): updates the RT value in database, self.atlas, self.atlas_df, self.data so that no datastructures need to be invalidated """ + try: + if self.atlas is None: + raise ValueError("Cannot set RTs when atlas is None.") + except ValueError as err: + logger.exception(err) + raise err assert which in ["rt_min", "rt_peak", "rt_max"] atlas_rt_ref = self.atlas.compound_identifications[compound_idx].rt_references[0] setattr(atlas_rt_ref, which, time) @@ -825,7 +929,7 @@ def set_rt(self, compound_idx, which, time): if which in ["rt_min", "rt_max"]: self._hits_valid_for_rt_bounds = False - def set_note(self, compound_idx, which, value): + def set_note(self, compound_idx: int, which: str, value: str) -> None: """ inputs: compound_idx: index of of compound to update @@ -834,6 +938,12 @@ def set_note(self, compound_idx, which, value): updates the notes value in database, self.atlas, self.atlas_df, self.data so that no datastructures need to be invalidated """ + try: + if self.atlas is None: + raise ValueError("Cannot set notes when atlas is None.") + except ValueError as err: + logger.exception(err) + raise err assert which in ["ms1_notes", "ms2_notes", "identification_notes"] atlas_cid = self.atlas.compound_identifications[compound_idx] setattr(atlas_cid, which, value) @@ -842,21 +952,21 @@ def set_note(self, compound_idx, which, value): self.atlas_df.loc[compound_idx, which] = value metob.store(atlas_cid) - def compound_indices_marked_remove(self): + def compound_indices_marked_remove(self) -> List[int]: """ outputs: list of compound_idx of the compound identifications with ms1_notes to remove """ ids = ["identification", 
"ms1_notes"] - return [i for i, j in enumerate(self.data[0]) if _is_remove(ma_data.extract(j, ids))] + return [i for i, j in enumerate(self.data[0].compounds) if _is_remove(ma_data.extract(j, ids))] - def compound_idxs_not_evaluated(self): + def compound_idxs_not_evaluated(self) -> List[int]: """NOT YET IMPLEMENTED""" - for compound_idx, _ in enumerate(self.data[0]): - print(compound_idx) return [] - def annotation_gui(self, compound_idx=0, width=15, height=3, alpha=0.5, colors=""): + def annotation_gui( + self, compound_idx: int = 0, width: float = 15, height: float = 3, alpha: float = 0.5, colors="" + ) -> dp.adjust_rt_for_selected_compound: """ Opens the interactive GUI for setting RT bounds and annotating peaks inputs: @@ -877,7 +987,7 @@ def annotation_gui(self, compound_idx=0, width=15, height=3, alpha=0.5, colors=" height=height, ) - def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): + def generate_all_outputs(self, msms_fragment_ions: bool = False, overwrite: bool = False) -> None: """ Generates the default set of outputs for a targeted experiment inputs: @@ -888,6 +998,8 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): self._hits = None # force hits to be regenerated self.extra_time = 0.5 logger.info("extra_time set to 0.5 minutes for output generation.") + logger.info("Removing InjBl from exclude_groups.") + self.ids.exclude_groups = remove_items(self.ids.exclude_groups, ["InjBl"]) targeted_output.write_atlas_to_spreadsheet(self, overwrite=overwrite) targeted_output.write_stats_table(self, overwrite=overwrite) targeted_output.write_chromatograms(self, overwrite=overwrite, max_cpus=self.max_cpus) @@ -900,7 +1012,7 @@ def generate_all_outputs(self, msms_fragment_ions=False, overwrite=False): targeted_output.copy_outputs_to_google_drive(self.ids) -def _duration_since(start): +def _duration_since(start: datetime.datetime) -> str: """ inputs: start: a datetime object of when the duration started @@ -910,12 +1022,12 @@ def _duration_since(start): return humanize.precisedelta(datetime.datetime.now() - start) -def _is_remove(obj): +def _is_remove(obj: object) -> bool: """is obj a string that starts with 'remove' (case insensitive)?""" return isinstance(obj, str) and obj.lower().startswith("remove") -def _set_nested(data, ids, value): +def _set_nested(data: Any, ids: List[Union[int, str, Tuple[str]]], value: Any): """ inputs: data: hierarchical data structure consisting of lists, dicts, and objects with attributes. 
@@ -952,7 +1064,7 @@ def _set_nested(data, ids, value): _set_nested(data[ids[0]], ids[1:], value) -def _error_if_bad_idxs(dataframe, test_idx_list): +def _error_if_bad_idxs(dataframe: pd.DataFrame, test_idx_list: List[int]) -> None: """Raise IndexError if any members of of test_idx_list are not in dataframe's index""" bad = set(test_idx_list) - set(dataframe.index) try: @@ -963,7 +1075,7 @@ def _error_if_bad_idxs(dataframe, test_idx_list): raise err -def get_atlas(name, username): +def get_atlas(name: AtlasName, username: Username) -> metob.Atlas: """Load atlas from database""" atlases = metob.retrieve("Atlas", name=name, username=username) try: @@ -981,12 +1093,20 @@ def get_atlas(name, username): return atlases[0] -def quoted_string_list(strings): +def quoted_string_list(strings: List[str]) -> str: """Adds double quotes around each string and seperates with ', '.""" return ", ".join([f'"{x}"' for x in strings]) -def append_inverse(in_list: List[str], polarity: str): - """ appends short version of inverse of polarity to and retuns the list """ - inverse = {'positive': 'NEG', 'negative': 'POS'} +def append_inverse(in_list: List[str], polarity: Polarity) -> List[str]: + """appends short version of inverse of polarity to and retuns the list""" + inverse = {"positive": "NEG", "negative": "POS"} return in_list + [inverse[polarity]] if polarity in inverse.keys() else in_list + + +def remove_items(edit_list: List[str], remove_list: List[str], ignore_case: bool = True) -> List[str]: + """Returns list of items in edit_list but not in remove_list""" + if ignore_case: + lower_remove_list = [x.lower() for x in remove_list] + return [x for x in edit_list if x.lower() not in lower_remove_list] + return [x for x in edit_list if x not in remove_list] diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index ff9e6b8e..77fc09ac 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -53,10 +53,16 @@ def test_filter_compounds01(metatlas_dataset_with_2_cids): metatlas_dataset = metatlas_dataset_with_2_cids metatlas_dataset.filter_compounds(remove_idxs=[]) assert len(metatlas_dataset[0]) == 2 + assert len(metatlas_dataset.atlas.compound_identifications) == 2 + assert metatlas_dataset.atlas_df.shape[0] == 2 metatlas_dataset.filter_compounds(keep_idxs=[0, 1]) assert len(metatlas_dataset[0]) == 2 + assert len(metatlas_dataset.atlas.compound_identifications) == 2 + assert metatlas_dataset.atlas_df.shape[0] == 2 metatlas_dataset.filter_compounds(keep_idxs=[]) assert len(metatlas_dataset[0]) == 0 + assert len(metatlas_dataset.atlas.compound_identifications) == 0 + assert metatlas_dataset.atlas_df.shape[0] == 0 with pytest.raises(ValueError): metatlas_dataset.filter_compounds() From e61601ecbebb30293fe236ac3a9e6cf12bfe68c2 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 30 Aug 2021 11:06:09 -0700 Subject: [PATCH 113/177] Change GDrive upload folder --- metatlas/io/targeted_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index ddabadd3..2e1a4e61 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -319,6 +319,6 @@ def copy_outputs_to_google_drive(ids): if drive is None: logger.warning("RClone config file missing JGI_Metabolomics_Projects -- %s.", fail_suffix) return - sub_folder = os.path.join("analysis_uploads", ids.experiment, ids.analysis, ids.output_type) + sub_folder = os.path.join("Analysis_uploads", 
ids.experiment, ids.analysis, ids.output_type) rci.copy_to_drive(ids.output_dir, drive, sub_folder) logger.info("Done copying output files to Google Drive") From 865b74d552fa7fc5c927153be1d2ab8dfca2c48e Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 30 Aug 2021 11:09:56 -0700 Subject: [PATCH 114/177] set OPENBLAS_NUM_THREADS=1 --- metatlas/tools/notebook.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index 1668fea2..88f8be26 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -25,6 +25,7 @@ def configure_environment(log_level): logger.debug("Running import and environment setup block of notebook.") logger.debug("Configuring notebook environment with console log level of %s.", log_level) os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" + os.environ['OPENBLAS_NUM_THREADS'] = '1' logger.info("Running on git commit: %s", get_repo_hash()) From faa6699397908c4bb06530d969f79481bf810cff Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 30 Aug 2021 11:28:06 -0700 Subject: [PATCH 115/177] fix typo in launch_rt_prediction.sh --- papermill/launch_rt_prediction.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh index 5a0b4644..9be26844 100755 --- a/papermill/launch_rt_prediction.sh +++ b/papermill/launch_rt_prediction.sh @@ -2,7 +2,7 @@ set -euf -o pipefail if [ "$#" -ne 3 ]; then - echo "Usage $: experiment_name analysis_number project_directory" + echo "Usage $0: experiment_name analysis_number project_directory" exit 0 fi From a05ad98a792698d7ddba29be403563a0e1c3454a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 30 Aug 2021 15:13:12 -0700 Subject: [PATCH 116/177] add missing variable to logging statement --- metatlas/datastructures/metatlas_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 78f37c5a..c749e2b8 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -597,7 +597,7 @@ def _clone_source_atlas(self) -> metob.Atlas: logger.info("Retriving source atlas: %s", self.ids.source_atlas) source_atlas = get_atlas(cast(AtlasName, self.ids.source_atlas), self.ids.username) source_atlas_df = ma_data.make_atlas_df(source_atlas) - logger.info("Cloning atlas %s") + logger.info("Cloning atlas %s", self.ids.source_atlas) return dp.make_atlas_from_spreadsheet( source_atlas_df, self.ids.atlas, From a01699d8ad04463773df7329bfccfd2a6eb8fce4 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 1 Sep 2021 15:31:26 -0700 Subject: [PATCH 117/177] Include proposal ID in Final Id spreadsheet name --- metatlas/datastructures/metatlas_dataset.py | 18 ++++++++++++++++-- metatlas/io/targeted_output.py | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index c749e2b8..a1beaa06 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -961,8 +961,17 @@ def compound_indices_marked_remove(self) -> List[int]: return [i for i, j in enumerate(self.data[0].compounds) if _is_remove(ma_data.extract(j, ids))] def compound_idxs_not_evaluated(self) -> List[int]: - """NOT YET IMPLEMENTED""" - return [] + """ + Returns list of compound indices where ms1 note is not 'remove' and + ms2 note is None or 'no selection' + """ 
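+        # A worked example, with illustrative note values: ms1_notes of
+        # "remove, low signal" excludes a compound here no matter what its
+        # ms2_notes hold, while ms1_notes of "keep" combined with ms2_notes of
+        # None, "" or "No Selection" marks the index as not yet evaluated.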
+ out = [] + for i, compound in enumerate(self.data[0].compounds): + ms1_note = ma_data.extract(compound, ["identification", "ms1_notes"]) + ms2_note = ma_data.extract(compound, ["identification", "ms2_notes"]) + if (not _is_remove(ms1_note)) and (not _has_selection(ms2_note)): + out.append(i) + return out def annotation_gui( self, compound_idx: int = 0, width: float = 15, height: float = 3, alpha: float = 0.5, colors="" @@ -1027,6 +1036,11 @@ def _is_remove(obj: object) -> bool: return isinstance(obj, str) and obj.lower().startswith("remove") +def _has_selection(obj: object) -> bool: + """is obj a string that is not None, '', or 'no selection' (case insensitive)?""" + return not (obj is None or obj == '' or obj.lower() == 'no selection') + + def _set_nested(data: Any, ids: List[Union[int, str, Tuple[str]]], value: Any): """ inputs: diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 2e1a4e61..d111ae72 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -83,7 +83,7 @@ def write_stats_table( input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, output_loc=metatlas_dataset.ids.output_dir, - output_sheetname="Draft_Final_Identifications.xlsx", + output_sheetname=f"{metatlas_dataset.ids.project}_Final_Identifications.xlsx", min_peak_height=1e5, use_labels=True, min_msms_score=0.01, From 0fcac648e23d79c3047b2e0f012b7e6e75223eb0 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 2 Sep 2021 09:22:04 -0700 Subject: [PATCH 118/177] Add pre/post_annotation functions to move a litle more code out of the Targeted notebook --- metatlas/datastructures/metatlas_dataset.py | 57 ++++++++++++++++++- metatlas/tools/notebook.py | 2 +- notebooks/reference/Targeted.ipynb | 62 ++++++--------------- tests/unit/test_metatlas_dataset.py | 8 +++ 4 files changed, 83 insertions(+), 46 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index a1beaa06..a40b535d 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -973,6 +973,21 @@ def compound_idxs_not_evaluated(self) -> List[int]: out.append(i) return out + def error_if_not_all_evaluated(self) -> None: + """Raises ValueError if there are compounds that have not been evaluated""" + not_evaluated = self.compound_idxs_not_evaluated() + try: + if len(not_evaluated) != 0: + raise ValueError( + ( + "Compounds with the following indices need notes selected via radio " + f"buttons before continuing: {','.join([str(i) for i in not_evaluated])}" + ) + ) + except ValueError as err: + logger.exception(err) + raise err + def annotation_gui( self, compound_idx: int = 0, width: float = 15, height: float = 3, alpha: float = 0.5, colors="" ) -> dp.adjust_rt_for_selected_compound: @@ -1038,7 +1053,9 @@ def _is_remove(obj: object) -> bool: def _has_selection(obj: object) -> bool: """is obj a string that is not None, '', or 'no selection' (case insensitive)?""" - return not (obj is None or obj == '' or obj.lower() == 'no selection') + if obj is None or not isinstance(obj, str): + return False + return obj.lower() not in ["", "no selection"] def _set_nested(data: Any, ids: List[Union[int, str, Tuple[str]]], value: Any): @@ -1124,3 +1141,41 @@ def remove_items(edit_list: List[str], remove_list: List[str], ignore_case: bool lower_remove_list = [x.lower() for x in remove_list] return [x for x in edit_list if x.lower() not in lower_remove_list] return [x for x in edit_list if x not in remove_list] + 
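+# A minimal usage sketch of remove_items (the group names are made up): with
+# the default ignore_case=True,
+#     remove_items(["InjBl", "QC", "Cone-S1"], ["injbl"])
+# returns ["QC", "Cone-S1"], which is the behavior generate_all_outputs
+# relies on when it strips "InjBl" from exclude_groups.
+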
+ +def pre_annotation( + source_atlas: AtlasName, + experiment: Experiment, + output_type: OutputType, + polarity: Polarity, + analysis_number: AnalysisNumber, + project_directory: PathString, + google_folder: str, + groups_controlled_vocab: GroupMatchList, + exclude_files: FileMatchList, + num_points: int, + peak_height: float, + max_cpus: int, +) -> MetatlasDataset: + ids = AnalysisIdentifiers( + source_atlas=source_atlas, + experiment=experiment, + output_type=output_type, + polarity=polarity, + analysis_number=analysis_number, + project_directory=project_directory, + google_folder=google_folder, + groups_controlled_vocab=groups_controlled_vocab, + exclude_files=exclude_files, + ) + metatlas_dataset = MetatlasDataset(ids=ids, max_cpus=max_cpus) + if metatlas_dataset.ids.output_type in ["FinalEMA-HILIC"]: + metatlas_dataset.filter_compounds_by_signal(num_points=num_points, peak_height=peak_height) + return metatlas_dataset + + +def post_annotation(metatlas_dataset: MetatlasDataset) -> None: + if metatlas_dataset.ids.output_type in ["FinalEMA-HILIC"]: + metatlas_dataset.error_if_not_all_evaluated() + metatlas_dataset.filter_compounds_ms1_notes_remove() + metatlas_dataset.generate_all_outputs() diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index 88f8be26..145e57de 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -25,7 +25,7 @@ def configure_environment(log_level): logger.debug("Running import and environment setup block of notebook.") logger.debug("Configuring notebook environment with console log level of %s.", log_level) os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" - os.environ['OPENBLAS_NUM_THREADS'] = '1' + os.environ["OPENBLAS_NUM_THREADS"] = "1" logger.info("Running on git commit: %s", get_repo_hash()) diff --git a/notebooks/reference/Targeted.ipynb b/notebooks/reference/Targeted.ipynb index 8676fdcb..41fe8e61 100644 --- a/notebooks/reference/Targeted.ipynb +++ b/notebooks/reference/Targeted.ipynb @@ -160,38 +160,22 @@ "metadata": {}, "outputs": [], "source": [ - "ids = mads.AnalysisIdentifiers(\n", - " source_atlas=source_atlas,\n", - " experiment=experiment,\n", - " output_type=output_type,\n", - " polarity=polarity,\n", - " analysis_number=analysis_number,\n", - " project_directory=project_directory,\n", - " google_folder=google_folder,\n", - " groups_controlled_vocab=groups_controlled_vocab,\n", - " exclude_files=exclude_files,\n", + "metatlas_dataset = mads.pre_annotation(\n", + " source_atlas,\n", + " experiment,\n", + " output_type,\n", + " polarity,\n", + " analysis_number,\n", + " project_directory,\n", + " google_folder,\n", + " groups_controlled_vocab,\n", + " exclude_files,\n", + " num_points,\n", + " peak_height,\n", + " max_cpus,\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "metatlas_dataset = mads.MetatlasDataset(ids=ids, max_cpus=max_cpus)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if metatlas_dataset.ids.output_type in [\"FinalEMA-HILIC\"]:\n", - " metatlas_dataset.filter_compounds_by_signal(num_points=num_points, peak_height=peak_height)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -215,26 +199,16 @@ "metadata": {}, "outputs": [], "source": [ - "if metatlas_dataset.ids.output_type in [\"FinalEMA-HILIC\"]:\n", - " metatlas_dataset.filter_compounds_ms1_notes_remove()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], 
- "source": [ - "metatlas_dataset.generate_all_outputs()" + "mads.post_annotation(metatlas_dataset)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Metatlas Targeted", - "language": "python", - "name": "metatlas-targeted" + "display_name": "Metatlas Targeted", + "language": "python", + "name": "metatlas-targeted" }, "language_info": { "codemirror_mode": { @@ -246,7 +220,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 77fc09ac..58b0b8d4 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -696,3 +696,11 @@ def test_invlidate_groups_controlled_vocab01(analysis_ids): assert analysis_ids._lcmsruns is not None analysis_ids.groups_controlled_vocab = ["FOOBAR"] assert analysis_ids._lcmsruns is None + + +def test_has_selection01(): + assert mads._has_selection("foobar") + assert not mads._has_selection(None) + assert not mads._has_selection("") + assert not mads._has_selection("no selection") + assert not mads._has_selection("NO Selection") From 4285cc953e96d7fd1da285224d615be1270b294a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 2 Sep 2021 10:00:26 -0700 Subject: [PATCH 119/177] non-code changes for linting --- metatlas/datastructures/metatlas_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index a40b535d..886ba794 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -1143,6 +1143,7 @@ def remove_items(edit_list: List[str], remove_list: List[str], ignore_case: bool return [x for x in edit_list if x not in remove_list] +# pylint: disable=too-many-arguments def pre_annotation( source_atlas: AtlasName, experiment: Experiment, @@ -1157,6 +1158,7 @@ def pre_annotation( peak_height: float, max_cpus: int, ) -> MetatlasDataset: + """All data processing that needs to occur before the annotation GUI in Targeted notebook""" ids = AnalysisIdentifiers( source_atlas=source_atlas, experiment=experiment, @@ -1175,6 +1177,7 @@ def pre_annotation( def post_annotation(metatlas_dataset: MetatlasDataset) -> None: + """All data processing that needs to occur after the annotation GUI in Targeted notebook""" if metatlas_dataset.ids.output_type in ["FinalEMA-HILIC"]: metatlas_dataset.error_if_not_all_evaluated() metatlas_dataset.filter_compounds_ms1_notes_remove() From c38c749a5d9ae0057beadaf7bdb68011125bf681 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 2 Sep 2021 10:35:06 -0700 Subject: [PATCH 120/177] Log when targeted notebook is done --- metatlas/datastructures/metatlas_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 886ba794..8b5331e2 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -1182,3 +1182,4 @@ def post_annotation(metatlas_dataset: MetatlasDataset) -> None: metatlas_dataset.error_if_not_all_evaluated() metatlas_dataset.filter_compounds_ms1_notes_remove() metatlas_dataset.generate_all_outputs() + logger.info("DONE - execution of notebook is complete.") From 7cbc7637a2301f29f0dad1598f51a843f9266c0a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 2 Sep 2021 12:26:36 -0700 Subject: [PATCH 121/177] close 
annotation GUI before generating outputs --- metatlas/plots/dill2plots.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index a5fbac0e..a1953006 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -152,6 +152,9 @@ 'comp_num': '1', 'mass': '0'}} +GUI_FIG_LABEL = 'Annotation GUI' + + def get_google_sheet(notebook_name = "Sheet name", token='/project/projectdirs/metatlas/projects/google_sheets_auth/ipython to sheets demo-9140f8697062.json', sheet_name = 'Sheet1', @@ -506,8 +509,8 @@ def layout_figure(self): # create figure and first axes combined_plot_height = self.height * (2 + self.plot_hspace) - self.fig, (self.ax2, self.ax) = plt.subplots(2, 1, figsize=(self.width, - combined_plot_height)) + self.fig, (self.ax2, self.ax) = plt.subplots(2, 1, num=GUI_FIG_LABEL, + figsize=(self.width, combined_plot_height)) plt.subplots_adjust(left=self.plot_left_pos, right=self.plot_right_pos, bottom=self.plot_bottom_pos, top=self.plot_top_pos, hspace=self.plot_hspace) @@ -757,6 +760,12 @@ def get_similar_compounds(self, use_labels=True): self.data.rts[compound_iter_idx])}) return out + @staticmethod + def disable(): + """Stops the GUI from being updated and interfering with the generation of output figures""" + plt.close(GUI_FIG_LABEL) + + class adjust_mz_for_selected_compound(object): def __init__(self, data, @@ -1067,7 +1076,7 @@ def plot_all_compounds_for_each_file(input_dataset = [], input_fname = '', inclu if not os.path.exists(output_loc): os.makedirs(output_loc) - + disable_interactive_plots() for file_idx,my_file in enumerate(file_names): ax = plt.subplot(111)#, aspect='equal') plt.setp(ax, 'frame_on', False) @@ -1176,7 +1185,7 @@ def plot_all_files_for_each_compound(input_dataset = [], input_fname = '', inclu # create ouput dir if not os.path.exists(output_loc): os.makedirs(output_loc) - plt.ioff() + disable_interactive_plots() for compound_idx,compound in enumerate(compound_names): ax = plt.subplot(111)#, aspect='equal') plt.setp(ax, 'frame_on', False) @@ -1540,7 +1549,7 @@ def plot_errorbar_plots(df,output_loc='', use_shortnames=True, ylabel=""): if not os.path.exists(output_loc): os.makedirs(output_loc) - plt.ioff() + disable_interactive_plots() for compound in df.index: if 'short groupname' in df.columns.names and use_shortnames: m = df.loc[compound].groupby(level='short groupname').mean() @@ -1569,7 +1578,7 @@ def plot_errorbar_plots(df,output_loc='', use_shortnames=True, ylabel=""): def make_boxplot_plots(df, output_loc='', use_shortnames=True, ylabel="", overwrite=True, max_cpus=1): output_loc = os.path.expandvars(output_loc) - plt.ioff() + disable_interactive_plots() args = [(compound, df, output_loc, use_shortnames, ylabel, overwrite) for compound in df.index] parallel.parallel_process(make_boxplot, args, max_cpus, unit='plot') @@ -1653,8 +1662,7 @@ def make_identification_figure(frag_json_dir = '/project/projectdirs/metatlas/pr frag_refs = pd.read_json(os.path.join(frag_json_dir, frag_json_name + ".json")) - - + disable_interactive_plots() for compound_idx in range(len(compound_names)): file_idx = None file_precursor_intensity = 0 @@ -1713,7 +1721,6 @@ def make_identification_figure(frag_json_dir = '/project/projectdirs/metatlas/pr # print data[file_idx][compound_idx]['identification'].compound[0].name, float(intensity[sx[0]]), float(min(ref_intensity)) ax.vlines(ref_mz,ref_zeros,[r*s for r in ref_intensity],colors='r',linewidth = 2) # print 
"we have reference spectra", len(ref_spec[0]) - plt.ioff() plt.axhline() plt.tight_layout() L = plt.ylim() @@ -2175,6 +2182,7 @@ def make_chromatograms(input_dataset, include_lcmsruns=None, exclude_lcmsruns=No prefix = f"{polarity}_" if polarity != '' else '' chromatogram_dir = os.path.join(output_loc, f"{prefix}compound_EIC_chromatograms") args_list = [] + disable_interactive_plots() for compound_idx, my_compound in enumerate(compound_names): my_data = [data[file_idx][compound_idx] for file_idx, _ in enumerate(file_names)] args_list.append({'data': my_data, @@ -2205,7 +2213,7 @@ def make_identification_figure_v2(input_fname='', input_dataset=[], include_lcms compound_names = ma_data.get_compound_names(data, use_labels)[0] file_names = ma_data.get_file_names(data) match = pd.DataFrame() - plt.ioff() + disable_interactive_plots() plt.clf() for compound_idx, _ in enumerate(compound_names): file_idxs, scores, msv_sample_list, msv_ref_list, rt_list = [], [], [], [], [] @@ -2398,6 +2406,7 @@ def plot_ms1_spectra(polarity = None, mz_min = 5, mz_max = 5, input_fname = '', lcms_polarity = 'ms1_' + polarity[:3] titles = ['Unscaled', 'Scaled', 'Full Range'] + disable_interactive_plots() for compound_idx in [i for i,c in enumerate(all_compound_names) if c in compound_names]: print(('compound is',compound_idx)) #Find file_idx of with highest RT peak @@ -2425,7 +2434,6 @@ def plot_ms1_spectra(polarity = None, mz_min = 5, mz_max = 5, input_fname = '', (df_all['mz'] < mz_peak_actual + mz_max) ] #Plot compound name, mz, and RT peak - plt.ioff() fig = plt.gcf() fig.suptitle('%s, m/z: %5.4f, rt: %f'%(all_compound_names[compound_idx], mz_peak_actual, rt_peak_actual), fontsize=8,weight='bold') @@ -3273,3 +3281,9 @@ def rt_range_overlaps(rt1, rt2): """ return ((rt2.rt_min <= rt1.rt_min <= rt2.rt_max) or (rt2.rt_min <= rt1.rt_max <= rt2.rt_max) or (rt1.rt_min <= rt2.rt_min <= rt1.rt_max) or (rt1.rt_min <= rt2.rt_max <= rt1.rt_max)) + + +def disable_interactive_plots(): + """Close interactive figures and turn off interactive plotting""" + adjust_rt_for_selected_compound.disable() + plt.ioff() From e347c2a2bc36e88c2a42b5115d3767a9d8965a12 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 3 Sep 2021 12:23:53 -0700 Subject: [PATCH 122/177] Add metadata to Final_Identifications filename --- metatlas/io/targeted_output.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index d111ae72..14a5f1fe 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -61,10 +61,9 @@ def write_stats_table( if False filter out row if MSMS thresholds are not passing overwrite: if True, will write over existing files """ - prefix = f"{metatlas_dataset.ids.short_polarity}_" - scores_path = os.path.join( - metatlas_dataset.ids.output_dir, f"{prefix}stats_tables", f"{prefix}compound_scores.csv" - ) + ids = metatlas_dataset.ids + prefix = f"{ids.short_polarity}_" + scores_path = os.path.join(ids.output_dir, f"{prefix}stats_tables", f"{prefix}compound_scores.csv") _ = metatlas_dataset.hits # regenerate hits if needed before logging about scores logger.info("Calculating scores and exporting them to %s.", scores_path) scores_df = fa.make_scores_df(metatlas_dataset, metatlas_dataset.hits) @@ -82,15 +81,15 @@ def write_stats_table( fa.make_stats_table( input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, - output_loc=metatlas_dataset.ids.output_dir, - 
output_sheetname=f"{metatlas_dataset.ids.project}_Final_Identifications.xlsx", + output_loc=ids.output_dir, + output_sheetname=f"{ids.project}_{ids.output_type}_Identifications.xlsx", min_peak_height=1e5, use_labels=True, min_msms_score=0.01, min_num_frag_matches=1, include_lcmsruns=[], exclude_lcmsruns=["QC"], - polarity=metatlas_dataset.ids.short_polarity, + polarity=ids.short_polarity, overwrite=overwrite, ) From c827efe371d114c70f9a84200d71e9cc1cfa49b2 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 3 Sep 2021 12:24:49 -0700 Subject: [PATCH 123/177] automate more of Final_Identifications --- metatlas/tools/fastanalysis.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py index 9d02cbbd..13eb2583 100644 --- a/metatlas/tools/fastanalysis.py +++ b/metatlas/tools/fastanalysis.py @@ -4,7 +4,7 @@ import os import multiprocessing as mp import pprint -from six.moves import range +import statistics import numpy as np import pandas as pd @@ -94,7 +94,6 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, msms_hits_df.reset_index(inplace=True) for compound_idx, compound_name in enumerate(compound_names): - ref_rt_peak = dataset[0][compound_idx]['identification'].rt_references[0].rt_peak ref_mz = dataset[0][compound_idx]['identification'].mz_references[0].mz @@ -222,15 +221,8 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'exact_mass'] = cid.compound[0].mono_isotopic_molecular_weight final_df.loc[compound_idx, 'inchi_key'] = cid.compound[0].inchi_key - if file_idxs != []: - if len(mz_sample_matches) == 1: - final_df.loc[compound_idx, 'msms_quality'] = 0 - elif scores[0] >= 0.8: - final_df.loc[compound_idx, 'msms_quality'] = 1 - else: - final_df.loc[compound_idx, 'msms_quality'] = "" - else: - final_df.loc[compound_idx, 'msms_quality'] = 0 + final_df.loc[compound_idx, 'identified_metabolite'] = final_df.loc[compound_idx, 'overlapping_compound'] or final_df.loc[compound_idx, 'label'] + final_df.loc[compound_idx, 'msms_quality'] = "" # this gets updated after ms2_notes column is added if delta_ppm <= 5 or delta_mz <= 0.001: final_df.loc[compound_idx, 'mz_quality'] = 1 @@ -250,13 +242,23 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'rt_quality'] = 0 else: final_df.loc[compound_idx, 'rt_quality'] = "" - - final_df.loc[compound_idx, 'total_score'] = "" - final_df.loc[compound_idx, 'msi_level'] = "" + final_df.loc[compound_idx, 'total_score'] = "" # this gets updated after ms2_notes column is added + final_df.loc[compound_idx, 'msi_level'] = "" # this gets updated after ms2_notes column is added final_df.loc[compound_idx, 'isomer_details'] = "" final_df.loc[compound_idx, 'identification_notes'] = cid.identification_notes final_df.loc[compound_idx, 'ms1_notes'] = cid.ms1_notes final_df.loc[compound_idx, 'ms2_notes'] = cid.ms2_notes + final_df.loc[compound_idx, 'msms_quality'] = int(final_df.loc[compound_idx, 'ms2_notes'].split(',')[0]) + scores = [final_df.loc[compound_idx, x] for x in ['msms_quality', 'mz_quality', 'rt_quality']] + final_df.loc[compound_idx, 'total_score'] = sum([x if x != '' else 0 for x in scores]) + if final_df.loc[compound_idx, 'msms_quality'] == -1: + final_df.loc[compound_idx, 'msi_level'] = "REMOVE, INVALIDATED BY BAD MSMS MATCH" + elif statistics.median(scores) < 1: + final_df.loc[compound_idx, 'msi_level'] = 
"putative" + elif sum(scores) == 3: + final_df.loc[compound_idx, 'msi_level'] = "Exceeds Level 1" + else: + final_df.loc[compound_idx, 'msi_level'] = "Level 1" if len(intensities) > 0: final_df.loc[compound_idx, 'max_intensity'] = intensities.loc[intensities['intensity'].idxmax()]['intensity'] max_intensity_file_id = int(intensities.loc[intensities['intensity'].idxmax()]['file_id']) From 7b1b6552641b08f9fa8660599016d186960fe89f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 3 Sep 2021 16:22:37 -0700 Subject: [PATCH 124/177] Install kernel in RT adjust slurm script --- papermill/launch_rt_prediction.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh index 9be26844..4ec2225b 100755 --- a/papermill/launch_rt_prediction.sh +++ b/papermill/launch_rt_prediction.sh @@ -14,6 +14,8 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" REPO_DIR="$(dirname "$SCRIPT_DIR")" EXP_DIR="${PROJECT_DIR}/$EXP" ANALYSIS_DIR="${EXP_DIR}/${USER}${ANALYSIS_NUM}" +KERNEL_SOURCE="${SCRIPT_DIR}/notebooks/kernels/metatlas-targeted.kernel.json" +KERNEL_DESTINATION="${HOME}/.local/share/jupyter/kernels/metatlas-targeted/kernel.json" IFS='_' read -ra TOKENS <<< "$EXP" PROPOSAL="${TOKENS[3]}" @@ -22,5 +24,8 @@ export IN_FILE="${REPO_DIR}/notebooks/reference/RT_Prediction.ipynb" export OUT_FILE="${ANALYSIS_DIR}/${PROPOSAL}_RT_Prediction_papermill.ipynb" export PARAMETERS="-p experiment $EXP -p metatlas_repo_path $REPO_DIR -p project_directory $PROJECT_DIR -p max_cpus 32 -p analysis_number $ANALYSIS_NUM" +mkdir -p "${HOME}/.local/share/jupyter/kernels/metatlas-targeted" +cp "$KERNEL_SOURCE" "$KERNEL_DESTINATION" + mkdir -p "$ANALYSIS_DIR" sbatch -J "${PROPOSAL}_RT_Pred" "${REPO_DIR}/papermill/slurm_template.sh" From 72996193aeb9de13aece467d919f8925a77524ea Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 3 Sep 2021 22:11:10 -0700 Subject: [PATCH 125/177] Add Targeted Analysis workflow documentation --- docs/Targeted_Analysis.md | 110 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 docs/Targeted_Analysis.md diff --git a/docs/Targeted_Analysis.md b/docs/Targeted_Analysis.md new file mode 100644 index 00000000..953cec96 --- /dev/null +++ b/docs/Targeted_Analysis.md @@ -0,0 +1,110 @@ +# Targeted HILIC/Polar Analysis Workflow + +## One-time setup + +### RClone configuration + +#### For MacOS/Linux +Open a terminal and run: +``` +curl https://rclone.org/install.sh | sudo bash +rclone config create metabolomics drive root_folder_id 0B-ZDcHbPi-aqZzE5V3hOZFc0dms +scp $(rclone config file | tail -1) \ dtn01.nersc.gov:~/.config/rclone/rclone.config +``` + +#### For Windows + +1. Download and unzip [rclone-current-windows-amd64.zip](https://downloads.rclone.org/rclone-current-windows-amd64.zip). +2. Open a powershell window +3. Run `rclone.exe config create metabolomics drive root_folder_id 0B-ZDcHbPi-aqZzE5V3hOZFc0dms` +4. Find your RClone configuration file location by running `rclone config file` +5. 
Transfer the RClone configuration file to `~/.config/rclone/rclone.conf` on Cori

### Checkout Metatlas code from git repository

On Cori run:
```
cd
git clone https://github.com/biorack/metatlas.git
cd metatlas
git checkout oo_mads3
```

### Make directory to store work in progress

On Cori run:
```
mkdir -p ~/metabolomics_data
```

## Per-project workflow

### Get updates to the metatlas code

On Cori run:
```
cd metatlas
git pull
```

### Perform RT correction

#### Parameters
The `experiment_name` parameter can be found in the [Sample Tracking and QC Checkpoints - Northen Lab](https://docs.google.com/spreadsheets/d/126t1OeXQnCCgP6e-6Pac_Ku_A1R7MQLm_tl_Dkqsv_w/edit#gid=1548851545) Google Sheet. The experiment names can be found on the 'New Extraction' sheet in either column 'M' or 'N' depending on the type of chromatography that was performed. This value will be something like `20210723_JGI-AK_DB-TM_506963_LemCreek_final_QE-HF_HILICZ_USHXG01494`.

The `analysis_number` parameter is an integer that you'll need to increment if you redo an analysis. It should be set to 0 initially.

The `project_directory` is where you want to store the analysis while working on it. You should use `~/metabolomics_data`.

#### Execution

On Cori run (where you substitute the 3 parameters described above):
```
cd ~/metatlas/papermill
./launch_rt_prediction.sh experiment_name analysis_number project_directory
```

This will submit a Slurm job. You will receive an email when the job starts executing and when it has completed.

#### Outputs

Once the job has completed, you should check the files generated to make sure the RT correction models look acceptable. You can find the output files at `~/metabolomics_data/EXPERIMENT_NAME/${USER}ANALYSIS_NUMBER/`.


### Perform ISTDsEtc Analysis

1. Launch [jupyter.nersc.gov](https://jupyter.nersc.gov/) in your web browser and start a 'Shared CPU Node'
2. Open `~/metabolomics_data/EXPERIMENT_NAME/${USER}ANALYSIS_NUMBER/PROJECT_ID_ISTDsEtc_POS.ipynb` within JupyterLab (you no longer need to use the Classic Notebook interface). If you are prompted to select a kernel, select 'Metatlas Targeted'.
3. The first code cell of the notebook contains descriptions of the parameters and their default values. The second code cell of the notebook contains parameter values that were auto-populated from the RT correction Slurm job. These values in the second code block will override the default values from the first code block. The third code block validates your parameter values and also validates that your environment is correctly configured. Execute the first 3 code cells and see if there are any errors. If you get an error message (usually error messages will be in red), you will need to correct the issue so that the cell executes without giving an error before moving on. The error messages commonly seen at this point in the workflow generally include some description of what action is needed to correct the problem.
4. Execute code blocks 4 and 5 to read in data and bring up the Annotation GUI.
5. For each of the compound-adduct pairs in your atlas, set the RT min and RT max boundaries to just contain the EIC peak that corresponds to the compound you are currently evaluating. For each compound-adduct pair, you must either select one of the MSMS-quality descriptors (upper set of radio buttons) or use the bottom set of radio buttons to mark the compound-adduct pair for removal.
Failure to set either MSMS-quality descriptors or the remove state for each compound-adduct pair will result in the subsequent step throwing an error.
6. Execute the 6th code block to generate output files and upload them to Google Drive.
7. Review your output files, which will be under [this Google Drive folder](https://drive.google.com/drive/folders/19Ofs5AHB3O8-NYApJUwj4YvH8TbKCGJW?usp=sharing).
8. Repeat steps 1-7 for the corresponding NEG mode notebook.
9. Move your output folder on Google Drive into the location indicated in column 'M' of the 'New Extraction' sheet in [Sample Tracking and QC Checkpoints - Northen Lab](https://docs.google.com/spreadsheets/d/126t1OeXQnCCgP6e-6Pac_Ku_A1R7MQLm_tl_Dkqsv_w/edit#gid=1548851545) Google Sheet.
10. Email Katherine a link to the output folder so that she can review your outputs.


### Perform FinalEMA-HILIC Analysis

1. Follow the same steps as the ISTDsEtc analysis except use the notebook name `PROJECT_ID_FinalEMA-HILIC_POS.ipynb`.
2. Open the `POS_PROJECT_ID_Final_Identifications.xlsx` file in the output directory on Google Drive.
3. Make sure everything looks okay in the spreadsheet.
4. If there are any compound-adduct pairs that need to be removed at this point (because they are duplicated or you can now determine a similar compound was a better match for a given peak), you can place 'REMOVE' in columns B, M, and N. In columns B and N you should also include some description such as 'REMOVE - duplicate' or 'REMOVE - other isomer preferred (tryptophan matches MSMS reference)' or 'REMOVE - other isomer preferred (tryptophan matches reference RT)'.
5. If you are able to resolve some overlapping identifications at this point, then update the value in column B for the preferred match to no longer include the name of the molecule that is no longer considered a possible match.
6. Repeat steps 1-5 for the corresponding NEG mode notebook.
7. Move your output folder on Google Drive into the location indicated in column 'M' of the 'New Extraction' sheet in [Sample Tracking and QC Checkpoints - Northen Lab](https://docs.google.com/spreadsheets/d/126t1OeXQnCCgP6e-6Pac_Ku_A1R7MQLm_tl_Dkqsv_w/edit#gid=1548851545) Google Sheet.
8. Email Katherine a link to the output folder so that she can review your outputs.
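Both the ISTDsEtc and FinalEMA-HILIC notebooks drive the same two entry points added to `metatlas_dataset.py` earlier in this patch series; a condensed sketch of what the notebook cells execute (the variables are supplied by the notebook's parameter cells and are placeholders here):
```
from metatlas.datastructures import metatlas_dataset as mads

metatlas_dataset = mads.pre_annotation(
    source_atlas, experiment, output_type, polarity, analysis_number,
    project_directory, google_folder, groups_controlled_vocab, exclude_files,
    num_points, peak_height, max_cpus,
)
metatlas_dataset.annotation_gui()       # code blocks 4-5: set RT bounds and notes
mads.post_annotation(metatlas_dataset)  # code block 6: check notes, write outputs
```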
+ +## Using the Annotation GUI + +### Key Bindings + +`l` or right arrow - next compound-adduct pair +`h' or left arrow - previous compound-adduct pair +`k` or up arrow - next MSMS reference for this compound-adduct pair +`j` or down arrow - previous MSMS reference for this compound-adduct pair +`x` - sets the remove radio button +`m` - steps through the similar compound-adduct pairs and matches the RT bounds to those of the similar compound-adduct pair +`z` - steps though zoom levels of 1x, 5x, 25x on the MSMS mirror plot by 5 +`s` - toggles on/off the red and blue shading of EIC plot that show RT ranges for similar compounds From c03984957afe97315ce409eeb084e845e67d509a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 3 Sep 2021 22:14:54 -0700 Subject: [PATCH 126/177] fix Targeted Analysis docs --- docs/Targeted_Analysis.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/Targeted_Analysis.md b/docs/Targeted_Analysis.md index 953cec96..bf9be59d 100644 --- a/docs/Targeted_Analysis.md +++ b/docs/Targeted_Analysis.md @@ -26,7 +26,7 @@ On Cori run: ``` cd git clone https://github.com/biorack/metatlas.git -cd metatlas +cd ~/metatlas git checkout oo_mads3 ``` @@ -101,10 +101,17 @@ Once the job has completed, you should check the files generated to make sure th ### Key Bindings `l` or right arrow - next compound-adduct pair -`h' or left arrow - previous compound-adduct pair + +`h` or left arrow - previous compound-adduct pair + `k` or up arrow - next MSMS reference for this compound-adduct pair + `j` or down arrow - previous MSMS reference for this compound-adduct pair + `x` - sets the remove radio button + `m` - steps through the similar compound-adduct pairs and matches the RT bounds to those of the similar compound-adduct pair + `z` - steps though zoom levels of 1x, 5x, 25x on the MSMS mirror plot by 5 + `s` - toggles on/off the red and blue shading of EIC plot that show RT ranges for similar compounds From 34099efebedffc7e575d3a9acc8f2a1f4feca776 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 7 Sep 2021 14:15:24 -0700 Subject: [PATCH 127/177] fix launch_rt_predict to copy kernel --- papermill/launch_rt_prediction.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/papermill/launch_rt_prediction.sh b/papermill/launch_rt_prediction.sh index 4ec2225b..a85c9e93 100755 --- a/papermill/launch_rt_prediction.sh +++ b/papermill/launch_rt_prediction.sh @@ -14,7 +14,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" REPO_DIR="$(dirname "$SCRIPT_DIR")" EXP_DIR="${PROJECT_DIR}/$EXP" ANALYSIS_DIR="${EXP_DIR}/${USER}${ANALYSIS_NUM}" -KERNEL_SOURCE="${SCRIPT_DIR}/notebooks/kernels/metatlas-targeted.kernel.json" +KERNEL_SOURCE="${REPO_DIR}/notebooks/kernels/metatlas-targeted.kernel.json" KERNEL_DESTINATION="${HOME}/.local/share/jupyter/kernels/metatlas-targeted/kernel.json" IFS='_' read -ra TOKENS <<< "$EXP" From eee226fd759a6bf92665786ef611888e591d95a1 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 7 Sep 2021 14:37:42 -0700 Subject: [PATCH 128/177] load python module in slurm script --- papermill/slurm_template.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/papermill/slurm_template.sh b/papermill/slurm_template.sh index 388d183e..d42cf02c 100755 --- a/papermill/slurm_template.sh +++ b/papermill/slurm_template.sh @@ -15,6 +15,7 @@ CONDA_DIR="$(dirname "$(dirname "$(grep 'metatlas-targeted' ../notebooks/kernels date echo "input file: $IN_FILE" echo "output file: $OUT_FILE" +module load 
python/3.8-anaconda-2020.11 eval "$(conda shell.bash hook)" conda activate "$CONDA_DIR" From 5fe9537c94a9b3f824809efb71e4e4563bfe6840 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 7 Sep 2021 14:43:21 -0700 Subject: [PATCH 129/177] fixes to Targeted Analysis documentation --- docs/Targeted_Analysis.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/Targeted_Analysis.md b/docs/Targeted_Analysis.md index bf9be59d..11db65a5 100644 --- a/docs/Targeted_Analysis.md +++ b/docs/Targeted_Analysis.md @@ -5,11 +5,19 @@ ### RClone configuration #### For MacOS/Linux -Open a terminal and run: + +Open a terminal on Cori and run: +` +mkdir -p ~/.config/rclone +` + +Open a terminal on your local machine and run: ``` -curl https://rclone.org/install.sh | sudo bash +curl --silent --show-error https://rclone.org/install.sh | sudo -k bash > /dev/null +# You will be prompted to enter your password, this allows the installation of rclone rclone config create metabolomics drive root_folder_id 0B-ZDcHbPi-aqZzE5V3hOZFc0dms -scp $(rclone config file | tail -1) \ dtn01.nersc.gov:~/.config/rclone/rclone.config +# You will be prompted in your web browser to grant rclone access to Google Drive +scp $(rclone config file | tail -1) dtn01.nersc.gov:~/.config/rclone/rclone.config ``` #### For Windows @@ -43,7 +51,7 @@ mkdir -p ~/metabolomics_data On Cori run: ``` -cd metatlas +cd ~/metatlas git pull ``` From e3128eed833bf6153cd48cdde6bcbd1e15b7f406 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 7 Sep 2021 16:00:09 -0700 Subject: [PATCH 130/177] Add to Targeted Analysis documentation --- docs/Targeted_Analysis.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Targeted_Analysis.md b/docs/Targeted_Analysis.md index 11db65a5..0daad284 100644 --- a/docs/Targeted_Analysis.md +++ b/docs/Targeted_Analysis.md @@ -13,7 +13,7 @@ mkdir -p ~/.config/rclone Open a terminal on your local machine and run: ``` -curl --silent --show-error https://rclone.org/install.sh | sudo -k bash > /dev/null +curl --silent --show-error https://rclone.org/install.sh | sudo bash > /dev/null # You will be prompted to enter your password, this allows the installation of rclone rclone config create metabolomics drive root_folder_id 0B-ZDcHbPi-aqZzE5V3hOZFc0dms # You will be prompted in your web browser to grant rclone access to Google Drive @@ -76,7 +76,7 @@ This will submit a slurm job. You will receive an email when the job starts exec #### Outputs -Once the job has completed, you should check the files generated to make sure the RT correction models look acceptable. You can find the output files at `~/metabolomics_data/EXPERIMENT_NAME/${USER}ANALYSIS_NUMBER/`. +Once the job has completed, you should check the files generated to make sure the RT correction models look acceptable. You can find the output files at `~/metabolomics_data/EXPERIMENT_NAME/${USER}ANALYSIS_NUMBER/`. One easy way to view these files is to open them from the [jupyter](https://jupyter.nersc.gov/) file browser. 
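For a quick programmatic check from a notebook cell, something like the following also works (the experiment name and `root0` analysis folder name are placeholders following the pattern above):
```
import pathlib

# substitute your own experiment and analysis folder names
analysis_dir = pathlib.Path.home() / "metabolomics_data" / "EXPERIMENT_NAME" / "root0"
print(sorted(path.name for path in analysis_dir.rglob("*")))
```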
### Perform ISTDsEtc Analysis

From 259e6cfc84198312f9b2bc04b55d704f3a31eb5f Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Tue, 7 Sep 2021 17:28:11 -0700
Subject: [PATCH 131/177] allow gui to be run more than once

---
 metatlas/plots/dill2plots.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py
index a1953006..a1a186a2 100644
--- a/metatlas/plots/dill2plots.py
+++ b/metatlas/plots/dill2plots.py
@@ -291,6 +291,8 @@ def __init__(self,
                                  'keymap.xscale': ['k'],
                                  'keymap.save': ['s'],
                                  'keymap.home': ['h']})
+
+        adjust_rt_for_selected_compound.disable()
         # Turn On interactive plot
         plt.ion()
         self.layout_figure()

From 85335a81121c405d6de64a5054e88e873ab6db76 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 8 Sep 2021 08:17:17 -0700
Subject: [PATCH 132/177] allow empty ms2_notes at Final Identification

---
 metatlas/tools/fastanalysis.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py
index 13eb2583..17c043e3 100644
--- a/metatlas/tools/fastanalysis.py
+++ b/metatlas/tools/fastanalysis.py
@@ -248,7 +248,10 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
         final_df.loc[compound_idx, 'identification_notes'] = cid.identification_notes
         final_df.loc[compound_idx, 'ms1_notes'] = cid.ms1_notes
         final_df.loc[compound_idx, 'ms2_notes'] = cid.ms2_notes
-        final_df.loc[compound_idx, 'msms_quality'] = int(final_df.loc[compound_idx, 'ms2_notes'].split(',')[0])
+        try:
+            final_df.loc[compound_idx, 'msms_quality'] = int(final_df.loc[compound_idx, 'ms2_notes'].split(',')[0])
+        except ValueError:
+            final_df.loc[compound_idx, 'msms_quality'] = ''
         scores = [final_df.loc[compound_idx, x] for x in ['msms_quality', 'mz_quality', 'rt_quality']]

From dcc6c4884567892fae52c063999a76e4823cd65a Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 8 Sep 2021 09:16:29 -0700
Subject: [PATCH 133/177] fix Final Identifications for ISTDsEtc

---
 metatlas/tools/fastanalysis.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py
index 17c043e3..46288fc2 100644
--- a/metatlas/tools/fastanalysis.py
+++ b/metatlas/tools/fastanalysis.py
@@ -253,15 +253,19 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
         except ValueError:
             final_df.loc[compound_idx, 'msms_quality'] = ''
         scores = [final_df.loc[compound_idx, x] for x in ['msms_quality', 'mz_quality', 'rt_quality']]
-        final_df.loc[compound_idx, 'total_score'] = sum([x if x != '' else 0 for x in scores])
-        if final_df.loc[compound_idx, 'msms_quality'] == -1:
-            final_df.loc[compound_idx, 'msi_level'] = "REMOVE, INVALIDATED BY BAD MSMS MATCH"
-        elif statistics.median(scores) < 1:
-            final_df.loc[compound_idx, 'msi_level'] = "putative"
-        elif sum(scores) == 3:
-            final_df.loc[compound_idx, 'msi_level'] = "Exceeds Level 1"
+        if all(isinstance(x, (int, float)) for x in scores):
+            final_df.loc[compound_idx, 'total_score'] = sum(scores)
+            if final_df.loc[compound_idx, 'msms_quality'] == -1:
+                final_df.loc[compound_idx, 'msi_level'] = "REMOVE, INVALIDATED BY BAD MSMS MATCH"
+            elif statistics.median(scores) < 1:
+                final_df.loc[compound_idx, 'msi_level'] = "putative"
+            elif sum(scores) == 3:
+                final_df.loc[compound_idx, 'msi_level'] = "Exceeds Level 1"
+            else:
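+                # e.g. scores of (1, 0.5, 0.5) have a median below 1 and are
+                # labeled "putative" above, while (1, 1, 0.5) reaches this
+                # branch and is reported as "Level 1"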
+ final_df.loc[compound_idx, 'msi_level'] = "Level 1" else: - final_df.loc[compound_idx, 'msi_level'] = "Level 1" + final_df.loc[compound_idx, 'total_score'] = "" + final_df.loc[compound_idx, 'msi_level'] = "" if len(intensities) > 0: final_df.loc[compound_idx, 'max_intensity'] = intensities.loc[intensities['intensity'].idxmax()]['intensity'] max_intensity_file_id = int(intensities.loc[intensities['intensity'].idxmax()]['file_id']) From d3a5072c4f0d544744b3e0bda8f59c422a84f5fa Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 8 Sep 2021 13:46:43 -0700 Subject: [PATCH 134/177] add username parameter to AnalysisId --- metatlas/datastructures/metatlas_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 8b5331e2..370840c1 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -1157,6 +1157,7 @@ def pre_annotation( num_points: int, peak_height: float, max_cpus: int, + username: Username = None, ) -> MetatlasDataset: """All data processing that needs to occur before the annotation GUI in Targeted notebook""" ids = AnalysisIdentifiers( @@ -1169,6 +1170,7 @@ def pre_annotation( google_folder=google_folder, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files, + username=getpass.getuser() if username is None else username ) metatlas_dataset = MetatlasDataset(ids=ids, max_cpus=max_cpus) if metatlas_dataset.ids.output_type in ["FinalEMA-HILIC"]: From 0beb16fa7d0947ed47ca49313581ca46db3bdd4d Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 8 Sep 2021 14:17:05 -0700 Subject: [PATCH 135/177] No error on repeated export of same atlas --- metatlas/io/targeted_output.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 14a5f1fe..718a7386 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -26,9 +26,8 @@ def write_atlas_to_spreadsheet(metatlas_dataset, overwrite=False): metatlas_dataset.ids.output_dir, f"{metatlas_dataset.atlas.name}_export.csv", ) - write_utils.check_existing_file(export_atlas_filename, overwrite) - dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas, export_atlas_filename) - logger.info("Exported atlas to file: %s.", export_atlas_filename) + atlas_export_df = dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas) + write_utils.export_dataframe(atlas_export_df, export_atlas_filename, "atlas", overwrite) def write_stats_table( From 0b909842639692660b232f55b5ba7768c48ef571 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 8 Sep 2021 15:40:38 -0700 Subject: [PATCH 136/177] fix no error on repeated export of same atlas --- metatlas/io/targeted_output.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 718a7386..837be55b 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -22,12 +22,9 @@ def write_atlas_to_spreadsheet(metatlas_dataset, overwrite=False): """Save atlas as csv file. 
Will not overwrite existing file unless overwrite is True""" - export_atlas_filename = os.path.join( - metatlas_dataset.ids.output_dir, - f"{metatlas_dataset.atlas.name}_export.csv", - ) - atlas_export_df = dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas) - write_utils.export_dataframe(atlas_export_df, export_atlas_filename, "atlas", overwrite) + out_file_name = os.path.join(metatlas_dataset.ids.output_dir, f"{metatlas_dataset.atlas.name}_export.csv") + out_df = dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas) + write_utils.export_dataframe_die_on_diff(out_df, out_file_name, "atlas", overwrite=overwrite) def write_stats_table( From 1cc11d0da8e08111a95d0ba5d5bcd1bc00c98698 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 8 Sep 2021 15:44:48 -0700 Subject: [PATCH 137/177] add overwrite to export_dataframe_die_on_diff --- metatlas/io/write_utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py index 7154f073..0c601f2f 100644 --- a/metatlas/io/write_utils.py +++ b/metatlas/io/write_utils.py @@ -67,18 +67,23 @@ def raise_on_diff(dataframe, file_path, description, **kwargs): raise -def export_dataframe_die_on_diff(dataframe, file_path, description, **kwargs): +def export_dataframe_die_on_diff(dataframe, file_path, description, overwrite=False, **kwargs): """ inputs: dataframe: pandas DataFrame to save file_path: string with path of file to create description: free string for logging + overwrite: bool kwargs: passed through to to_csv() + If overwrite then save the dataframe to file_path If file_path does not exist then save the dataframe there If file_path exists and matches data in dataframe then do nothing If file_path exists and does not match dataframe then raise ValueError """ - raise_on_diff(dataframe, file_path, description, **kwargs) - if not os.path.exists(file_path): + if overwrite: export_dataframe(dataframe, file_path, description, **kwargs) + else: + raise_on_diff(dataframe, file_path, description, **kwargs) + if not os.path.exists(file_path): + export_dataframe(dataframe, file_path, description, **kwargs) From f98083b52d69e3cb14d65d03ca099a9d6276a67f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 8 Sep 2021 17:03:50 -0700 Subject: [PATCH 138/177] use export_dataframe_die_on_diff for more outputs --- metatlas/io/targeted_output.py | 4 ++-- metatlas/plots/dill2plots.py | 4 +--- metatlas/tools/fastanalysis.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 837be55b..01ba0c70 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -73,7 +73,7 @@ def write_stats_table( min_num_frag_matches, min_relative_frag_intensity, ) - write_utils.export_dataframe(scores_df, scores_path, "scores", overwrite) + write_utils.export_dataframe_die_on_diff(scores_df, scores_path, "scores", overwrite=overwrite) fa.make_stats_table( input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, @@ -200,7 +200,7 @@ def write_msms_fragment_ions( ) out_df = pd.DataFrame(out) path = os.path.join(data.ids.output_dir, f"spectra_{intensity_fraction:.2f}pct_{int(min_mz)}cut.csv") - write_utils.export_dataframe(out_df, path, "MSMS fragment ions", overwrite) + write_utils.export_dataframe_die_on_diff(out_df, path, "MSMS fragment ions", overwrite=overwrite) return out_df diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index a1a186a2..9ff2a2b1 100644 --- 
a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -1465,9 +1465,7 @@ def make_output_dataframe(input_fname='', input_dataset=None, include_lcmsruns=N if output_loc: prefix = f"{polarity}_" if polarity != '' else '' df_path = os.path.join(output_loc, f"{prefix}{fieldname}.tab") - write_utils.check_existing_file(df_path, overwrite) - out.to_csv(df_path, sep="\t") - logger.info('Exported %s to %s.', fieldname, df_path) + write_utils.export_dataframe_die_on_diff(out, df_path, fieldname, overwrite=overwrite, sep="\t") return out diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py index 46288fc2..4be69b96 100644 --- a/metatlas/tools/fastanalysis.py +++ b/metatlas/tools/fastanalysis.py @@ -409,7 +409,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, if output_loc is not None: stats_tables_dir = os.path.join(output_loc, f"{prefix}stats_tables") stats_path = os.path.join(stats_tables_dir, f"{prefix}stats_table.tab") - write_utils.export_dataframe(stats_table, stats_path, 'stats table', overwrite, sep='\t') + write_utils.export_dataframe_die_on_diff(stats_table, stats_path, 'stats table', overwrite, sep='\t') readme_path = os.path.join(stats_tables_dir, f"{prefix}stats_table.readme") write_utils.check_existing_file(readme_path, overwrite) with open(readme_path, 'w') as readme: From 92750b6a8eebadb8b7d43c4ce72874b5bf1056f9 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 8 Sep 2021 20:23:26 -0700 Subject: [PATCH 139/177] fix export_dataframe_die_on_diff overwrite=True --- metatlas/io/write_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py index 0c601f2f..2845dffb 100644 --- a/metatlas/io/write_utils.py +++ b/metatlas/io/write_utils.py @@ -82,7 +82,7 @@ def export_dataframe_die_on_diff(dataframe, file_path, description, overwrite=Fa If file_path exists and does not match dataframe then raise ValueError """ if overwrite: - export_dataframe(dataframe, file_path, description, **kwargs) + export_dataframe(dataframe, file_path, description, overwrite, **kwargs) else: raise_on_diff(dataframe, file_path, description, **kwargs) if not os.path.exists(file_path): From 2d8923005f26d3b8007e9a0f40f3148edb4b0d36 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 9 Sep 2021 15:11:22 -0700 Subject: [PATCH 140/177] fix rclone setup instructions --- docs/Targeted_Analysis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Targeted_Analysis.md b/docs/Targeted_Analysis.md index 0daad284..1350f867 100644 --- a/docs/Targeted_Analysis.md +++ b/docs/Targeted_Analysis.md @@ -17,7 +17,7 @@ curl --silent --show-error https://rclone.org/install.sh | sudo bash > /dev/null # You will be prompted to enter your password, this allows the installation of rclone rclone config create metabolomics drive root_folder_id 0B-ZDcHbPi-aqZzE5V3hOZFc0dms # You will be prompted in your web browser to grant rclone access to Google Drive -scp $(rclone config file | tail -1) dtn01.nersc.gov:~/.config/rclone/rclone.config +scp $(rclone config file | tail -1) dtn01.nersc.gov:~/.config/rclone/rclone.conf ``` #### For Windows From 5d4c958ffa9d6dd6e5525089498c1bac88118473 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 10 Sep 2021 13:55:13 -0700 Subject: [PATCH 141/177] fix scores overwrite in Final Identifications --- metatlas/tools/fastanalysis.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/metatlas/tools/fastanalysis.py 
b/metatlas/tools/fastanalysis.py index 4be69b96..c32fd2d7 100644 --- a/metatlas/tools/fastanalysis.py +++ b/metatlas/tools/fastanalysis.py @@ -252,14 +252,14 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'msms_quality'] = int(final_df.loc[compound_idx, 'ms2_notes'].split(',')[0]) except ValueError: final_df.loc[compound_idx, 'msms_quality'] = '' - scores = [final_df.loc[compound_idx, x] for x in ['msms_quality', 'mz_quality', 'rt_quality']] - if all(isinstance(x, (int, float)) for x in scores): - final_df.loc[compound_idx, 'total_score'] = sum(scores) + quality_scores = [final_df.loc[compound_idx, x] for x in ['msms_quality', 'mz_quality', 'rt_quality']] + if all(isinstance(x, (int, float)) for x in quality_scores): + final_df.loc[compound_idx, 'total_score'] = sum(quality_scores) if final_df.loc[compound_idx, 'msms_quality'] == -1: final_df.loc[compound_idx, 'msi_level'] = "REMOVE, INVALIDATED BY BAD MSMS MATCH" - elif statistics.median(scores) < 1: + elif statistics.median(quality_scores) < 1: final_df.loc[compound_idx, 'msi_level'] = "putative" - elif sum(scores) == 3: + elif sum(quality_scores) == 3: final_df.loc[compound_idx, 'msi_level'] = "Exceeds Level 1" else: final_df.loc[compound_idx, 'msi_level'] = "Level 1" From c855d63ebb0eb6679c49b06161356623638b9975 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 10 Sep 2021 15:24:41 -0700 Subject: [PATCH 142/177] make POS/NEG output folders --- metatlas/datastructures/metatlas_dataset.py | 5 ++++- tests/system/test_targeted.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 370840c1..f984b389 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -253,7 +253,10 @@ def short_polarity_inverse(self) -> List[ShortPolarity]: @property def output_dir(self) -> PathString: """Creates the output directory and returns the path as a string""" - out = os.path.join(self.project_directory, self.experiment, self.analysis, self.output_type) + sub_dirs = [self.experiment, self.analysis, self.output_type] + if self.output_type != 'data_QC': + sub_dirs.append(self.short_polarity) + out = os.path.join(self.project_directory, *sub_dirs) os.makedirs(out, exist_ok=True) return PathString(out) diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index c9892908..7ae2dc72 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -8,7 +8,7 @@ def test_targeted_by_line01_with_remove(tmp_path): experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" expected = {} expected[ - str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_peak_height.tab") + str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS/POS_data_sheets/POS_peak_height.tab") ] = "\n".join( [ f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 @@ -22,7 +22,7 @@ def test_targeted_by_line01_with_remove(tmp_path): "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", ] ) - expected[str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS_data_sheets/POS_rt_peak.tab")] = "\n".join( + expected[str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS/POS_data_sheets/POS_rt_peak.tab")] = "\n".join( [ 
f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", From 7531ee38742359dd126b26f7716b47acfb97639f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 10 Sep 2021 20:46:17 -0700 Subject: [PATCH 143/177] Raise error if no matching h5 files are found --- metatlas/datastructures/metatlas_dataset.py | 24 +++++++++------------ 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index f984b389..6a9df86c 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -614,19 +614,15 @@ def _clone_source_atlas(self) -> metob.Atlas: def _build(self) -> None: """Populate self._data from database and h5 files.""" start_time = datetime.datetime.now() - files = [] - for group in self.ids.groups: - for h5_file in group.items: - files.append( - ( - h5_file, - group, - self.atlas_df, - self.atlas, - self.extra_time, - self.extra_mz, - ) - ) + files = [(h5_file, group, self.atlas_df, self.atlas, self.extra_time, self.extra_mz) + for group in self.ids.groups + for h5_file in group.items] + try: + if len(files) == 0: + raise ValueError('No matching h5 files were found') + except ValueError as err: + logger.exception(err) + raise err logger.info("Generating MetatlasDataset by reading MSMS data from h5 files") samples = parallel.parallel_process( ma_data.get_data_for_atlas_df_and_file, files, self.max_cpus, unit="sample", spread_args=False @@ -1173,7 +1169,7 @@ def pre_annotation( google_folder=google_folder, groups_controlled_vocab=groups_controlled_vocab, exclude_files=exclude_files, - username=getpass.getuser() if username is None else username + username=getpass.getuser() if username is None else username, ) metatlas_dataset = MetatlasDataset(ids=ids, max_cpus=max_cpus) if metatlas_dataset.ids.output_type in ["FinalEMA-HILIC"]: From 11c6a6200645a1ba255c2f32edfb6c28c57de066 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 10 Sep 2021 20:53:20 -0700 Subject: [PATCH 144/177] all ms2_notes in tests now start with number, --- tests/unit/conftest.py | 2 +- tests/unit/test_dill2plot.py | 2 +- tests/unit/test_metatlas_get_data_helper_fun.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 1f4978da..1c38c50d 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -957,7 +957,7 @@ def fixture_compound_identification(compound, rt_reference, mz_reference, userna ident.internal_standard_id = "" ident.do_normalization = False ident.identification_notes = "my id note" - ident.ms2_notes = "bad match to ref" + ident.ms2_notes = "-1,bad match to ref" ident.ms1_notes = "keep" ident.frag_references = [] ident.intensity_references = [] diff --git a/tests/unit/test_dill2plot.py b/tests/unit/test_dill2plot.py index 5d56ebeb..5e87947e 100644 --- a/tests/unit/test_dill2plot.py +++ b/tests/unit/test_dill2plot.py @@ -81,7 +81,7 @@ def test_export_atlas_to_spreadsheet(atlas, username): expected = ( 
"""{"chebi_id":{"0":"CHEBI:17256"},"chebi_url":{"0":"http://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:17256"},"creation_time":{"0":1466212395.0},"description":{"0":"A purine 2'-deoxyribonucleoside having adenine as the nucleobase."},"formula":{"0":"C10H13N5O3"},"head_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"hmdb_id":{"0":"HMDB00101"},"hmdb_url":{"0":"http://www.hmdb.ca/metabolites/HMDB00101"},"img_abc_id":{"0":""},"inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"iupac_name":{"0":""},"kegg_id":{"0":"C00559"},"kegg_url":{"0":"http://www.genome.jp/dbget-bin/www_bget?C00559"},"last_modified":{"0":1612996604.0},"lipidmaps_id":{"0":""},"lipidmaps_url":{"0":""},"metacyc_id":{"0":"DEOXYADENOSINE"},"mono_isotopic_molecular_weight":{"0":251.101839276},"name":{"0":"2'-deoxyadenosine"},"neutralized_2d_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)"},"neutralized_2d_inchi_key":{"0":"OLXZPDWKRNYJJZ-UHFFFAOYSA-N"},"neutralized_inchi":{"0":"InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1"},"neutralized_inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N"},"num_free_radicals":{"0":0.0},"number_components":{"0":1.0},"permanent_charge":{"0":0.0},"prev_uid":{"0":"origin"},"pubchem_compound_id":{"0":"13730"},"pubchem_url":{"0":"http://pubchem.ncbi.nlm.nih.gov/compound/13730"},"source":{"0":"gnps///chebi///metacyc///hmdb"},"synonyms":{"0":"2'-deoxyadenosine"},"unique_id":{"0":"60cd6743e56545c6a6cb066ec3553450"},"username":{"0":""" # noqa: E501 f'"{username}"' - """},"wikipedia_url":{"0":""},"label":{"0":"2'-deoxyadenosine"},"id_notes":{"0":"No description"},"ms1_notes":{"0":"keep"},"ms2_notes":{"0":"bad match to ref"},"identification_notes":{"0":"my id note"},"rt_min":{"0":1.6964640054},"rt_max":{"0":2.6964640054},"rt_peak":{"0":2.1964640054},"mz":{"0":252.1091393},"mz_tolerance":{"0":20.0},"adduct":{"0":"[M+H]+"},"polarity":{"0":"positive"}}""" + """},"wikipedia_url":{"0":""},"label":{"0":"2'-deoxyadenosine"},"id_notes":{"0":"No description"},"ms1_notes":{"0":"keep"},"ms2_notes":{"0":"-1,bad match to ref"},"identification_notes":{"0":"my id note"},"rt_min":{"0":1.6964640054},"rt_max":{"0":2.6964640054},"rt_peak":{"0":2.1964640054},"mz":{"0":252.1091393},"mz_tolerance":{"0":20.0},"adduct":{"0":"[M+H]+"},"polarity":{"0":"positive"}}""" ) # noqa: E501 assert expected == dill2plots.export_atlas_to_spreadsheet(atlas).to_json().replace(r"\/", "/") diff --git a/tests/unit/test_metatlas_get_data_helper_fun.py b/tests/unit/test_metatlas_get_data_helper_fun.py index e5d5000b..f3483aba 100644 --- a/tests/unit/test_metatlas_get_data_helper_fun.py +++ b/tests/unit/test_metatlas_get_data_helper_fun.py @@ -8,7 +8,7 @@ def test_make_atlas_df(atlas_with_2_cids): # pylint: disable=line-too-long - expected = 
"""{"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","1":"OIRDTQYFTABQOQ-KQYNXXCUSA-N"},"compound_name":{"0":"2\'-deoxyadenosine","1":"adenosine"},"rt_max":{"0":2.6964640054,"1":3.523318408},"rt_min":{"0":1.6964640054,"1":2.523318408},"rt_peak":{"0":2.1964640054,"1":3.023318408},"rt_units":{"0":"min","1":"min"},"detected_polarity":{"0":"positive","1":"positive"},"mz":{"0":252.1091393,"1":268.1040539},"mz_tolerance":{"0":20.0,"1":20.0},"mz_tolerance_units":{"0":"ppm","1":"ppm"},"mono_isotopic_molecular_weight":{"0":251.101839276,"1":267.096753896},"pubchem_compound_id":{"0":"13730","1":"60961"},"synonyms":{"0":"2\'-deoxyadenosine","1":"adenosine\\/\\/\\/58-61-7\\/\\/\\/Adenocard\\/\\/\\/Adenoscan"},"inchi":{"0":"InChI=1S\\/C10H13N5O3\\/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7\\/h3-7,16-17H,1-2H2,(H2,11,12,13)\\/t5-,6+,7+\\/m0\\/s1","1":"InChI=1S\\/C10H13N5O4\\/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10\\/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)\\/t4-,6-,7-,10-\\/m1\\/s1"},"adduct":{"0":"[M+H]+","1":"[M+H]+"},"label":{"0":"2\'-deoxyadenosine","1":"adenosine"},"ms1_notes":{"0":"keep","1":""},"ms2_notes":{"0":"bad match to ref","1":""},"identification_notes":{"0":"my id note","1":""}}""" # noqa: E501 + expected = """{"inchi_key":{"0":"OLXZPDWKRNYJJZ-RRKCRQDMSA-N","1":"OIRDTQYFTABQOQ-KQYNXXCUSA-N"},"compound_name":{"0":"2\'-deoxyadenosine","1":"adenosine"},"rt_max":{"0":2.6964640054,"1":3.523318408},"rt_min":{"0":1.6964640054,"1":2.523318408},"rt_peak":{"0":2.1964640054,"1":3.023318408},"rt_units":{"0":"min","1":"min"},"detected_polarity":{"0":"positive","1":"positive"},"mz":{"0":252.1091393,"1":268.1040539},"mz_tolerance":{"0":20.0,"1":20.0},"mz_tolerance_units":{"0":"ppm","1":"ppm"},"mono_isotopic_molecular_weight":{"0":251.101839276,"1":267.096753896},"pubchem_compound_id":{"0":"13730","1":"60961"},"synonyms":{"0":"2\'-deoxyadenosine","1":"adenosine\\/\\/\\/58-61-7\\/\\/\\/Adenocard\\/\\/\\/Adenoscan"},"inchi":{"0":"InChI=1S\\/C10H13N5O3\\/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7\\/h3-7,16-17H,1-2H2,(H2,11,12,13)\\/t5-,6+,7+\\/m0\\/s1","1":"InChI=1S\\/C10H13N5O4\\/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10\\/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)\\/t4-,6-,7-,10-\\/m1\\/s1"},"adduct":{"0":"[M+H]+","1":"[M+H]+"},"label":{"0":"2\'-deoxyadenosine","1":"adenosine"},"ms1_notes":{"0":"keep","1":""},"ms2_notes":{"0":"-1,bad match to ref","1":""},"identification_notes":{"0":"my id note","1":""}}""" # noqa: E501 assert expected == gdhf.make_atlas_df(atlas_with_2_cids).to_json() @@ -466,7 +466,7 @@ def test_get_data_for_atlas_df_and_file(lcmsrun, group, atlas_df, atlas, usernam "internal_standard_to_use": "", "last_modified": "2021-08-16T12:04:52", "ms1_notes": "keep", - "ms2_notes": "bad match to ref", + "ms2_notes": "-1,bad match to ref", "mz_references": [ { "adduct": "[M+H]+", From e8988b73ee307e61401033c6abccfe16a687591e Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 10 Sep 2021 20:58:12 -0700 Subject: [PATCH 145/177] add POS+NEG nonQC h5 files to ci02 docker image --- docker/Dockerfile.ci02 | 9 +++++++++ docker/rt_predict_test_case_from_db.sql | 17 ++++++++++++++++- tests/system/test_rt_predict.py | 4 ++-- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ci02 b/docker/Dockerfile.ci02 index c3c3ed52..6471a315 100644 --- a/docker/Dockerfile.ci02 +++ b/docker/Dockerfile.ci02 @@ -17,12 +17,21 @@ ADD https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 /usr/loca RUN chmod +x /usr/local/bin/jq RUN mkdir -p /io 
/src /work $REFS_DIR $H5_DIR +ADD $BASE_DATA_URL/msms_refs_v3.tab $REFS_DIR/ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5 $H5_DIR/ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5 $H5_DIR/ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run309.h5 $H5_DIR/ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run8.h5 $H5_DIR/ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5 $H5_DIR/ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_53_Cone-S1_5_Rg70to1050-CE102040-QlobataAkingi-S1_Run188.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run41.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_58_Cone-S2_2_Rg70to1050-CE102040-QlobataAkingi-S1_Run56.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_59_Cone-S2_3_Rg70to1050-CE102040-QlobataAkingi-S1_Run87.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_53_Cone-S1_5_Rg70to1050-CE102040-QlobataAkingi-S1_Run187.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_54_Cone-S1_6_Rg70to1050-CE102040-QlobataAkingi-S1_Run221.h5 $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5 $H5_DIR/ COPY requirements.txt /requirements.txt RUN pip install --quiet -r requirements.txt diff --git a/docker/rt_predict_test_case_from_db.sql b/docker/rt_predict_test_case_from_db.sql index f13dfb0e..44d9122e 100644 --- a/docker/rt_predict_test_case_from_db.sql +++ b/docker/rt_predict_test_case_from_db.sql @@ -55,7 +55,22 @@ LEFT JOIN ( JOIN ( SELECT MAX(creation_time) AS ctime, hdf5_file FROM lcmsruns - WHERE (name LIKE '20201106\_JGI-AK\_PS-KM\_505892\_OakGall\_final\_QE-HF\_HILICZ\_USHXG01583\_%\_QC\_%') + WHERE name in ( + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run309.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run8.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.mzML', + 
'20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_53_Cone-S1_5_Rg70to1050-CE102040-QlobataAkingi-S1_Run187.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run41.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_53_Cone-S1_5_Rg70to1050-CE102040-QlobataAkingi-S1_Run188.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_54_Cone-S1_6_Rg70to1050-CE102040-QlobataAkingi-S1_Run221.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_58_Cone-S2_2_Rg70to1050-CE102040-QlobataAkingi-S1_Run56.mzML', + '20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_NEG_MSMS_59_Cone-S2_3_Rg70to1050-CE102040-QlobataAkingi-S1_Run87.mzML' + ) GROUP BY hdf5_file ) AS early ON l1.creation_time=early.ctime AND l1.hdf5_file=early.hdf5_file diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py index ec39b703..01ad7e9d 100644 --- a/tests/system/test_rt_predict.py +++ b/tests/system/test_rt_predict.py @@ -3,8 +3,8 @@ from . import utils -def test_targeted_by_line01_with_remove(tmp_path): - image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci02:v1.3.5" +def test_rt_predict_by_line01(tmp_path): + image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci02:v1.4.6" experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583" expected = {} expected[ From 1df85cfdde96baecccb89b127907b96f70fd7992 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 10 Sep 2021 21:06:44 -0700 Subject: [PATCH 146/177] Small reorg of RT_predict notebook --- metatlas/tools/predict_rt.py | 40 +++++++++++++++++-------- notebooks/reference/RT_Prediction.ipynb | 2 +- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index 7156d514..bf350ed1 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -90,20 +90,13 @@ def predict(self, x_values): return self.sk_model.predict(x_transformed) -def generate_rt_correction_models( - ids, cpus, repo_dir, save_to_db=True, use_poly_model=True, model_only=False -): +def generate_rt_correction_models(ids: mads.AnalysisIdentifiers, cpus: int) -> (Model, Model): """ - Generate the RT correction models and associated atlases with adjusted RT values + Generate the RT correction models and model characterization files inputs: ids: an AnalysisIds object matching the one used in the main notebook cpus: max number of cpus to use - repo_dir: location of metatlas git repo on local filesystem - save_to_db: If True, save the new atlases to the database - use_poly_model: If True, use the polynomial model, else use linear model - Both types of models are always generated, this only determines which ones - are pre-populated into the generated notebooks - model_only: If True, do not create atlases or notebooks, if False create them + Returns a tuple with a linear and polynomial model """ # pylint: disable=too-many-locals groups = get_groups(ids) @@ -111,9 +104,12 @@ def generate_rt_correction_models( qc_atlas, qc_atlas_df = get_qc_atlas(ids) # this metatlas_dataset is not a class
instance. Only has metatlas_dataset[file_idx][compound_idx]... metatlas_dataset = load_runs(files_df, qc_atlas_df, qc_atlas, cpus) - if len(metatlas_dataset) == 0: - logger.error("No matching LCMS runs, terminating without generating outputs.") - return + try: + if len(metatlas_dataset) == 0: + raise ValueError("No matching LCMS runs, terminating without generating outputs.") + except ValueError as err: + logger.exception(err) + raise err save_rt_peak(metatlas_dataset, os.path.join(ids.output_dir, "rt_peak.tab")) save_measured_rts(metatlas_dataset, os.path.join(ids.output_dir, "QC_Measured_RTs.csv")) rts_df = get_rts(metatlas_dataset) @@ -129,6 +125,24 @@ def generate_rt_correction_models( save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, rt_comparison_file_name) models_file_name = os.path.join(ids.output_dir, "rt_model.txt") write_models(models_file_name, linear, poly, groups, qc_atlas) + return (linear, poly) + + +def generate_outputs(ids, cpus, repo_dir, save_to_db=True, use_poly_model=True, model_only=False): + """ + Generate the RT correction models, associated atlases with adjusted RT values, follow up notebooks, + msms hits pickles + inputs: + ids: an AnalysisIds object matching the one used in the main notebook + cpus: max number of cpus to use + repo_dir: location of metatlas git repo on local filesystem + save_to_db: If True, save the new atlases to the database + use_poly_model: If True, use the polynomial model, else use linear model + Both types of models are always generated, this only determines which ones + are pre-populated into the generated notebooks + model_only: If True, do not create atlases or notebooks, if False create them + """ + linear, poly = generate_rt_correction_models(ids, cpus) if not model_only: atlases = create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) write_notebooks(ids, atlases, repo_dir, use_poly_model) diff --git a/notebooks/reference/RT_Prediction.ipynb b/notebooks/reference/RT_Prediction.ipynb index 3184e04e..61edecb0 100644 --- a/notebooks/reference/RT_Prediction.ipynb +++ b/notebooks/reference/RT_Prediction.ipynb @@ -125,7 +125,7 @@ "metadata": {}, "outputs": [], "source": [ - "predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path)" + "predict_rt.generate_outputs(ids, max_cpus, metatlas_repo_path)" ] } ], From 517a95dccac5ea4c933436ddd258327eaec75192 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 13 Sep 2021 11:36:33 -0700 Subject: [PATCH 147/177] Add caching of msms hits RT predict notebook and slurm job will now generate and cache the msms hits, which will automatically be loaded into the corresponding targeted analysis notebooks. 
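The cache is keyed on a metadata dict rather than on a fixed file name, so changing any
generation parameter simply misses the cache and regenerates. A minimal standalone sketch
of the pattern follows; the function names and the cache_dir argument are illustrative
stand-ins, and the real implementation is the MetatlasDataset._save_to_cache /
_query_cache pair in the diff below:

```python
# Sketch of the metadata-keyed pickle cache (illustrative names, not the
# exact MetatlasDataset API; see the diff below for the real implementation).
import glob
import os
import pickle
import uuid
from typing import Any, Optional


def save_to_cache(cache_dir: str, data: Any, metadata: dict) -> None:
    """Pickle data next to a metadata record describing how it was generated."""
    assert "_variable_name" in metadata
    metadata = metadata.copy()
    base_name = f"{metadata['_variable_name']}_{uuid.uuid4()}"
    metadata["_pickle_file_name"] = os.path.join(cache_dir, f"{base_name}.pkl")
    with open(metadata["_pickle_file_name"], "wb") as pickle_fh:
        pickle.dump(data, pickle_fh)
    with open(os.path.join(cache_dir, f"{base_name}.metadata"), "wb") as metadata_fh:
        pickle.dump(metadata, metadata_fh)


def query_cache(cache_dir: str, required_metadata: dict) -> Optional[Any]:
    """Return the cached value whose metadata record matches exactly, else None."""
    assert "_variable_name" in required_metadata
    pattern = os.path.join(cache_dir, f"{required_metadata['_variable_name']}_*.metadata")
    for metadata_file in glob.glob(pattern):
        with open(metadata_file, "rb") as metadata_fh:
            candidate = pickle.load(metadata_fh)
        # the query dict never contains '_pickle_file_name', so drop it before comparing
        pickle_file_name = candidate.pop("_pickle_file_name")
        if candidate == required_metadata:
            with open(pickle_file_name, "rb") as pickle_fh:
                return pickle.load(pickle_fh)
    return None
```

In the hits property this metadata dict records polarity, extra_time, keep_nonmatches,
frag_mz_tolerance, ref_loc, extra_mz, and output_type, so a cached hits DataFrame is only
reused when every parameter that influenced it matches.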
--- metatlas/datastructures/metatlas_dataset.py | 96 ++++++++++++++++----- metatlas/tools/predict_rt.py | 32 ++++++- papermill/slurm_template.sh | 2 +- tests/system/test_rt_predict.py | 2 +- tests/system/test_targeted.py | 6 +- tests/unit/test_metatlas_dataset.py | 37 ++++++++ 6 files changed, 149 insertions(+), 26 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 6a9df86c..b8b795a0 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -6,7 +6,9 @@ import glob import logging import os +import pickle import shutil +import uuid from typing import cast, Any, Dict, List, NewType, Optional, Tuple, TypedDict, Union @@ -254,12 +256,19 @@ def short_polarity_inverse(self) -> List[ShortPolarity]: def output_dir(self) -> PathString: """Creates the output directory and returns the path as a string""" sub_dirs = [self.experiment, self.analysis, self.output_type] - if self.output_type != 'data_QC': + if self.output_type != "data_QC": sub_dirs.append(self.short_polarity) out = os.path.join(self.project_directory, *sub_dirs) os.makedirs(out, exist_ok=True) return PathString(out) + @property + def cache_dir(self) -> PathString: + """Creates directory for storing cache files and returns the path as a string""" + out = os.path.join(self.project_directory, self.experiment, "cache") + os.makedirs(out, exist_ok=True) + return PathString(out) + @property def lcmsruns(self) -> List[metob.LcmsRun]: """Get LCMS runs from DB matching experiment""" @@ -541,6 +550,34 @@ def __init__(self, **kwargs) -> None: self.write_data_source_files() self.ids.write_lcmsruns_short_names() + def _save_to_cache(self, data: Any, metadata: dict) -> None: + assert "_variable_name" in metadata.keys() + metadata = metadata.copy() + name = metadata["_variable_name"] + base_name = f"{name}_{uuid.uuid4()}" + metadata["_pickle_file_name"] = os.path.join(self.ids.cache_dir, f"{base_name}.pkl") + metadata_file_name = os.path.join(self.ids.cache_dir, f"{base_name}.metadata") + with open(metadata["_pickle_file_name"], "wb") as pickle_fh: + pickle.dump(data, pickle_fh) + with open(metadata_file_name, "wb") as metadata_fh: + pickle.dump(metadata, metadata_fh) + logger.info("Caching %s in %s.", name, metadata["_pickle_file_name"]) + + def _query_cache(self, required_metadata: dict) -> Optional[Any]: + assert "_variable_name" in required_metadata.keys() + name = required_metadata["_variable_name"] + for metadata_file in glob.glob(os.path.join(self.ids.cache_dir, f"{name}_*.metadata")): + with open(metadata_file, "rb") as metadata_fh: + potential_metadata = pickle.load(metadata_fh) + pickle_file_name = potential_metadata["_pickle_file_name"] + # require_metadata does not have a '_pickle_file_name' key, so remove before equality test + del potential_metadata["_pickle_file_name"] + if required_metadata == potential_metadata: + with open(pickle_file_name, "rb") as pickle_fh: + logger.info("Loading cached %s from %s.", name, pickle_file_name) + return pickle.load(pickle_fh) + return None + def write_data_source_files(self) -> None: """Write the data source files if they don't already exist""" data_sources_dir = os.path.join(self.ids.output_dir, f"{self.ids.short_polarity}_data_sources") @@ -614,12 +651,14 @@ def _clone_source_atlas(self) -> metob.Atlas: def _build(self) -> None: """Populate self._data from database and h5 files.""" start_time = datetime.datetime.now() - files = [(h5_file, group, self.atlas_df, self.atlas, 
self.extra_time, self.extra_mz) - for group in self.ids.groups - for h5_file in group.items] + files = [ + (h5_file, group, self.atlas_df, self.atlas, self.extra_time, self.extra_mz) + for group in self.ids.groups + for h5_file in group.items + ] try: if len(files) == 0: - raise ValueError('No matching h5 files were found') + raise ValueError("No matching h5 files were found") except ValueError as err: logger.exception(err) raise err @@ -871,22 +910,37 @@ def hits(self) -> pd.DataFrame: _ = self.atlas_df # regenerate if needed before logging hits generation _ = self.data # regenerate if needed before logging hits generation if self._hits is None: - logger.info( - "Generating hits with extra_time=%.3f, frag_mz_tolerance=%.4f, msms_refs_loc=%s.", - self.extra_time, - self.frag_mz_tolerance, - self.msms_refs_loc, - ) - start_time = datetime.datetime.now() - self._hits = dp.get_msms_hits( - self.data, - extra_time=self.extra_time > 0, - keep_nonmatches=self.keep_nonmatches, - frag_mz_tolerance=self.frag_mz_tolerance, - ref_loc=self.msms_refs_loc, - ) - logger.info("Generated %d hits in %s.", len(self._hits), _duration_since(start_time)) - self._hits_valid_for_rt_bounds = True + metadata = { + "_variable_name": "hits", + "polarity": self.ids.polarity, + "extra_time": self.extra_time, + "keep_nonmatches": self.keep_nonmatches, + "frag_mz_tolerance": self.frag_mz_tolerance, + "ref_loc": self.msms_refs_loc, + "extra_mz": self.extra_mz, + "output_type": self.ids.output_type, + } + self._hits = self._query_cache(metadata) + if self._hits is None: + logger.info( + "Generating hits with extra_time=%.3f, frag_mz_tolerance=%.4f, msms_refs_loc=%s.", + self.extra_time, + self.frag_mz_tolerance, + self.msms_refs_loc, + ) + start_time = datetime.datetime.now() + self._hits = dp.get_msms_hits( + self.data, + extra_time=self.extra_time > 0, + keep_nonmatches=self.keep_nonmatches, + frag_mz_tolerance=self.frag_mz_tolerance, + ref_loc=self.msms_refs_loc, + ) + logger.info("Generated %d hits in %s.", len(self._hits), _duration_since(start_time)) + self._hits_valid_for_rt_bounds = True + self._save_to_cache(self._hits, metadata) + else: + self._hits_valid_for_rt_bounds = False # unsure, so assume False return self._hits def __len__(self) -> int: diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index bf350ed1..a095f78e 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -8,6 +8,7 @@ from datetime import datetime from pathlib import Path +from typing import Tuple import matplotlib.pyplot as plt import matplotlib.ticker as mticker @@ -90,7 +91,7 @@ def predict(self, x_values): return self.sk_model.predict(x_transformed) -def generate_rt_correction_models(ids: mads.AnalysisIdentifiers, cpus: int) -> (Model, Model): +def generate_rt_correction_models(ids: mads.AnalysisIdentifiers, cpus: int) -> Tuple[Model, Model]: """ Generate the RT correction models and model characterization files inputs: @@ -146,11 +147,40 @@ def generate_outputs(ids, cpus, repo_dir, save_to_db=True, use_poly_model=True, if not model_only: atlases = create_adjusted_atlases(linear, poly, ids, save_to_db=save_to_db) write_notebooks(ids, atlases, repo_dir, use_poly_model) + get_msms_hits_for_all_notebooks(ids, atlases, cpus, use_poly_model) targeted_output.copy_outputs_to_google_drive(ids) targeted_output.archive_outputs(ids) logger.info("RT correction notebook complete.
Switch to Targeted notebook to continue.") +def get_msms_hits_for_all_notebooks(ids, atlases, cpus, use_poly_model): + """ + inputs: + ids: an AnalysisIds object matching the one used in the main notebook + atlases: list of atlas names to consider generating hits for + cpus: max number of cpus to use + use_poly_model: If True, use the polynomial model, else use linear model + Both types of models are always generated, this only determines which ones + are pre-populated into the generated notebooks + Calls MetatlasDataset().hits, which will create a hits cache file + """ + for atlas_name in atlases: + if (use_poly_model and "linear" in atlas_name) or (not use_poly_model and "polynomial" in atlas_name): + continue + polarity = "positive" if "_POS_" in atlas_name else "negative" + output_type = "FinalEMA-HILIC" if "EMA_Unlab" in atlas_name else "ISTDsEtc" + current_ids = mads.AnalysisIdentifiers( + source_atlas=atlas_name, + experiment=ids.experiment, + output_type=output_type, + polarity=polarity, + analysis_number=ids.analysis_number, + project_directory=ids.project_directory, + google_folder=ids.google_folder, + ) + _ = mads.MetatlasDataset(ids=current_ids, max_cpus=cpus).hits + + def get_groups(ids): """ Create all experiment groups if they don't already exist and return the subset matching include_list diff --git a/papermill/slurm_template.sh b/papermill/slurm_template.sh index d42cf02c..d800daba 100755 --- a/papermill/slurm_template.sh +++ b/papermill/slurm_template.sh @@ -4,7 +4,7 @@ #SBATCH --account=gtrnd #SBATCH --qos=genepool #SBATCH --mail-type=ALL -#SBATCH -t 02:00:00 +#SBATCH -t 12:00:00 #OpenMP settings: export OMP_NUM_THREADS=1 diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py index 01ad7e9d..61f9ca57 100644 --- a/tests/system/test_rt_predict.py +++ b/tests/system/test_rt_predict.py @@ -184,5 +184,5 @@ def test_rt_predict_by_line01(tmp_path): /out/Remove-done.ipynb """ utils.exec_docker(image, command, tmp_path) - assert utils.num_files_in(tmp_path) == 9 + assert utils.num_files_in(tmp_path) == 45 utils.assert_files_match(expected) diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py index 7ae2dc72..50a5a6df 100644 --- a/tests/system/test_targeted.py +++ b/tests/system/test_targeted.py @@ -22,7 +22,9 @@ def test_targeted_by_line01_with_remove(tmp_path): "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", ] ) - expected[str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS/POS_data_sheets/POS_rt_peak.tab")] = "\n".join( + expected[ + str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS/POS_data_sheets/POS_rt_peak.tab") + ] = "\n".join( [ f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", @@ -63,5 +65,5 @@ def test_targeted_by_line01_with_remove(tmp_path): /out/Remove-done.ipynb """ utils.exec_docker(image, command, tmp_path) - assert utils.num_files_in(tmp_path) == 39 + assert utils.num_files_in(tmp_path) == 43 utils.assert_files_match(expected) diff --git a/tests/unit/test_metatlas_dataset.py b/tests/unit/test_metatlas_dataset.py index 
58b0b8d4..189730e3 100644 --- a/tests/unit/test_metatlas_dataset.py +++ b/tests/unit/test_metatlas_dataset.py @@ -704,3 +704,40 @@ def test_has_selection01(): assert not mads._has_selection("") assert not mads._has_selection("no selection") assert not mads._has_selection("NO Selection") + + +def test_cache_dir(metatlas_dataset): + assert os.path.isdir(metatlas_dataset.ids.cache_dir) + + +def test_cache01(metatlas_dataset): + data = "foobar" + metadata = {"_variable_name": "test_var"} + metatlas_dataset._save_to_cache(data, metadata) + assert metatlas_dataset._query_cache(metadata) == data + + +def test_cache02(metatlas_dataset): + metadata = {"_variable_name": "test_var", "foo": "bar"} + metatlas_dataset._save_to_cache("", metadata) + metadata["new_key"] = "" + assert metatlas_dataset._query_cache(metadata) is None + del metadata["new_key"] + metadata["foo"] = "zoop" + assert metatlas_dataset._query_cache(metadata) is None + del metadata["foo"] + assert metatlas_dataset._query_cache(metadata) is None + + +def test_save_to_cache(metatlas_dataset): + with pytest.raises(AssertionError): + metatlas_dataset._save_to_cache("", {}) + + +def test_query_cache01(metatlas_dataset): + with pytest.raises(AssertionError): + metatlas_dataset._query_cache({}) + + +def test_query_cache02(metatlas_dataset): + assert metatlas_dataset._query_cache({"_variable_name": "foobar"}) is None From 24b97034c17c76a06dbc965952105d75b64067b9 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 13 Sep 2021 19:57:17 -0700 Subject: [PATCH 148/177] Include polarity in tar.gz output file --- metatlas/io/targeted_output.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 01ba0c70..c079c7cc 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -291,7 +291,8 @@ def archive_outputs(ids): ids: an AnalysisIds object """ logger.info("Generating archive of output files.") - output_file = f"{ids.short_experiment_analysis}.tar.gz" + suffix = "" if ids.output_type == 'data_QC' else f"-{ids.short_polarity}" + output_file = f"{ids.short_experiment_analysis}{suffix}.tar.gz" output_path = os.path.join(ids.project_directory, ids.experiment, output_file) with tarfile.open(output_path, "w:gz") as tar: tar.add(ids.output_dir, arcname=os.path.basename(ids.output_dir)) From d74134405ca73124b659e1a148f8c49231a35516 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 13 Sep 2021 21:15:11 -0700 Subject: [PATCH 149/177] add short polarity to rclone dest path --- metatlas/io/targeted_output.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index c079c7cc..9cf34d0c 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -315,6 +315,9 @@ def copy_outputs_to_google_drive(ids): if drive is None: logger.warning("RClone config file missing JGI_Metabolomics_Projects -- %s.", fail_suffix) return - sub_folder = os.path.join("Analysis_uploads", ids.experiment, ids.analysis, ids.output_type) + folders = [ids.experiment, ids.analysis, ids.output_type] + if ids.output_type != 'data_QC': + folders.append(ids.short_polarity) + sub_folder = os.path.join("Analysis_uploads", *folders) rci.copy_to_drive(ids.output_dir, drive, sub_folder) logger.info("Done copying output files to Google Drive") From 43948cc80f30d1ae5195853a363845f817380d88 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 14 Sep 2021 10:05:49 -0700 Subject: [PATCH 
150/177] show Google Drive url after upload --- metatlas/io/rclone.py | 45 +++++++++++++++++++++++++++++++++ metatlas/io/targeted_output.py | 16 ++++++++---- metatlas/plots/chromplotplus.py | 2 +- metatlas/plots/dill2plots.py | 5 ++-- 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/metatlas/io/rclone.py b/metatlas/io/rclone.py index 639c0f8d..bc85564e 100644 --- a/metatlas/io/rclone.py +++ b/metatlas/io/rclone.py @@ -1,6 +1,7 @@ """ copy files to Google Drive using rclone """ import configparser +import json import logging import subprocess @@ -56,3 +57,47 @@ def copy_to_drive(self, source, drive, dest_path=None): raise err except FileNotFoundError: logger.info("rclone not found. Skipping transfer to Google Drive") + + def parse_path(self, path_string): + """ + Inputs: + path_string: a string containing drive_name a colon and one or more folders like: + 'my_drive:folder1/folder2' + returns a tuple of the drive_name, folder_list + """ + drive = path_string.split(':')[0] + remainder = ':'.join(path_string.split(':')[1:]) + return drive, remainder.split('/') + + def get_id_for_path(self, path_string): + """ + Inputs: + path_string: a string containing drive_name a colon and one or more folders like: + 'my_drive:folder1/folder2' + returns an ID string which can be used in a Google Drive URL + """ + drive, folders = self.parse_path(path_string) + assert isinstance(folders, list) + assert isinstance(folders[:-1], list) + all_but_last = f"{drive}:{'/'.join(folders[:-1])}" + command_list = [self.rclone_path, "lsjson", "--dirs-only", all_but_last] + try: + result = subprocess.check_output(command_list, text=True) + except subprocess.CalledProcessError as err: + logger.exception(err) + raise err + returned_folders = json.loads(result) + for folder in returned_folders: + if folder['Name'] == folders[-1]: + return folder['ID'] + raise FileNotFoundError(f"Could not find a file or folder at {path_string}") + + def path_to_url(self, path_string): + """ + Inputs: + path_string: a string containing drive_name a colon and one or more folders like: + 'my_drive:folder1/folder2' + returns an URL for opening the object at path_string + """ + drive_id = self.get_id_for_path(path_string) + return f"https://drive.google.com/drive/folders/{drive_id}" diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 9cf34d0c..82c0e0be 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -10,6 +10,8 @@ import numpy as np import pandas as pd +from IPython.core.display import display, HTML + from metatlas.io import rclone from metatlas.io import write_utils from metatlas.plots import dill2plots as dp @@ -98,7 +100,8 @@ def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwr share_y: use a common y-axis scaling overwrite: if False raise error if file already exists """ - # logging and overwrite checks done within dp.make_chromatograms + # overwrite checks done within dp.make_chromatograms + logger.info('Exporting chromatograms to %s', metatlas_dataset.ids.output_dir) dp.make_chromatograms( input_dataset=metatlas_dataset, include_lcmsruns=[], @@ -116,8 +119,9 @@ def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwr def write_identification_figure(metatlas_dataset, overwrite=False): - """Save identificatoin figure.
Will not overwrite existing file unless overwrite is True""" - # logging and overwrite checks done within dp.make_identification_figure_v2 + """Save identification figure. Will not overwrite existing file unless overwrite is True""" + # overwrite checks done within dp.make_identification_figure_v2 + logger.info('Exporting identification figures to %s', metatlas_dataset.ids.output_dir) dp.make_identification_figure_v2( input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, @@ -318,6 +322,8 @@ def copy_outputs_to_google_drive(ids): folders = [ids.experiment, ids.analysis, ids.output_type] if ids.output_type != 'data_QC': folders.append(ids.short_polarity) - sub_folder = os.path.join("Analysis_uploads", *folders) - rci.copy_to_drive(ids.output_dir, drive, sub_folder) + sub_folders_string = os.path.join("Analysis_uploads", *folders) + rci.copy_to_drive(ids.output_dir, drive, sub_folders_string) logger.info("Done copying output files to Google Drive") + path_string = f"{drive}:{sub_folders_string}" + display(HTML(f'Data is now on Google Drive at {path_string}')) diff --git a/metatlas/plots/chromplotplus.py b/metatlas/plots/chromplotplus.py index 927991ea..95230d31 100644 --- a/metatlas/plots/chromplotplus.py +++ b/metatlas/plots/chromplotplus.py @@ -233,7 +233,7 @@ def __make_figure(self): plt.rcParams['text.usetex'] = False pdf.savefig(self.fig) plt.close() - logger.info("Exported chromatogram to %s.", self.file_name) + logger.debug("Exported chromatogram to %s.", self.file_name) @staticmethod def __yield_label(): diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 9ff2a2b1..3ae3f30f 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -1578,6 +1578,7 @@ def plot_errorbar_plots(df,output_loc='', use_shortnames=True, ylabel=""): def make_boxplot_plots(df, output_loc='', use_shortnames=True, ylabel="", overwrite=True, max_cpus=1): output_loc = os.path.expandvars(output_loc) + logger.info('Exporting box plots of %s to %s.', ylabel, output_loc) disable_interactive_plots() args = [(compound, df, output_loc, use_shortnames, ylabel, overwrite) for compound in df.index] parallel.parallel_process(make_boxplot, args, max_cpus, unit='plot') @@ -1604,7 +1605,7 @@ def make_boxplot(compound, df, output_loc, use_shortnames, ylabel, overwrite): write_utils.check_existing_file(fig_path, overwrite) f.savefig(fig_path) plt.close(f) - logger.info('Exported box plot of %s for %s at %s.', ylabel, compound, fig_path) + logger.debug('Exported box plot of %s for %s at %s.', ylabel, compound, fig_path) def frag_refs_to_json(json_dir = '/project/projectdirs/metatlas/projects/sharepoint/', name = 'frag_refs', save = True): @@ -2357,7 +2358,7 @@ def no_axis_plot(i): write_utils.check_existing_file(fig_path, overwrite) plt.savefig(fig_path) plt.close() - logger.info('Exported identification figures for %s to %s.', compound_names[compound_idx], fig_path) + logger.debug('Exported identification figures for %s to %s.', compound_names[compound_idx], fig_path) match_path = os.path.join(output_loc, 'MatchingMZs.tab') write_utils.export_dataframe(match, match_path, 'matching MZs', overwrite, sep='\t') From e50f661e56ffa76ea274dd22cfcd7faf13d41c80 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 14 Sep 2021 12:51:21 -0700 Subject: [PATCH 151/177] fix msms_quality truncation in Final Ident.
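For context on this one-line fix: make_stats_table() takes the numeric prefix of ms2_notes
(e.g. "-1,bad match to ref") as msms_quality, and int() rejects fractional string prefixes,
so a score such as 0.5 fell into the ValueError branch and was recorded as ''. A small
demonstration of the difference; the "0.5,partial match" note is a hypothetical example:

```python
# int() raises ValueError on fractional prefixes like "0.5" (it does not
# truncate strings), so the except branch recorded '' instead of a score;
# float() accepts both integer and fractional prefixes.
for note in ["-1,bad match to ref", "0.5,partial match"]:  # second note is hypothetical
    prefix = note.split(",")[0]
    try:
        old_value = int(prefix)
    except ValueError:
        old_value = ""
    print(f"{note!r}: int -> {old_value!r}, float -> {float(prefix)!r}")
```

With float() the fractional scores survive into total_score and the msi_level
classification shown in patch 141.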
--- metatlas/tools/fastanalysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py index c32fd2d7..92774f01 100644 --- a/metatlas/tools/fastanalysis.py +++ b/metatlas/tools/fastanalysis.py @@ -249,7 +249,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'ms1_notes'] = cid.ms1_notes final_df.loc[compound_idx, 'ms2_notes'] = cid.ms2_notes try: - final_df.loc[compound_idx, 'msms_quality'] = int(final_df.loc[compound_idx, 'ms2_notes'].split(',')[0]) + final_df.loc[compound_idx, 'msms_quality'] = float(final_df.loc[compound_idx, 'ms2_notes'].split(',')[0]) except ValueError: final_df.loc[compound_idx, 'msms_quality'] = '' quality_scores = [final_df.loc[compound_idx, x] for x in ['msms_quality', 'mz_quality', 'rt_quality']] From 3b039ab65beaf3b676f4d52ed09b0f2d5c09c71c Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 14 Sep 2021 14:44:05 -0700 Subject: [PATCH 152/177] Add docs on testing rclone configuration --- docs/Targeted_Analysis.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/Targeted_Analysis.md b/docs/Targeted_Analysis.md index 1350f867..8444b51a 100644 --- a/docs/Targeted_Analysis.md +++ b/docs/Targeted_Analysis.md @@ -20,6 +20,23 @@ rclone config create metabolomics drive root_folder_id 0B-ZDcHbPi-aqZzE5V3hOZFc0 scp $(rclone config file | tail -1) dtn01.nersc.gov:~/.config/rclone/rclone.conf ``` +Now you can verify that rclone is working correctly by running this on Cori: +``` +/global/cfs/cdirs/m342/USA/shared-repos/rclone/bin/rclone lsd metabolomics:Analysis_uploads +``` + +which should yield a listing of metabolomics experiment names similar to: + +``` + -1 2021-08-30 10:01:06 -1 20210323_JGI-AK_SS_504264_GEBA_Pantoea-final_QE-HF_HILICZ_USHXG01602 + -1 2021-08-30 12:32:39 -1 20210518_JGI-AK_IG-SS_503256_BETO_Pceleri_QE-HF_HILICZ_USHXG01602 + -1 2021-09-13 16:39:15 -1 20210721_JGI-AK_JB_504782_PseudoOphi_final_QE-139_HILICZ_USHXG01490 + -1 2021-09-13 17:40:55 -1 20210723_JGI-AK_DB-TM_506963_LemCreek_final_QE-HF_HILICZ_USHXG01494 + -1 2021-09-13 16:39:15 -1 20210728_JGI-AK_MD_507130_Bioscales_pilot2_QE-139_HILICZ_USHXG01490 + -1 2021-09-10 16:05:18 -1 20210804_JGI-AK_PA-CT_507784_Frtlzr_Set1_QE-139_HILICZ_USHXG01490 + -1 2021-09-13 16:34:45 -1 20210819_JGI-AK_MK_506588_SoilWaterRep_final_QE-139_HILICZ_USHXG01490 +``` + #### For Windows 1. Download and unzip [rclone-current-windows-amd64.zip](https://downloads.rclone.org/rclone-current-windows-amd64.zip). 
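The remote verified above can also be exercised through the RClone wrapper added in
patch 150. A hedged usage sketch, assuming the NERSC rclone binary from the shell example
and an already-configured 'metabolomics' remote:

```python
# Usage sketch of metatlas.io.rclone.RClone (patch 150); the binary path and
# remote name are assumptions carried over from the docs above.
from metatlas.io.rclone import RClone

rci = RClone("/global/cfs/cdirs/m342/USA/shared-repos/rclone/bin/rclone")
print(rci.config_file())  # path to rclone.conf, or None when rclone is absent
# resolve the folder to its Drive ID and print a shareable URL
print(rci.path_to_url("metabolomics:Analysis_uploads"))
```

path_to_url() is the lookup behind patch 150's "show Google Drive url after upload"
behavior.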
From df6902c32fa9fd5ad2c087bc00e37cc692406300 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 14 Sep 2021 21:39:02 -0700 Subject: [PATCH 153/177] store relative file names to pickle files --- metatlas/datastructures/metatlas_dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index b8b795a0..6b0bb334 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -555,13 +555,14 @@ def _save_to_cache(self, data: Any, metadata: dict) -> None: metadata = metadata.copy() name = metadata["_variable_name"] base_name = f"{name}_{uuid.uuid4()}" - metadata["_pickle_file_name"] = os.path.join(self.ids.cache_dir, f"{base_name}.pkl") + metadata["_pickle_file_name"] = f"{base_name}.pkl" # relative to metadata file metadata_file_name = os.path.join(self.ids.cache_dir, f"{base_name}.metadata") - with open(metadata["_pickle_file_name"], "wb") as pickle_fh: + pickle_path = os.path.join(self.ids.cache_dir, metadata["_pickle_file_name"]) + with open(pickle_path, "wb") as pickle_fh: pickle.dump(data, pickle_fh) with open(metadata_file_name, "wb") as metadata_fh: pickle.dump(metadata, metadata_fh) - logger.info("Caching %s in %s.", name, metadata["_pickle_file_name"]) + logger.info("Caching %s in %s.", name, pickle_path) @@ -569,7 +570,7 @@ def _query_cache(self, required_metadata: dict) -> Optional[Any]: for metadata_file in glob.glob(os.path.join(self.ids.cache_dir, f"{name}_*.metadata")): with open(metadata_file, "rb") as metadata_fh: potential_metadata = pickle.load(metadata_fh) - pickle_file_name = potential_metadata["_pickle_file_name"] + pickle_file_name = os.path.join(self.ids.cache_dir, potential_metadata["_pickle_file_name"]) # require_metadata does not have a '_pickle_file_name' key, so remove before equality test del potential_metadata["_pickle_file_name"] From d439b4b27df41b0135aa922259337ed1f0fead7a Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 16 Sep 2021 09:52:44 -0700 Subject: [PATCH 154/177] Allow source_atlas to be owned by someone else --- metatlas/datastructures/metatlas_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 6b0bb334..0c7e7d04 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -636,7 +636,7 @@ def _get_atlas(self) -> None: def _clone_source_atlas(self) -> metob.Atlas: logger.info("Retrieving source atlas: %s", self.ids.source_atlas) - source_atlas = get_atlas(cast(AtlasName, self.ids.source_atlas), self.ids.username) + source_atlas = get_atlas(cast(AtlasName, self.ids.source_atlas), "*") source_atlas_df = ma_data.make_atlas_df(source_atlas) logger.info("Cloning atlas %s", self.ids.source_atlas) return dp.make_atlas_from_spreadsheet( From 15edd8120849cf2805379fa377e405e8f9a930a4 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 16 Sep 2021 09:55:53 -0700 Subject: [PATCH 155/177] format changes only - blacken --- metatlas/io/rclone.py | 10 +++++----- metatlas/io/targeted_output.py | 12 +++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/metatlas/io/rclone.py index bc85564e..4e7a53d0 100644 ---
a/metatlas/io/rclone.py +++ b/metatlas/io/rclone.py @@ -65,9 +65,9 @@ def parse_path(self, path_string): 'my_drive:folder1/folder2' returns a tuple of the drive_name, folder_list """ - drive = path_string.split(':')[0] - remainder = ':'.join(path_string.split(':')[1:]) - return drive, remainder.split('/') + drive = path_string.split(":")[0] + remainder = ":".join(path_string.split(":")[1:]) + return drive, remainder.split("/") def get_id_for_path(self, path_string): """ @@ -88,8 +88,8 @@ def get_id_for_path(self, path_string): raise err returned_folders = json.loads(result) for folder in returned_folders: - if folder['Name'] == folders[-1]: - return folder['ID'] + if folder["Name"] == folders[-1]: + return folder["ID"] raise FileNotFoundError(f"Could not find a file or folder at {path_string}") def path_to_url(self, path_string): diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index 82c0e0be..e518e244 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -101,7 +101,7 @@ def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwr overwrite: if False raise error if file already exists """ # overwrite checks done within dp.make_chromatograms - logger.info('Exporting chromatograms to %s', metatlas_dataset.ids.output_dir) + logger.info("Exporting chromatograms to %s", metatlas_dataset.ids.output_dir) dp.make_chromatograms( input_dataset=metatlas_dataset, include_lcmsruns=[], @@ -121,7 +121,7 @@ def write_chromatograms(metatlas_dataset, group_by="index", share_y=True, overwr def write_identification_figure(metatlas_dataset, overwrite=False): """Save identification figure. Will not overwrite existing file unless overwrite is True""" # overwrite checks done within dp.make_identification_figure_v2 - logger.info('Exporting identification figures to %s', metatlas_dataset.ids.output_dir) + logger.info("Exporting identification figures to %s", metatlas_dataset.ids.output_dir) dp.make_identification_figure_v2( input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, @@ -295,7 +295,7 @@ def archive_outputs(ids): ids: an AnalysisIds object """ logger.info("Generating archive of output files.") - suffix = "" if ids.output_type == 'data_QC' else f"-{ids.short_polarity}" + suffix = "" if ids.output_type == "data_QC" else f"-{ids.short_polarity}" output_file = f"{ids.short_experiment_analysis}{suffix}.tar.gz" output_path = os.path.join(ids.project_directory, ids.experiment, output_file) with tarfile.open(output_path, "w:gz") as tar: @@ -320,10 +320,12 @@ def copy_outputs_to_google_drive(ids): logger.warning("RClone config file missing JGI_Metabolomics_Projects -- %s.", fail_suffix) return folders = [ids.experiment, ids.analysis, ids.output_type] - if ids.output_type != 'data_QC': + if ids.output_type != "data_QC": folders.append(ids.short_polarity) sub_folders_string = os.path.join("Analysis_uploads", *folders) rci.copy_to_drive(ids.output_dir, drive, sub_folders_string) logger.info("Done copying output files to Google Drive") path_string = f"{drive}:{sub_folders_string}" - display(HTML(f'Data is now on Google Drive at {path_string}')) + display( + HTML(f'Data is now on Google Drive at {path_string}') + ) From 94134538b9a391f4ceace5d27584901f232822dc Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 16 Sep 2021 09:58:34 -0700 Subject: [PATCH 156/177] Allow source_atlas to be owned by someone else --- metatlas/datastructures/metatlas_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 0c7e7d04..3710797c 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -195,7 +195,7 @@ def _valid_source_atlas(self, proposal: Proposal) -> Optional[AtlasName]: if proposal["value"] is not None: proposed_name = cast(AtlasName, proposal["value"]) try: - get_atlas(proposed_name, self.username) # raises error if not found or matches multiple + get_atlas(proposed_name, "*") # raises error if not found or matches multiple except ValueError as err: raise TraitError(str(err)) from err return proposed_name From 2f584e90034fa86b4c21b89b97f8d316130692d8 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Thu, 16 Sep 2021 17:44:53 -0700 Subject: [PATCH 157/177] update default image for local_jupyter.sh --- docker/local_jupyter.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/local_jupyter.sh b/docker/local_jupyter.sh index eda62d18..df632dfa 100755 --- a/docker/local_jupyter.sh +++ b/docker/local_jupyter.sh @@ -5,7 +5,7 @@ set -euf -o pipefail SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" REPO_DIR=$(dirname "$SCRIPT_DIR") OUT_DIR="${SCRIPT_DIR}/out" -IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.3.0' +IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.2' PORT=8888 while [[ "$#" -gt 0 ]]; do From 3db20620dd434f13ce55b1768bef31358842c7a9 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 17 Sep 2021 10:45:11 -0700 Subject: [PATCH 158/177] add type annotations to rclone --- metatlas/io/rclone.py | 38 ++++++++++++++++++------------------ tests/unit/test_rclone.py | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 18 deletions(-) create mode 100644 tests/unit/test_rclone.py diff --git a/metatlas/io/rclone.py b/metatlas/io/rclone.py index 4e7a53d0..de175db6 100644 --- a/metatlas/io/rclone.py +++ b/metatlas/io/rclone.py @@ -5,6 +5,7 @@ import logging import subprocess +from typing import List, Optional, Tuple logger = logging.getLogger(__name__) @@ -12,10 +13,10 @@ class RClone: """Access to Google Drive""" - def __init__(self, rclone_path): + def __init__(self, rclone_path: str) -> None: self.rclone_path = rclone_path - def config_file(self): + def config_file(self) -> Optional[str]: """Returns path to config file or None""" try: result = subprocess.check_output([self.rclone_path, "config", "file"], text=True) @@ -23,7 +24,7 @@ def config_file(self): return None return result.split("\n")[1] - def get_name_for_id(self, identifier): + def get_name_for_id(self, identifier: str) -> Optional[str]: """ Inputs: identifier: unique folder identifier from Google Drive URL @@ -42,7 +43,7 @@ def get_name_for_id(self, identifier): return name return None - def copy_to_drive(self, source, drive, dest_path=None): + def copy_to_drive(self, source: str, drive: str, dest_path: str = None) -> None: """ Inputs: source: file or directory to copy to drive @@ -58,25 +59,14 @@ def copy_to_drive(self, source, drive, dest_path=None): except FileNotFoundError: logger.info("rclone not found.
Skipping transfer to Google Drive") - def parse_path(self, path_string): - """ - Inputs: - path_string: a string containing drive_name a colon and one or more folders like: - 'my_drive:folder1/folder2' - returns a tuple of the drive_name, folder_list - """ - drive = path_string.split(":")[0] - remainder = ":".join(path_string.split(":")[1:]) - return drive, remainder.split("/") - - def get_id_for_path(self, path_string): + def get_id_for_path(self, path_string: str) -> str: """ Inputs: path_string: a string containing drive_name a colon and one or more folders like: 'my_drive:folder1/folder2' returns an ID string which can be used in a Google Drive URL """ - drive, folders = self.parse_path(path_string) + drive, folders = parse_path(path_string) assert isinstance(folders, list) assert isinstance(folders[:-1], list) all_but_last = f"{drive}:{'/'.join(folders[:-1])}" @@ -92,7 +82,7 @@ def get_id_for_path(self, path_string): return folder["ID"] raise FileNotFoundError(f"Could not find a file or folder at {path_string}") - def path_to_url(self, path_string): + def path_to_url(self, path_string: str) -> str: """ Inputs: path_string: a string containing drive_name a colon and one or more folders like: @@ -101,3 +91,15 @@ def path_to_url(self, path_string): """ drive_id = self.get_id_for_path(path_string) return f"https://drive.google.com/drive/folders/{drive_id}" + + +def parse_path(path_string: str) -> Tuple[str, List[str]]: + """ + Inputs: + path_string: a string containing drive_name a colon and one or more folders like: + 'my_drive:folder1/folder2' + returns a tuple of the drive_name, folder_list + """ + drive = path_string.split(":")[0] + remainder = ":".join(path_string.split(":")[1:]) + return drive, remainder.split("/") diff --git a/tests/unit/test_rclone.py b/tests/unit/test_rclone.py new file mode 100644 index 00000000..5274494a --- /dev/null +++ b/tests/unit/test_rclone.py @@ -0,0 +1,34 @@ +""" Tests of RClone """ +# pylint: disable=missing-function-docstring + +import os +import subprocess + +import pytest + +from metatlas.io import rclone + + +def has_rclone(): + return os.system("rclone") == 256 + + +def rclone_path(): + result = subprocess.run(["which", "rclone"], stdout=subprocess.PIPE, check=True) + return result.stdout.decode("utf-8").rstrip() + + +def test_config_file01(): + rci = rclone.RClone("/bin/foobarz") + assert rci.config_file() is None + + +@pytest.mark.skipif(not has_rclone(), reason="rclone not in PATH") +def test_config_file02(): + rci = rclone.RClone(rclone_path()) + config_path = os.path.join(os.environ["HOME"], ".config", "rclone", "rclone.conf") + assert rci.config_file() == config_path + + +def test_parse_path01(): + assert ("drive", ["foo", "bar"]) == rclone.parse_path("drive:foo/bar") From 51c2aaec9d0f3845d11abe11a2365489baac3d78 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 17 Sep 2021 10:47:20 -0700 Subject: [PATCH 159/177] add type casts to get_atlas calls --- metatlas/datastructures/metatlas_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 3710797c..349b922b 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -195,7 +195,7 @@ def _valid_source_atlas(self, proposal: Proposal) -> Optional[AtlasName]: if proposal["value"] is not None: proposed_name = cast(AtlasName, proposal["value"]) try: - get_atlas(proposed_name, "*") # raises error if not found or matches multiple 
+ get_atlas(proposed_name, cast(Username, "*")) # raises error if not found or matches multiple except ValueError as err: raise TraitError(str(err)) from err return proposed_name @@ -636,7 +636,7 @@ def _get_atlas(self) -> None: def _clone_source_atlas(self) -> metob.Atlas: logger.info("Retriving source atlas: %s", self.ids.source_atlas) - source_atlas = get_atlas(cast(AtlasName, self.ids.source_atlas), "*") + source_atlas = get_atlas(cast(AtlasName, self.ids.source_atlas), cast(Username, "*")) source_atlas_df = ma_data.make_atlas_df(source_atlas) logger.info("Cloning atlas %s", self.ids.source_atlas) return dp.make_atlas_from_spreadsheet( From e355d95b821122487255f6b0b815d29fce0fe158 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 17 Sep 2021 10:57:32 -0700 Subject: [PATCH 160/177] correct ms1_rt_peak in Final IDs --- metatlas/datastructures/metatlas_dataset.py | 5 +++++ metatlas/tools/fastanalysis.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 349b922b..3796b4fa 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -543,6 +543,7 @@ def __init__(self, **kwargs) -> None: super().__init__(**kwargs) logger.debug("Creating new MetatlasDataset instance...") self._hits_valid_for_rt_bounds = False # based only on RT min/max changes + self._data_valid_for_rt_bounds = False # based only on RT min/max changes if self.ids.source_atlas is not None: self._get_atlas() if self.save_metadata: @@ -832,6 +833,7 @@ def data(self) -> Tuple[MetatlasSample, ...]: """data getter, update ._data if necessary""" if self._data is None: self._build() + self._data_valid_for_rt_bounds = True return cast(Tuple[MetatlasSample, ...], self._data) @property @@ -982,6 +984,7 @@ def set_rt(self, compound_idx: int, which: str, time: float) -> None: metob.store(atlas_rt_ref) if which in ["rt_min", "rt_max"]: self._hits_valid_for_rt_bounds = False + self._data_valid_for_rt_bounds = False def set_note(self, compound_idx: int, which: str, value: str) -> None: """ @@ -1074,6 +1077,8 @@ def generate_all_outputs(self, msms_fragment_ions: bool = False, overwrite: bool """ if not self._hits_valid_for_rt_bounds: self._hits = None # force hits to be regenerated + if not self._data_valid_for_rt_bounds: + self._data = None # force data to be regenerated self.extra_time = 0.5 logger.info("extra_time set to 0.5 minutes for output generation.") logger.info("Removing InjBl from exclude_groups.") diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py index 92774f01..ab56747d 100644 --- a/metatlas/tools/fastanalysis.py +++ b/metatlas/tools/fastanalysis.py @@ -270,7 +270,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None, final_df.loc[compound_idx, 'max_intensity'] = intensities.loc[intensities['intensity'].idxmax()]['intensity'] max_intensity_file_id = int(intensities.loc[intensities['intensity'].idxmax()]['file_id']) final_df.loc[compound_idx, 'max_intensity_file'] = file_names[max_intensity_file_id] - final_df.loc[compound_idx, 'ms1_rt_peak'] = dataset[max_intensity_file_id][compound_idx]['identification'].rt_references[0].rt_peak + final_df.loc[compound_idx, 'ms1_rt_peak'] = dataset[max_intensity_file_id][compound_idx]['data']['ms1_summary']['rt_peak'] else: final_df.loc[compound_idx, 'max_intensity'] = "" final_df.loc[compound_idx, 'max_intensity_file'] = "" From 51992a0c2855a2d0f65e81b92d66a6a0c5c9bc8d Mon 
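An aside on the fastanalysis.py hunk in PATCH 160 above: the stats table previously reported ms1_rt_peak from the identification's RT reference, i.e. the retention time expected from the atlas, and now reports the retention time actually measured in the ms1_summary for the highest-intensity file. Schematically, with plain dicts and invented values standing in for the metatlas objects:

    # One dataset[file_idx][compound_idx] entry, schematically:
    entry = {
        "identification": {"rt_references": [{"rt_peak": 2.50}]},  # expected RT from the atlas
        "data": {"ms1_summary": {"rt_peak": 2.62}},                # RT measured in this file
    }
    # old value: entry["identification"]["rt_references"][0]["rt_peak"]  -> 2.50
    # new value: entry["data"]["ms1_summary"]["rt_peak"]                 -> 2.62
    assert entry["data"]["ms1_summary"]["rt_peak"] == 2.62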
Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 17 Sep 2021 11:37:16 -0700 Subject: [PATCH 161/177] update RT predict test for split output dirs --- tests/system/test_rt_predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py index 61f9ca57..d680f7fc 100644 --- a/tests/system/test_rt_predict.py +++ b/tests/system/test_rt_predict.py @@ -184,5 +184,5 @@ def test_rt_predict_by_line01(tmp_path): /out/Remove-done.ipynb """ utils.exec_docker(image, command, tmp_path) - assert utils.num_files_in(tmp_path) == 45 + assert utils.num_files_in(tmp_path) == 49 utils.assert_files_match(expected) From e203d11fb00d0788eb33f309452fe45eaaf5fa68 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 17 Sep 2021 11:44:18 -0700 Subject: [PATCH 162/177] Start of msms_refs validation --- docker/Dockerfile.msms_refs | 40 ++++ metatlas/tools/add_msms_ref.py | 388 ++++++++++++++++++++++++++++++++ noxfile.py | 1 + pyproject.toml | 5 + tests/unit/test_add_msms_ref.py | 50 ++++ 5 files changed, 484 insertions(+) create mode 100644 docker/Dockerfile.msms_refs create mode 100644 metatlas/tools/add_msms_ref.py create mode 100644 tests/unit/test_add_msms_ref.py diff --git a/docker/Dockerfile.msms_refs b/docker/Dockerfile.msms_refs new file mode 100644 index 00000000..af3bf6a6 --- /dev/null +++ b/docker/Dockerfile.msms_refs @@ -0,0 +1,40 @@ +FROM python:3.8-slim-bullseye + +# https://portal.nersc.gov/cfs/m2650/metatlas/test_data +# serves from /global/cfs/cdirs/m2650/www/metatlas/test_data +ARG BASE_DATA_URL=https://portal.nersc.gov/cfs/m2650/metatlas/test_data/ci01 +ARG REFS_DIR=/global/project/projectdirs/metatlas/projects/spectral_libraries + +ENV METATLAS_LOCAL=True + +EXPOSE 8888 + +RUN apt-get update && \ + apt-get install -y \ + libxrender1 \ + nodejs \ + npm && \ + rm -rf /var/lib/apt/lists/* + + +COPY requirements.txt /requirements.txt + +RUN pip install --quiet -r requirements.txt && \ + pip install qgrid && \ + jupyter nbextension enable --py --sys-prefix qgrid && \ + jupyter nbextension enable --py --sys-prefix widgetsnbextension && \ + jupyter labextension install @jupyter-widgets/jupyterlab-manager + + +RUN mkdir -p /io /src /work $REFS_DIR + +ADD $BASE_DATA_URL/msms_refs_v3.tab $REFS_DIR/ + +ADD $BASE_DATA_URL/meta_atlas.sqlite3 /work/root_workspace.db + +RUN mkdir -p /root/.local/share/jupyter/kernels/metatlas-targeted +COPY kernel.json /root/.local/share/jupyter/kernels/metatlas-targeted/kernel.json + +WORKDIR /work + +CMD ["/usr/local/bin/jupyter", "lab", "--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] diff --git a/metatlas/tools/add_msms_ref.py b/metatlas/tools/add_msms_ref.py new file mode 100644 index 00000000..025e6b97 --- /dev/null +++ b/metatlas/tools/add_msms_ref.py @@ -0,0 +1,388 @@ +""" For manipulating msms_refs files """ +import logging +import math +import uuid + +from typing import cast, Optional, List, TypedDict + +# os.chdir("/work") +# sys.path.insert(0, "/src") +# os.environ["METATLAS_LOCAL"] = "TRUE" + +import numpy as np +import pandas as pd +import traitlets + +from traitlets import TraitError, default, validate +from traitlets import Float, HasTraits, Instance, Int, TraitType, Unicode + +from pandas.api.types import CategoricalDtype +from rdkit import Chem + +# from metatlas.tools import environment +from metatlas.datastructures import metatlas_objects as metob + +# from metatlas.tools import notebook # noqa: E402 + +# notebook.setup("INFO") + +REFS_V3_FILE_NAME = 
"/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" + +POLARITIES = ["negative", "positive"] + +INSTRUMENT_TYPES = [ + "ESI-ITFT", + "ESI-ITTOF", + "ESI-QTOF", + "LC-ESI-IT", + "LC-ESI-ITFT", + "LC-ESI-ITTOF", + "LC-ESI-QFT", + "LC-ESI-QIT", + "LC-ESI-QQ", + "LC-ESI-QTOF", + "Orbitrap", +] + +FRAG_METHODS = [ + "CID", + "EBEQ", + "FT-ICR/FTMS", + "FT-ICR/Fourier transform ion cyclotron resonance", + "HCD", + "IT-FT/ion trap with FTMS", + "IT/ion trap", + "LOW-ENERGY CID", + "Q-TOF", + "QqIT", + "QqLIT", + "QqQ", + "cid", +] + +polarity_type = CategoricalDtype(categories=POLARITIES, ordered=True) +frag_method_type = CategoricalDtype(categories=FRAG_METHODS, ordered=False) +instrument_type_type = CategoricalDtype(categories=INSTRUMENT_TYPES, ordered=False) + + +def is_inchi(test_inchi: str) -> bool: + """True if input can be parsed as an inchi string""" + return Chem.inchi.MolFromInchi(test_inchi) is not None + + +def is_valid_inchi_pair(test_inchi: str, test_inchi_key: str) -> bool: + """True if if test_inchi has the inchi key test_inchi_key""" + if not is_inchi(test_inchi): + return False + return test_inchi_key == Chem.inchi.InchiToInchiKey(test_inchi) + + +def is_valid_inchi_smiles_pair(test_inchi: str, test_smiles: str) -> bool: + """ + True if test_inchi and test_smiles have the same structure. + Also True if test_smiles is None and test_inchi is a valid inchi + """ + if pd.isna(test_smiles): + return is_inchi(test_inchi) + mol_from_inchi = Chem.inchi.MolFromInchi(test_inchi) + mol_from_smiles = Chem.MolFromSmiles(test_smiles) + return are_equal(mol_from_inchi, mol_from_smiles) + + +def get_compound(inchi_key: str) -> Optional[metob.Compound]: + """ + Returns first compound from database matching inchi_key and with username pasteur + or None if not found + """ + try: + return metob.retrieve("Compounds", inchi_key=inchi_key, username="pasteur")[0] + except IndexError: + return None + + +def are_equal(molecule1: Chem.rdchem.Mol, molecule2: Chem.rdchem.Mol) -> bool: + """True if both molecules are substructures of each other""" + return molecule1.HasSubstructMatch(molecule2) and molecule2.HasSubstructMatch(molecule1) + + +def is_synonym(name: str, synonym_string: str) -> bool: + """ + Inputs: + name: string to check for within synonym_string + synonym_string: string with /// between names + Returns True if case insensitive match of name to full name in synonym_string + """ + return name.lower() in [x.lower() for x in synonym_string.split("///")] + + +class Proposal(TypedDict): + """for use with traitlets.validate""" + + owner: HasTraits + value: object + trait: TraitType + + +class Spectrum(HasTraits): + """List of intensities with list of corresponding MZ values""" + + intensities: List[float] = traitlets.List(trait=Float()) + mzs: List[float] = traitlets.List(trait=Float()) + + @validate("intensities") + def _valid_intensities(self, proposal: Proposal) -> List[float]: + """validate positive values, not empty, and same length as mzs list""" + value = cast(List[float], proposal["value"]) + if len(value) == 0: + raise TraitError("length of intensities must be greater than 0") + if len(value) != len(self.mzs): + raise TraitError("length of intensities and mzs must be equal") + if any(x <= 0 for x in value): + raise TraitError("intensities must be positive") + return value + + @validate("mzs") + def _valid_mzs(self, proposal: Proposal) -> List[float]: + """validate positive values, not empty, and same length as intensities list""" + value = cast(List[float], 
proposal["value"]) + if len(value) == 0: + raise TraitError("length of mzs must be greater than 0") + if len(value) != len(self.intensities): + raise TraitError("length of intensities and mzs must be equal") + if value != sorted(value): + raise TraitError("mzs values must be monotonically increasing") + if any(x <= 0 for x in value): + raise TraitError("mzs values must be positive") + return value + + +class MsmsRef(HasTraits): + # pylint: disable=too-few-public-methods,too-many-instance-attributes + """one line from msms_refs file""" + database: str = Unicode() + id: str = Unicode(default=uuid.uuid4()) + name: str = Unicode() + spectrum: Spectrum = Instance(klass=Spectrum) + decimal: np.ushort = Int(default_value=4) + precursor_mz: np.float64 = Float() + polarity: str = Unicode() + adduct: str = Unicode() + fragmentation_method: str = Unicode() + collision_energy: str = Unicode() + instrument: str = Unicode() + instrument_type: str = Unicode() + formula: str = Unicode() + exact_mass: np.float64 = Float() + inchi_key: str = Unicode() + inchi: str = Unicode() + smiles: str = Unicode() + + # pylint: disable=no-self-use,too-many-arguments + def __init__( + self, + name: str, + spectrum: Spectrum, + precursor_mz: np.float64, + polarity: str, + adduct: str, + fragmentation_method: str, + collision_energy: str, + instrument: str, + instrument_type: str, + formula: str, + exact_mass: np.float64, + inchi_key: str, + **kwargs, + ) -> None: + """required fields are inputs""" + with self.hold_trait_notifications(): + super().__init__(**kwargs) + self.name = name + self.spectrum = spectrum + self.precursor_mz = precursor_mz + self.polarity = polarity + self.adduct = adduct + self.fragmentation_method = fragmentation_method + self.collision_energy = collision_energy + self.instrument = instrument + self.instrument_type = instrument_type + self.formula = formula + self.exact_mass = exact_mass + self.inchi_key = inchi_key + if self.is_bad(): + raise ValueError("MSMS Ref does not pass validation") + + def _valid_enum(self, proposal, name, values_list): + """generic validation for enumerated type""" + if proposal["value"] not in values_list: + raise TraitError(f"{name} must be one of {', '.join(values_list)}") + return proposal["value"] + + def _valid_not_len_zero(self, proposal, name): + """generic validation for length greater than 0""" + if len(proposal["value"]) == 0: + raise TraitError(f"{name} cannot have a length of zero") + return proposal["value"] + + def _valid_positive(self, proposal, name): + """generic validation for positive value""" + if proposal["value"] < 0: + raise TraitError(f"{name} must be positive") + return proposal["value"] + + @validate("database") + def _valid_database(self, proposal): + """valid if database string has positive length""" + return self._valid_not_len_zero(proposal, "database") + + @validate("id") + def _valid_id(self, proposal): + """valid if id string has positive length""" + return self._valid_not_len_zero(proposal, "id") + + @validate("name") + def _valid_name(self, proposal): + """valid if name string has positive length""" + return self._valid_not_len_zero(proposal, "name") + + @validate("decimal") + def _valid_decimal(self, proposal): + """valid if decimal is positive""" + return self._valid_positive(proposal, "decimal") + + @validate("precursor_mz") + def _valid_precursor_mz(self, proposal): + """valid if precursor_mz is positive""" + return self._valid_positive(proposal, "precursor_mz") + + @validate("polarity") + def _valid_polarity(self, proposal): + 
"""valid if polarity is in POLARITIES""" + return self._valid_enum(proposal, "polarity", POLARITIES) + + @validate("adduct") + def _valid_adduct(self, proposal): + """valid if adduct string has positive length""" + return self._valid_not_len_zero(proposal, "adduct") + + @validate("fragmentation_method") + def _valid_fragmentation_method(self, proposal): + """valid if fragmentation_method in FRAG_METHODS""" + return self._valid_enum(proposal, "fragmentation_method", FRAG_METHODS) + + @validate("collision_energy") + def _valid_collision_energy(self, proposal): + """valid if collision_energy has positive length""" + return self._valid_not_len_zero(proposal, "collision_energy") + + @validate("instrument") + def _valid_instrument(self, proposal): + """valid if instrument has positive length""" + return self._valid_not_len_zero(proposal, "instrument") + + @validate("instrument_type") + def _valid_instrument_type(self, proposal): + """valid if instrument_type is in INSTRUMENT_TYPES""" + return self._valid_enum(proposal, "instrument_type", INSTRUMENT_TYPES) + + @validate("formula") + def _valid_formula(self, proposal): + """valid if formula has positive length""" + return self._valid_not_len_zero(proposal, "formula") + + @validate("exact_mass") + def _valid_exact_mass(self, proposal): + """valid if exact_mass is positive""" + return self._valid_positive(proposal, "exact_mass") + + @validate("inchi_key") + def _valid_inchi_key(self, proposal): + """valid if inchi_key has positive length""" + return self._valid_not_len_zero(proposal, "inchi_key") + + @validate("inchi") + def _valid_inchi(self, proposal): + """valid if inchi matches with inchi_key""" + if not is_inchi(proposal["value"]): + raise TraitError("not valid inchi") + if not is_valid_inchi_pair(proposal["value"], self.inchi_key): + raise TraitError("inchi and inchi_key do not represent the same molecule") + return proposal["value"] + + @validate("smiles") + def _valid_smiles(self, proposal): + """valid if smiles matches with inchi""" + if not is_valid_inchi_smiles_pair(self.inchi, proposal["value"]): + raise TraitError("inchi and smiles do not represent the same molecule") + return proposal["value"] + + @default("smiles") + def _get_default_smiles(self): + """generate smiles from inchi""" + return Chem.MolToSmiles(Chem.inchi.MolFromInchi(self.inchi)) + + def is_bad(self): + """ + If returns True, then the inputs are bad, but if returns False do not assume the inputs are good + returning False only means that there is no evidence the inputs are bad. Conclusively saying + the inputs are good for unusual chemicals that are not in databases is hard. 
+ """ + # pylint: disable=too-many-return-statements + if self.fragmentation_method not in FRAG_METHODS: + logging.error('Invalid fragmentation method "%s" for %s.', self.fragmentation_method, self.name) + return True + if not is_valid_inchi_pair(self.inchi, self.inchi_key): + logging.error("Invalid inchi/inchi_key pair for %s.", self.name) + return True + results = metob.retrieve("compounds", username="*", inchi_key=self.inchi_key) + if len(results) == 0: + return False + ref_compound = results[0] + if self.formula != ref_compound.formula: + logging.error( + 'Formula "%s" for %s does not match value "%s" in database.', + self.formula, + self.name, + ref_compound.formula, + ) + return True + if not math.isclose(self.exact_mass, ref_compound.mono_isotopic_molecular_weight, rel_tol=1e-9): + logging.error( + "Exact mass %s for %s does not match value %s in database.", + self.exact_mass, + self.name, + ref_compound.mono_isotopic_molecular_weight, + ) + return True + if not is_synonym(self.name, ref_compound.synonyms): + logging.error("Inchi_key %s does not contain name %s in database.", self.inchi_key, self.name) + return True + return False + + +def read_msms_refs(file_name: str, sep="\t", **kwargs) -> pd.DataFrame: + """Read in msms refs from file with correct types""" + return pd.read_csv( + file_name, + sep=sep, + dtype={ + "database": "string", + "id": "string", + "name": "string", + "spectrum": "string", + "decimal": np.ushort, + "precursor_mz": np.float64, + "polarity": polarity_type, + "adduct": "string", + "fragmentation_method": frag_method_type, + "collision_energy": "string", + "instrument": "string", + "instrument_type": instrument_type_type, + "formula": "string", + "exact_mass": np.float64, + "inchi_key": "string", + "inchi": "string", + "smiles": "string", + }, + **kwargs, + ) diff --git a/noxfile.py b/noxfile.py index 235530d3..a4985478 100644 --- a/noxfile.py +++ b/noxfile.py @@ -25,6 +25,7 @@ "metatlas/io/rclone.py", "metatlas/io/write_utils.py", "metatlas/datastructures/metatlas_dataset.py", + "metatlas/tools/add_msms_ref.py", "metatlas/tools/environment.py", "metatlas/tools/logging.py", "metatlas/tools/notebook.py", diff --git a/pyproject.toml b/pyproject.toml index c2949d43..3296aa8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,11 @@ module = [ ] ignore_missing_imports = true +[tool.pylint.MASTER] +extension-pkg-allow-list = [ + "rdkit" +] + [tool.pylint.messages_control] # first two are for black compatibility # duplicate-code cannot be disabled on per file/block/line diff --git a/tests/unit/test_add_msms_ref.py b/tests/unit/test_add_msms_ref.py new file mode 100644 index 00000000..d18e05a3 --- /dev/null +++ b/tests/unit/test_add_msms_ref.py @@ -0,0 +1,50 @@ +""" unit tests for add_msms_refs module """ +# pylint: disable=missing-function-docstring,line-too-long + +import pytest +import traitlets + +from metatlas.tools import add_msms_ref + + +def tests_msms_ref01(mocker, compound): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + add_msms_ref.MsmsRef( + database="my_db", + name="2'-deoxyadenosine", + spectrum=add_msms_ref.Spectrum(intensities=[1, 1.4, 2], mzs=[100, 101, 555]), + decimal=4, + precursor_mz=251.101839276, + polarity="negative", + adduct="[M-H]+", + fragmentation_method="cid", + collision_energy="60eV", + instrument="ThermoTOF-3000", + instrument_type="LC-ESI-QTOF", + formula="C10H13N5O3", + exact_mass=251.101839276, + inchi_key="OLXZPDWKRNYJJZ-RRKCRQDMSA-N", + 
inchi="InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ) + + +def tests_msms_ref02(mocker, compound): + mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) + with pytest.raises(traitlets.TraitError): + add_msms_ref.MsmsRef( + database="my_db", + name="2'-deoxyadenosine", + spectrum=add_msms_ref.Spectrum(intensities=[1, 1.4, 2], mzs=[100, 101, 555]), + decimal=4, + precursor_mz=251.101839276, + polarity="negative", + adduct="[M-H]+", + fragmentation_method="cid", + collision_energy="60eV", + instrument="ThermoTOF-3000", + instrument_type="LC-ESI-QTOF", + formula="C10H13N5O3", + exact_mass=251.101839276, + inchi_key="xxx", + inchi="InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + ) From 8a16200764a74d6d47f604ed8bffaf1537b70cfb Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 20 Sep 2021 08:36:43 -0700 Subject: [PATCH 163/177] clean up and fix logging in add_msms_ref --- metatlas/tools/add_msms_ref.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/metatlas/tools/add_msms_ref.py b/metatlas/tools/add_msms_ref.py index 025e6b97..6cc837b3 100644 --- a/metatlas/tools/add_msms_ref.py +++ b/metatlas/tools/add_msms_ref.py @@ -1,14 +1,11 @@ """ For minipulating msms_refs files """ +import json import logging import math import uuid from typing import cast, Optional, List, TypedDict -# os.chdir("/work") -# sys.path.insert(0, "/src") -# os.environ["METATLAS_LOCAL"] = "TRUE" - import numpy as np import pandas as pd import traitlets @@ -22,9 +19,7 @@ # from metatlas.tools import environment from metatlas.datastructures import metatlas_objects as metob -# from metatlas.tools import notebook # noqa: E402 - -# notebook.setup("INFO") +logger = logging.getLogger(__name__) REFS_V3_FILE_NAME = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" @@ -125,7 +120,6 @@ class Proposal(TypedDict): class Spectrum(HasTraits): """List of intensities with list of corresponding MZ values""" - intensities: List[float] = traitlets.List(trait=Float()) mzs: List[float] = traitlets.List(trait=Float()) @@ -156,6 +150,11 @@ def _valid_mzs(self, proposal: Proposal) -> List[float]: return value +def str_to_spectrum(spectrum_str: str) -> Spectrum: + x = json.loads(str) + return Spectrum(msz=x[0], intensities=x[1]) + + class MsmsRef(HasTraits): # pylint: disable=too-few-public-methods,too-many-instance-attributes """one line from msms_refs file""" @@ -329,17 +328,17 @@ def is_bad(self): """ # pylint: disable=too-many-return-statements if self.fragmentation_method not in FRAG_METHODS: - logging.error('Invalid fragmentation method "%s" for %s.', self.fragmentation_method, self.name) + logger.error('Invalid fragmentation method "%s" for %s.', self.fragmentation_method, self.name) return True if not is_valid_inchi_pair(self.inchi, self.inchi_key): - logging.error("Invalid inchi/inchi_key pair for %s.", self.name) + logger.error("Invalid inchi/inchi_key pair for %s.", self.name) return True results = metob.retrieve("compounds", username="*", inchi_key=self.inchi_key) if len(results) == 0: return False ref_compound = results[0] if self.formula != ref_compound.formula: - logging.error( + logger.error( 'Formula "%s" for %s does not match value "%s" in database.', self.formula, self.name, @@ -347,7 +346,7 @@ def is_bad(self): ) return True if not math.isclose(self.exact_mass, 
ref_compound.mono_isotopic_molecular_weight, rel_tol=1e-9): - logging.error( + logger.error( "Exact mass %s for %s does not match value %s in database.", self.exact_mass, self.name, @@ -355,7 +354,7 @@ def is_bad(self): ) return True if not is_synonym(self.name, ref_compound.synonyms): - logging.error("Inchi_key %s does not contain name %s in database.", self.inchi_key, self.name) + logger.error("Inchi_key %s does not contain name %s in database.", self.inchi_key, self.name) return True return False From b309b346b66d0f8915f1208191d7f4da8553793f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Mon, 20 Sep 2021 16:23:45 -0700 Subject: [PATCH 164/177] Add docker image and notebook for adding MSMS refs --- docker/Dockerfile.msms_refs | 7 +- docker/requirements.txt | 1 + notebooks/reference/Add_MSMS_Reference.ipynb | 129 +++++++++++++++++++ noxfile.py | 1 + 4 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 notebooks/reference/Add_MSMS_Reference.ipynb diff --git a/docker/Dockerfile.msms_refs b/docker/Dockerfile.msms_refs index af3bf6a6..81bd3f5b 100644 --- a/docker/Dockerfile.msms_refs +++ b/docker/Dockerfile.msms_refs @@ -19,12 +19,7 @@ RUN apt-get update && \ COPY requirements.txt /requirements.txt -RUN pip install --quiet -r requirements.txt && \ - pip install qgrid && \ - jupyter nbextension enable --py --sys-prefix qgrid && \ - jupyter nbextension enable --py --sys-prefix widgetsnbextension && \ - jupyter labextension install @jupyter-widgets/jupyterlab-manager - +RUN pip install --quiet -r requirements.txt RUN mkdir -p /io /src /work $REFS_DIR diff --git a/docker/requirements.txt b/docker/requirements.txt index 88188325..a22e324a 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -6,6 +6,7 @@ gspread==3.7.0 h5py==3.2.1 humanize==3.5.0 ipympl==0.7.0 +ipysheet==0.5.0 ipywidgets==7.6.3 jupyterlab==3.0.16 matplotlib==3.4.2 diff --git a/notebooks/reference/Add_MSMS_Reference.ipynb b/notebooks/reference/Add_MSMS_Reference.ipynb new file mode 100644 index 00000000..8c26f415 --- /dev/null +++ b/notebooks/reference/Add_MSMS_Reference.ipynb @@ -0,0 +1,129 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d0440386-4864-4a26-a70d-910b4c1e99d4", + "metadata": {}, + "source": [ + "# Parameters\n", + "\n", + "The next code block sets parameters that are used throughout the remainder of the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a07e86e-ff53-4cf6-8f61-fad398f9441c", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# pylint: disable=invalid-name,missing-module-docstring\n", + "\n", + "# Full path to the directory where you have cloned the metatlas git repo.\n", + "# If you ran the 'git clone ...' 
command in your home directory on Cori,\n", + "# then you'll want '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas'\n", + "# where the uppercase letters are replaced based on your NERSC username.\n", + "metatlas_repo_path = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas\"\n", + "\n", + "# absolute path of the input file that contains the msms refs you want to append to\n", + "input_file_name = \"/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab\"\n", + "\n", + "# should the data validation be run on the input file?\n", + "# if True, you won't be able to export a file if input_file_name points to a file with any bad rows\n", + "validate_input_file = False\n", + "\n", + "# absolute path of where you want this notebook to store the new MSMS refs\n", + "output_file_name = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects/my_msms_refs.tab\"\n", + "\n", + "# maximum number of CPUs to use\n", + "# when running on jupyter.nersc.gov, you are not allowed to set this above 4\n", + "max_cpus = 4\n", + "\n", + "# Threshold for how much status information metatlas functions print in the notebook\n", + "# levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'\n", + "log_level = \"INFO\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2106bcab-ecfa-47cb-91e3-682ad57bd4ee", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b56b1e89-bd9c-4c11-896f-576c14a99c20", + "metadata": {}, + "outputs": [], + "source": [ + "# pylint: disable=wrong-import-position,import-error\n", + "import logging # noqa: E402\n", + "import os # noqa: E402\n", + "import sys # noqa: E402\n", + "\n", + "sys.path.insert(0, metatlas_repo_path)\n", + "logger = logging.getLogger(\"metatlas.jupyter\")\n", + "logger.debug(\"sys.executable=%s\", sys.executable)\n", + "logger.debug(\"sys.path=%s.\", sys.path)\n", + "logger.debug(\"metatlas_repo_path=%s.\", metatlas_repo_path)\n", + "if not os.path.exists(metatlas_repo_path):\n", + " logging.critical(\n", + " \"Directory set for metatlas_repo_path parameter (%s) does not exist or is not accessible.\",\n", + " metatlas_repo_path,\n", + " )\n", + " raise ValueError(\"Invalid metatlas_repo_path parameter in Jupyter Notebook.\")\n", + "try:\n", + " from metatlas.tools import environment # noqa: E402\n", + "\n", + " environment.validate_kernel()\n", + "except ModuleNotFoundError as err:\n", + " if str(err) == \"No module named 'metatlas.tools'\":\n", + " logging.critical(\n", + " (\"Could not find metatlas module at %s. 
\" \"In the Parameters block, please check the value of metatlas_repo_path.\"),\n", + " metatlas_repo_path,\n", + " )\n", + " else:\n", + " logger.critical('Please check that the kernel is set to \"Metatlas Targeted\".')\n", + " raise ModuleNotFoundError from err\n", + "except ImportError as err:\n", + " logging.critical(\"A newer version of metatlas_repo is required to use this notebook.\")\n", + " raise ImportError from err\n", + "from metatlas.tools import notebook # noqa: E402\n", + "from metatlas.tools import add_msms_ref as amr # noqa: E402\n", + "\n", + "notebook.setup(log_level)\n", + "amr.display_refs_edit_ui(input_file_name, output_file_name, validate_input_file=validate_input_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Metatlas Targeted", + "language": "python", + "name": "metatlas-targeted" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + }, + "toc-autonumbering": true, + "toc-showtags": false + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/noxfile.py b/noxfile.py index a4985478..cbcb8daa 100644 --- a/noxfile.py +++ b/noxfile.py @@ -40,6 +40,7 @@ notebooks = [ "notebooks/reference/Targeted.ipynb", "notebooks/reference/RT_Prediction.ipynb", + "notebooks/reference/Add_MSMS_Reference.ipynb", ] pytest_deps = [ From a8111d3ca4d874cb29e6b97b4a164e078363ad8f Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 21 Sep 2021 07:48:13 -0700 Subject: [PATCH 165/177] And edit GUI and tests for add_msms_ref --- metatlas/tools/add_msms_ref.py | 123 ++++++++++++++++++++++++-------- pyproject.toml | 1 + tests/unit/test_add_msms_ref.py | 83 ++++++++++++++++++++- 3 files changed, 174 insertions(+), 33 deletions(-) diff --git a/metatlas/tools/add_msms_ref.py b/metatlas/tools/add_msms_ref.py index 6cc837b3..dd8703b9 100644 --- a/metatlas/tools/add_msms_ref.py +++ b/metatlas/tools/add_msms_ref.py @@ -6,17 +6,18 @@ from typing import cast, Optional, List, TypedDict +import ipysheet +import ipywidgets as widgets import numpy as np import pandas as pd import traitlets -from traitlets import TraitError, default, validate -from traitlets import Float, HasTraits, Instance, Int, TraitType, Unicode - +from ipysheet import sheet, row, to_dataframe from pandas.api.types import CategoricalDtype from rdkit import Chem +from traitlets import TraitError, default, validate +from traitlets import Float, HasTraits, Instance, Int, TraitType, Unicode -# from metatlas.tools import environment from metatlas.datastructures import metatlas_objects as metob logger = logging.getLogger(__name__) @@ -59,6 +60,26 @@ frag_method_type = CategoricalDtype(categories=FRAG_METHODS, ordered=False) instrument_type_type = CategoricalDtype(categories=INSTRUMENT_TYPES, ordered=False) +REFS_TYPES = { + "database": "string", + "id": "string", + "name": "string", + "spectrum": "string", + "decimal": np.ushort, + "precursor_mz": np.float64, + "polarity": polarity_type, + "adduct": "string", + "fragmentation_method": frag_method_type, + "collision_energy": "string", + "instrument": "string", + "instrument_type": instrument_type_type, + "formula": "string", + "exact_mass": np.float64, + "inchi_key": "string", + "inchi": "string", + "smiles": "string", +} + def is_inchi(test_inchi: str) -> bool: """True if input can be parsed as an inchi string""" @@ -81,6 +102,8 @@ def 
is_valid_inchi_smiles_pair(test_inchi: str, test_smiles: str) -> bool: return is_inchi(test_inchi) mol_from_inchi = Chem.inchi.MolFromInchi(test_inchi) mol_from_smiles = Chem.MolFromSmiles(test_smiles) + if mol_from_inchi is None or mol_from_smiles is None: + return False return are_equal(mol_from_inchi, mol_from_smiles) @@ -119,10 +142,18 @@ class Proposal(TypedDict): class Spectrum(HasTraits): + # pylint: disable=too-few-public-methods """List of intensities with list of corresponding MZ values""" intensities: List[float] = traitlets.List(trait=Float()) mzs: List[float] = traitlets.List(trait=Float()) + def __init__(self, mzs: List[float], intensities: List[float], **kwargs) -> None: + """required fields are inputs""" + with self.hold_trait_notifications(): + super().__init__(**kwargs) + self.intensities = intensities + self.mzs = mzs + @validate("intensities") def _valid_intensities(self, proposal: Proposal) -> List[float]: """validate positive values, not empty, and same length as mzs list""" @@ -151,8 +182,9 @@ def _valid_mzs(self, proposal: Proposal) -> List[float]: def str_to_spectrum(spectrum_str: str) -> Spectrum: - x = json.loads(str) - return Spectrum(msz=x[0], intensities=x[1]) + """Converts a spectrum string into a Spectrum class instance""" + decoded = json.loads(spectrum_str) + return Spectrum(mzs=decoded[0], intensities=decoded[1]) class MsmsRef(HasTraits): @@ -361,27 +393,58 @@ def is_bad(self): def read_msms_refs(file_name: str, sep="\t", **kwargs) -> pd.DataFrame: """Read in msms refs from file with correct types""" - return pd.read_csv( - file_name, - sep=sep, - dtype={ - "database": "string", - "id": "string", - "name": "string", - "spectrum": "string", - "decimal": np.ushort, - "precursor_mz": np.float64, - "polarity": polarity_type, - "adduct": "string", - "fragmentation_method": frag_method_type, - "collision_energy": "string", - "instrument": "string", - "instrument_type": instrument_type_type, - "formula": "string", - "exact_mass": np.float64, - "inchi_key": "string", - "inchi": "string", - "smiles": "string", - }, - **kwargs, - ) + logger.info("Reading in existing references from %s", file_name) + return pd.read_csv(file_name, sep=sep, dtype=REFS_TYPES, **kwargs) + + +def get_empty_refs() -> pd.DataFrame: + """Returns an empty MSMS refs DataFrame with the correct columns and types""" + return pd.DataFrame(data={k: [] for k, v in REFS_TYPES.items()}).astype(REFS_TYPES) + + +def valid_refs(refs_df: pd.DataFrame) -> int: + """Return number of rows that fail validation in refs_df. Info on failures to logger.""" + return sum([1 if MsmsRef(**row).is_bad() else 0 for row in refs_df.rows()])
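The add_cells_for_last_row() helper below exists because ipysheet only exports cells that have actually been created; enlarging a sheet does not create cell objects for the new row, so to_dataframe() would silently drop it. A stripped-down sketch of the widget wiring this patch builds, with three hypothetical columns in place of the full refs schema (run in a notebook so the final VBox renders):

    import ipysheet
    import ipywidgets as widgets

    grid = ipysheet.sheet(rows=1, columns=3, column_headers=["name", "adduct", "spectrum"])
    ipysheet.row(0, [None, None, None])  # create the cells so the row can be exported

    add_row = widgets.Button(description="Add row")

    def on_add_row_clicked(_):
        grid.rows += 1
        ipysheet.row(grid.rows - 1, [None, None, None])

    add_row.on_click(on_add_row_clicked)
    widgets.VBox([grid, add_row])  # displayed when this is the last expression in a cell

+ + +def add_cells_for_last_row(refs_sheet: sheet) -> None: + """Creates cells for the last row. 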
Required to be able to export the row later""" + row(refs_sheet.rows - 1, [None for _, _ in enumerate(REFS_TYPES.keys())]) + + +def display_refs_edit_ui( + input_file_name: Optional[str], output_file_name: str, validate_input_file: bool = False +) -> widgets.Box: + """Create GUI spreadsheet for edited MSMS references""" + if input_file_name is None: + old_df = get_empty_refs() + else: + logger.info("Reading in existing references from %s", input_file_name) + old_df = read_msms_refs(input_file_name) + if validate_input_file: + old_pass = valid_refs(old_df) + logger.info("%s of %s references passed validation.", old_pass, len(old_df)) + + sheet1 = ipysheet.sheet(rows=1, columns=len(REFS_TYPES), column_headers=list(REFS_TYPES.keys())) + add_cells_for_last_row(sheet1) + auto_populate = widgets.Button(description="Auto-populate") + add_row = widgets.Button(description="Add row") + export = widgets.Button(description="Export") + + def on_auto_populate_clicked(_): + logger.info("Auto-populate button clicked.") + + def on_add_row_clicked(_): + sheet1.rows += 1 + add_cells_for_last_row(sheet1) + + def on_export_clicked(_): + old_df = read_msms_refs(input_file_name) + to_add_df = to_dataframe(sheet1).dropna() + updated_df = pd.concat([old_df, to_add_df], ignore_index=True) + updated_df.to_csv(output_file_name, sep="\t", index=False) + + auto_populate.on_click(on_auto_populate_clicked) + add_row.on_click(on_add_row_clicked) + export.on_click(on_export_clicked) + return widgets.VBox([sheet1, widgets.HBox([add_row, auto_populate, export])]) diff --git a/pyproject.toml b/pyproject.toml index 3296aa8c..4417e2cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ module = [ "humanize.*", "h5py.*", "ijson.*", + "ipysheet.*", "IPython.*", "ipywidgets.*", "labkey.*", diff --git a/tests/unit/test_add_msms_ref.py b/tests/unit/test_add_msms_ref.py index d18e05a3..6311cef8 100644 --- a/tests/unit/test_add_msms_ref.py +++ b/tests/unit/test_add_msms_ref.py @@ -1,11 +1,18 @@ """ unit tests for add_msms_refs module """ # pylint: disable=missing-function-docstring,line-too-long +import json import pytest import traitlets +from rdkit import Chem + from metatlas.tools import add_msms_ref +INCHI = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1" +INCHI_KEY = "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" +SMILES = "C1[C@@H]([C@@H](CO)O[C@H]1n1cnc2c(N)ncnc12)O" + def tests_msms_ref01(mocker, compound): mocker.patch("metatlas.datastructures.metatlas_objects.retrieve", return_value=[compound]) @@ -23,8 +30,8 @@ def tests_msms_ref01(mocker, compound): instrument_type="LC-ESI-QTOF", formula="C10H13N5O3", exact_mass=251.101839276, - inchi_key="OLXZPDWKRNYJJZ-RRKCRQDMSA-N", - inchi="InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + inchi_key=INCHI_KEY, + inchi=INCHI, ) @@ -46,5 +53,75 @@ def tests_msms_ref02(mocker, compound): formula="C10H13N5O3", exact_mass=251.101839276, inchi_key="xxx", - inchi="InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1", + inchi=INCHI, ) + + +def test_is_inchi01(): + assert add_msms_ref.is_inchi(INCHI) + assert not add_msms_ref.is_inchi("f{INCHI}BLAH") + assert not add_msms_ref.is_inchi("") + assert not add_msms_ref.is_inchi("InChI=") + + +def test_is_valid_inchi_pair(): + assert add_msms_ref.is_valid_inchi_pair(INCHI, INCHI_KEY) + assert not add_msms_ref.is_valid_inchi_pair("", 
INCHI_KEY) + assert not add_msms_ref.is_valid_inchi_pair(INCHI, "") + assert not add_msms_ref.is_valid_inchi_pair(f"{INCHI}foobar!", INCHI_KEY) + assert not add_msms_ref.is_valid_inchi_pair(INCHI, f"{INCHI_KEY}foobar!") + + +def test_is_valid_inchi_smiles_pair(): + assert add_msms_ref.is_valid_inchi_smiles_pair(INCHI, SMILES) + assert not add_msms_ref.is_valid_inchi_smiles_pair("", SMILES) + assert not add_msms_ref.is_valid_inchi_smiles_pair(INCHI, "") + assert not add_msms_ref.is_valid_inchi_smiles_pair(f"{INCHI}foobar!", SMILES) + assert not add_msms_ref.is_valid_inchi_smiles_pair(INCHI, f"{SMILES}foobar!") + + +def test_are_equal(): + mol1 = Chem.inchi.MolFromInchi(INCHI) + mol2 = Chem.inchi.MolFromInchi("InChI=1S/H2O/h1H2") + assert add_msms_ref.are_equal(mol1, mol1) + assert add_msms_ref.are_equal(mol2, mol2) + assert not add_msms_ref.are_equal(mol1, mol2) + assert not add_msms_ref.are_equal(mol2, mol1) + + +def test_is_synonym(): + assert add_msms_ref.is_synonym("foobar", "FOO///bar///FooZoo///FooBar") + assert add_msms_ref.is_synonym("foobar", "FOOBAR") + assert add_msms_ref.is_synonym("FooBar", "foobar///bar///FooZoo///FooBeeear") + assert not add_msms_ref.is_synonym("foobar", "") + assert not add_msms_ref.is_synonym("FooBarz", "foobar///bar///FooZoo///FooBeeear") + + +def test_spectrum01(): + add_msms_ref.Spectrum(intensities=[1.2, 1, 4], mzs=[123, 145, 256.04]) + + +def test_spectrum02(): + with pytest.raises(traitlets.TraitError): + add_msms_ref.Spectrum(intensities=[1.2, 1], mzs=[123, 145, 256.04]) + with pytest.raises(traitlets.TraitError): + add_msms_ref.Spectrum(intensities=[1.2, 1, 4], mzs=[123, 145]) + with pytest.raises(traitlets.TraitError): + add_msms_ref.Spectrum(intensities=[1.2, 1, 4], mzs=[]) + with pytest.raises(traitlets.TraitError): + add_msms_ref.Spectrum(intensities=[], mzs=[123]) + with pytest.raises(traitlets.TraitError): + add_msms_ref.Spectrum(intensities=[1], mzs=[-123]) + with pytest.raises(traitlets.TraitError): + add_msms_ref.Spectrum(intensities=[1, 1], mzs=[123, 22]) + + +def test_str_to_spectrum(): + spectrum1 = add_msms_ref.str_to_spectrum("[[123.456,145.789],[1.0,2.2]]") + assert spectrum1.mzs == [123.456, 145.789] + assert spectrum1.intensities == [1.0, 2.2] + spectrum2 = add_msms_ref.str_to_spectrum("[ [123.456, 145.789], [1.0, 2.2] ]") + assert spectrum2.mzs == [123.456, 145.789] + assert spectrum2.intensities == [1.0, 2.2] + with pytest.raises(json.JSONDecodeError): + add_msms_ref.str_to_spectrum("foobar") From 0ffafb5fa3ee5b1f309da2bfcccb7422c85c02cf Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Fri, 1 Oct 2021 08:57:08 -0700 Subject: [PATCH 166/177] WIP - add_msms_refs --- metatlas/tools/add_msms_ref.py | 451 +++++++++++++------ metatlas/tools/cheminfo.py | 23 + notebooks/reference/Add_MSMS_Reference.ipynb | 77 +++- 3 files changed, 395 insertions(+), 156 deletions(-) create mode 100644 metatlas/tools/cheminfo.py diff --git a/metatlas/tools/add_msms_ref.py b/metatlas/tools/add_msms_ref.py index dd8703b9..2d97c7a5 100644 --- a/metatlas/tools/add_msms_ref.py +++ b/metatlas/tools/add_msms_ref.py @@ -2,27 +2,32 @@ import json import logging import math +import os import uuid -from typing import cast, Optional, List, TypedDict +from typing import Any, cast, Dict, Optional, List, Sequence, Tuple, TypedDict import ipysheet import ipywidgets as widgets +import matchms import numpy as np import pandas as pd import traitlets -from ipysheet import sheet, row, to_dataframe from pandas.api.types import CategoricalDtype from rdkit import Chem -from 
traitlets import TraitError, default, validate -from traitlets import Float, HasTraits, Instance, Int, TraitType, Unicode +from traitlets import Float, HasTraits, Instance, Int, TraitError, TraitType, Unicode, validate from metatlas.datastructures import metatlas_objects as metob +from metatlas.io import metatlas_get_data_helper_fun as ma_data +from metatlas.plots import dill2plots as dp +from metatlas.tools import cheminfo logger = logging.getLogger(__name__) -REFS_V3_FILE_NAME = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" +NUM_STARTING_ROWS = 10 + +COLUMN_WIDTH = 3 POLARITIES = ["negative", "positive"] @@ -53,14 +58,13 @@ "QqIT", "QqLIT", "QqQ", - "cid", ] polarity_type = CategoricalDtype(categories=POLARITIES, ordered=True) frag_method_type = CategoricalDtype(categories=FRAG_METHODS, ordered=False) instrument_type_type = CategoricalDtype(categories=INSTRUMENT_TYPES, ordered=False) -REFS_TYPES = { +REFS_TYPES = { # these values are pandas dtypes "database": "string", "id": "string", "name": "string", @@ -80,6 +84,62 @@ "smiles": "string", } +REFS_DEFAULTS = { + "database": "", + "id": "", + "name": "", + "spectrum": "[[],[]]", + "decimal": 4, + "precursor_mz": 0, + "polarity": "positive", + "adduct": "", + "fragmentation_method": FRAG_METHODS[0], + "collision_energy": "0eV", + "instrument": "", + "instrument_type": INSTRUMENT_TYPES[0], + "formula": "", + "exact_mass": 0.0, + "inchi_key": "InChi=", + "inchi": "", + "smiles": "", +} + + +class Input(): + def __init__(self, identifier, label, basic_type, validator): + self.identifier = identifier + self.label = label + self.basic_type = basic_type + self.validator = validator + + +INPUTS = [ + Input("name", "Name", "text", lambda x: not is_bad_name(x["name"], x["inchi"])), + Input("molecule_id", "Inchi or Smiles", "text", lambda x: inchi_or_smiles_to_molecule(x["molecule_id"]) is not None), + Input("adduct", "Adduct", "text", lambda x: valid_adduct(x["adduct"])), + Input("instrument", "Instrument", "text", lambda x: len(x["instrument"]) > 0,), + Input("instrument_type", "Instrument Type", INSTRUMENT_TYPES, lambda x: x["instrument_type"] in INSTRUMENT_TYPES), + Input("fragmentation_method", "Fragmentation Method", FRAG_METHODS, lambda x: x["fragmentation_method"] in FRAG_METHODS), + Input("mz_tolerance", "m/z Tolerance", "numeric", lambda x: is_pos_number(x["mz_tolerance"])), + Input("rt_min", "Min RT", "numeric", lambda x: is_valid_rt_min(x["rt_min"], x["rt_max"])), + Input("rt_max", "Max RT", "numeric", lambda x: is_valid_rt_max(x["rt_min"], x["rt_max"])), + Input("h5_file_name", "File Name (.h5)", "text", lambda x: is_readable_file(x["h5_file_name"])), +] + +REFS_V3_FILE_NAME = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" + + +def is_number(value): + try: + float(value) + return True + except ValueError: + return False + + +def is_pos_number(value): + return is_number(value) and float(value) >= 0 + def is_inchi(test_inchi: str) -> bool: """True if input can be parsed as an inchi string""" @@ -107,13 +167,17 @@ def is_valid_inchi_smiles_pair(test_inchi: str, test_smiles: str) -> bool: return are_equal(mol_from_inchi, mol_from_smiles) +def inchi_or_smiles_to_molecule(molecule_id: str) -> Optional[Chem.rdchem.Mol]: + return Chem.inchi.MolFromInchi(molecule_id) or Chem.MolFromSmiles(molecule_id) + + def get_compound(inchi_key: str) -> Optional[metob.Compound]: """ Returns first compound from database matching inchi_key and with username pasteur or None if not found 
""" try: - return metob.retrieve("Compounds", inchi_key=inchi_key, username="pasteur")[0] + return metob.retrieve("Compounds", inchi_key=inchi_key, username="*")[0] except IndexError: return None @@ -133,6 +197,17 @@ def is_synonym(name: str, synonym_string: str) -> bool: return name.lower() in [x.lower() for x in synonym_string.split("///")] +def is_bad_name(name: str, inchi: str) -> bool: + """Returns true if the molecule is in the database but name is not a synonym""" + if len(name) == 0: + return True + inchi_key = Chem.inchi.InchiToInchiKey(inchi) + compound_result = get_compound(inchi_key) + if compound_result is None: + return False + return not is_synonym(name, compound_result.name) + + class Proposal(TypedDict): """for use with traitlets.validate""" @@ -147,19 +222,26 @@ class Spectrum(HasTraits): intensities: List[float] = traitlets.List(trait=Float()) mzs: List[float] = traitlets.List(trait=Float()) - def __init__(self, mzs: List[float], intensities: List[float], **kwargs) -> None: + def __init__(self, mzs: Sequence[float], intensities: Sequence[float], **kwargs) -> None: """required fields are inputs""" with self.hold_trait_notifications(): super().__init__(**kwargs) self.intensities = intensities self.mzs = mzs + def __repr__(self) -> str: + """Return representation of data""" + nested_list_form = [[f"{m:.5f}" for m in self.mzs], [f"{x:.3f}" for x in self.intensities]] + return str(nested_list_form).replace('\'', '') + + def __str__(self) -> str: + """Return string representation of data""" + return self.__repr__() + @validate("intensities") def _valid_intensities(self, proposal: Proposal) -> List[float]: """validate positive values, not empty, and same length as mzs list""" value = cast(List[float], proposal["value"]) - if len(value) == 0: - raise TraitError("length of intensities must be greater than 0") if len(value) != len(self.mzs): raise TraitError("length of intensities and mzs must be equal") if any(x <= 0 for x in value): @@ -170,8 +252,6 @@ def _valid_intensities(self, proposal: Proposal) -> List[float]: def _valid_mzs(self, proposal: Proposal) -> List[float]: """validate positive values, not empty, and same length as intensities list""" value = cast(List[float], proposal["value"]) - if len(value) == 0: - raise TraitError("length of mzs must be greater than 0") if len(value) != len(self.intensities): raise TraitError("length of intensities and mzs must be equal") if value != sorted(value): @@ -183,153 +263,161 @@ def _valid_mzs(self, proposal: Proposal) -> List[float]: def str_to_spectrum(spectrum_str: str) -> Spectrum: """Converts a spectrum string into a Spectrum class instance""" - decoded = json.loads(spectrum_str) + if spectrum_str is None or spectrum_str == '': + return Spectrum(mzs=[], intensities=[]) + try: + decoded = json.loads(spectrum_str) + except (TypeError, json.JSONDecodeError): + logger.error("Cannot convert '%s' to a Spectrum object, setting to empty spectrum", spectrum_str) + return Spectrum(mzs=[], intensities=[]) + if len(decoded) != 2: + logger.error("Invalid specturm '%s'. 
Truncating elements after first two lists.", spectrum_str) return Spectrum(mzs=decoded[0], intensities=decoded[1]) +def _valid_enum(proposal, name, values_list): + """generic validation for enumerated type""" + if proposal["value"] not in values_list: + raise TraitError(f"{name} must be one of {', '.join(values_list)}") + return proposal["value"] + + +def _valid_not_len_zero(proposal, name): + """generic validation for length greater than 0""" + if len(proposal["value"]) == 0: + raise TraitError(f"{name} cannot have a length of zero") + return proposal["value"] + + +def _valid_positive(proposal, name): + """generic validation for positive value""" + if proposal["value"] < 0: + raise TraitError(f"{name} must be positive") + return proposal["value"] + + +def valid_adduct(value): + adducts = matchms.importing.load_adducts_dict() + return matchms.utils.looks_like_adduct(value) and value in adducts + + +def is_readable_file(value): + return os.path.isfile(value) and os.access(value, os.R_OK) + + +def is_valid_rt_min(rt_min: Any, rt_max: Any) -> bool: + if not is_pos_number(rt_min): + return False + return is_pos_number(rt_max) and rt_min < rt_max + + +def is_valid_rt_max(rt_min: Any, rt_max: Any) -> bool: + if not is_pos_number(rt_max): + return False + return is_pos_number(rt_min) and rt_min < rt_max + + class MsmsRef(HasTraits): # pylint: disable=too-few-public-methods,too-many-instance-attributes """one line from msms_refs file""" - database: str = Unicode() - id: str = Unicode(default=uuid.uuid4()) - name: str = Unicode() - spectrum: Spectrum = Instance(klass=Spectrum) - decimal: np.ushort = Int(default_value=4) - precursor_mz: np.float64 = Float() - polarity: str = Unicode() - adduct: str = Unicode() - fragmentation_method: str = Unicode() - collision_energy: str = Unicode() - instrument: str = Unicode() - instrument_type: str = Unicode() - formula: str = Unicode() - exact_mass: np.float64 = Float() - inchi_key: str = Unicode() - inchi: str = Unicode() - smiles: str = Unicode() + database: str = Unicode(allow_none=True) + id: str = Unicode(default=uuid.uuid4(), allow_none=True) + name: str = Unicode(allow_none=True) + spectrum: Spectrum = Instance(klass=Spectrum, allow_none=True) + decimal: np.ushort = Int(default_value=4, allow_none=True) + precursor_mz: np.float64 = Float(allow_none=True) + polarity: str = Unicode(allow_none=True) + adduct: str = Unicode(allow_none=True) + fragmentation_method: str = Unicode(allow_none=True) + collision_energy: str = Unicode(allow_none=True) + instrument: str = Unicode(allow_none=True) + instrument_type: str = Unicode(allow_none=True) + formula: str = Unicode(allow_none=True) + exact_mass: np.float64 = Float(allow_none=True) + inchi_key: str = Unicode(allow_none=True) + inchi: str = Unicode(allow_none=True) + smiles: str = Unicode(allow_none=True) # pylint: disable=no-self-use,too-many-arguments - def __init__( - self, - name: str, - spectrum: Spectrum, - precursor_mz: np.float64, - polarity: str, - adduct: str, - fragmentation_method: str, - collision_energy: str, - instrument: str, - instrument_type: str, - formula: str, - exact_mass: np.float64, - inchi_key: str, - **kwargs, - ) -> None: - """required fields are inputs""" - with self.hold_trait_notifications(): - super().__init__(**kwargs) - self.name = name - self.spectrum = spectrum - self.precursor_mz = precursor_mz - self.polarity = polarity - self.adduct = adduct - self.fragmentation_method = fragmentation_method - self.collision_energy = collision_energy - self.instrument = instrument - 
self.instrument_type = instrument_type - self.formula = formula - self.exact_mass = exact_mass - self.inchi_key = inchi_key - if self.is_bad(): - raise ValueError("MSMS Ref does not pass validation") - - def _valid_enum(self, proposal, name, values_list): - """generic validation for enumerated type""" - if proposal["value"] not in values_list: - raise TraitError(f"{name} must be one of {', '.join(values_list)}") - return proposal["value"] + def __repr__(self) -> str: + not_na_values = [] + for k in REFS_TYPES: + value = getattr(self, k) if self.trait_has_value(k) else '' + not_na_values.append('' if value is None else str(value)) + return ';'.join(not_na_values) - def _valid_not_len_zero(self, proposal, name): - """generic validation for length greater than 0""" - if len(proposal["value"]) == 0: - raise TraitError(f"{name} cannot have a length of zero") - return proposal["value"] - - def _valid_positive(self, proposal, name): - """generic validation for positive value""" - if proposal["value"] < 0: - raise TraitError(f"{name} must be positive") - return proposal["value"] + def __str__(self) -> str: + return self.__repr__() @validate("database") def _valid_database(self, proposal): """valid if database string has positive length""" - return self._valid_not_len_zero(proposal, "database") + return _valid_not_len_zero(proposal, "database") @validate("id") def _valid_id(self, proposal): """valid if id string has positive length""" - return self._valid_not_len_zero(proposal, "id") + return _valid_not_len_zero(proposal, "id") @validate("name") def _valid_name(self, proposal): """valid if name string has positive length""" - return self._valid_not_len_zero(proposal, "name") + return _valid_not_len_zero(proposal, "name") @validate("decimal") def _valid_decimal(self, proposal): """valid if decimal is positive""" - return self._valid_positive(proposal, "decimal") + return _valid_positive(proposal, "decimal") @validate("precursor_mz") def _valid_precursor_mz(self, proposal): """valid if precursor_mz is positive""" - return self._valid_positive(proposal, "precursor_mz") + return _valid_positive(proposal, "precursor_mz") @validate("polarity") def _valid_polarity(self, proposal): """valid if polarity is in POLARITIES""" - return self._valid_enum(proposal, "polarity", POLARITIES) + return _valid_enum(proposal, "polarity", POLARITIES) @validate("adduct") def _valid_adduct(self, proposal): """valid if adduct string has positive length""" - return self._valid_not_len_zero(proposal, "adduct") + return _valid_not_len_zero(proposal, "adduct") @validate("fragmentation_method") def _valid_fragmentation_method(self, proposal): """valid if fragmentation_method in FRAG_METHODS""" - return self._valid_enum(proposal, "fragmentation_method", FRAG_METHODS) + return _valid_enum(proposal, "fragmentation_method", FRAG_METHODS) @validate("collision_energy") def _valid_collision_energy(self, proposal): """valid if collision_energy has positive length""" - return self._valid_not_len_zero(proposal, "collision_energy") + return _valid_not_len_zero(proposal, "collision_energy") @validate("instrument") def _valid_instrument(self, proposal): """valid if instrument has positive length""" - return self._valid_not_len_zero(proposal, "instrument") + return _valid_not_len_zero(proposal, "instrument") @validate("instrument_type") def _valid_instrument_type(self, proposal): """valid if instrument_type is in INSTRUMENT_TYPES""" - return self._valid_enum(proposal, "instrument_type", INSTRUMENT_TYPES) + return _valid_enum(proposal, 
"instrument_type", INSTRUMENT_TYPES) @validate("formula") def _valid_formula(self, proposal): """valid if formula has positive length""" - return self._valid_not_len_zero(proposal, "formula") + return _valid_not_len_zero(proposal, "formula") @validate("exact_mass") def _valid_exact_mass(self, proposal): """valid if exact_mass is positive""" - return self._valid_positive(proposal, "exact_mass") + return _valid_positive(proposal, "exact_mass") @validate("inchi_key") def _valid_inchi_key(self, proposal): """valid if inchi_key has positive length""" - return self._valid_not_len_zero(proposal, "inchi_key") + return _valid_not_len_zero(proposal, "inchi_key") @validate("inchi") def _valid_inchi(self, proposal): @@ -347,27 +435,43 @@ def _valid_smiles(self, proposal): raise TraitError("inchi and smiles do not represent the same molecule") return proposal["value"] - @default("smiles") + @traitlets.default("smiles") def _get_default_smiles(self): """generate smiles from inchi""" - return Chem.MolToSmiles(Chem.inchi.MolFromInchi(self.inchi)) + if self.inchi is not None and self.inchi != '': + return Chem.MolToSmiles(Chem.inchi.MolFromInchi(self.inchi)) + return None - def is_bad(self): + def has_missing_fields(self) -> bool: + """Returns True if there are fields with None values, logs an error message for each field missing""" + out = False + for name in REFS_TYPES: + value = getattr(self, name, None) + if value is None or value == '': + out = True + logger.error("No '%s' field in %s", name, str(self)) + return out + + def is_bad(self) -> bool: """ If returns True, then the inputs are bad, but if returns False do not assume the inputs are good returning False only means that there is no evidence the inputs are bad. Conclusively saying the inputs are good for unusual chemicals that are not in databases is hard. 
""" - # pylint: disable=too-many-return-statements + bad = self.has_missing_fields() if self.fragmentation_method not in FRAG_METHODS: logger.error('Invalid fragmentation method "%s" for %s.', self.fragmentation_method, self.name) - return True + bad = True if not is_valid_inchi_pair(self.inchi, self.inchi_key): logger.error("Invalid inchi/inchi_key pair for %s.", self.name) - return True + bad = True + if not is_valid_inchi_smiles_pair(self.inchi, self.smiles): + logger.error("Invalid inchi/smiles pair for %s.", self.name) + bad = True results = metob.retrieve("compounds", username="*", inchi_key=self.inchi_key) if len(results) == 0: - return False + logger.warning("Could not find inchi_key=%s in database (name=%s), so skipping some tests.", self.inchi_key, self.name) + return bad ref_compound = results[0] if self.formula != ref_compound.formula: logger.error( @@ -376,7 +480,7 @@ def is_bad(self): self.name, ref_compound.formula, ) - return True + bad = True if not math.isclose(self.exact_mass, ref_compound.mono_isotopic_molecular_weight, rel_tol=1e-9): logger.error( "Exact mass %s for %s does not match value %s in database.", @@ -384,17 +488,18 @@ def is_bad(self): self.name, ref_compound.mono_isotopic_molecular_weight, ) - return True + bad = True if not is_synonym(self.name, ref_compound.synonyms): - logger.error("Inchi_key %s does not contain name %s in database.", self.inchi_key, self.name) - return True - return False + logger.error("The entry with inchi_key=%s does not contain name '%s' in database.", self.inchi_key, self.name) + bad = True + return bad def read_msms_refs(file_name: str, sep="\t", **kwargs) -> pd.DataFrame: """Read in msms refs from file with correct types""" - logger.info("Reading in existing references from %s", file_name) - return pd.read_csv(file_name, sep=sep, dtype=REFS_TYPES, **kwargs) + file_df = pd.read_csv(file_name, sep=sep, dtype=REFS_TYPES, **kwargs) + logger.info("Read in %d existing references from %s", len(file_df), file_name) + return file_df def get_empty_refs() -> pd.DataFrame: @@ -402,49 +507,97 @@ def get_empty_refs() -> pd.DataFrame: return pd.DataFrame(data={k: [] for k, v in REFS_TYPES.items()}).astype(REFS_TYPES) -def valid_refs(refs_df: pd.DataFrame) -> int: +def df_row_to_ref(data: dict) -> MsmsRef: + """ converts a row from df.to_dict(orient='records') to a MsmsRef instance""" + data_minus_na = {k: v for k, v in data.items() if pd.notna(v)} + if 'spectrum' in data_minus_na: + data_minus_na['spectrum'] = str_to_spectrum(data_minus_na['spectrum']) + return MsmsRef(**data_minus_na) + + +def get_num_bad_refs(refs_df: pd.DataFrame) -> int: """Return number of rows that fail validation in refs_df. Info on failures to logger"" """ - return sum([1 if MsmsRef(**row).is_bad() else 0 for row in refs_df.rows()]) + return sum([0] + [0 if df_row_to_ref(row).is_bad() else 1 for row in refs_df.to_dict(orient='records')]) + + +def in_rt_mz_ranges(rt, rt_min, rt_max, mz, mz_target, mz_tol): + return dp.within_tolerance(mz, mz_target, mz_tol) and (rt_min <= rt <= rt_max) -def add_cells_for_last_row(refs_sheet: sheet) -> None: - """Creates cells for the last row. 
Required to be able to export the row later""" - row(refs_sheet.rows - 1, [None for _, _ in enumerate(REFS_TYPES.keys())]) +def extract_most_intense(in_df, rt_min, rt_max, mz_target, mz_tol): + group_cols = ['rt', 'polarity', 'precursor_MZ', 'precursor_intensity', 'collision_energy'] + in_tol_df = in_df.groupby(group_cols).filter(lambda x: in_rt_mz_ranges(x.iloc[0]['rt'], rt_min, rt_max, x.iloc[0]['precursor_MZ'], mz_target, mz_tol)) + precursor_intensity_max = in_tol_df['precursor_intensity'].max() + most_intense_df = in_tol_df.groupby(group_cols).filter(lambda x: precursor_intensity_max == x.iloc[0]['precursor_intensity']) + spectrum = Spectrum(tuple(most_intense_df['mz']), tuple(most_intense_df['i'])) + most_intense = most_intense_df.iloc[0] + return (spectrum, most_intense['rt'], most_intense['precursor_MZ'], most_intense['collision_energy']) -def display_refs_edit_ui( - input_file_name: Optional[str], output_file_name: str, validate_input_file: bool = False -) -> widgets.Box: - """Create GUI spreadsheet for edited MSMS references""" - if input_file_name is None: - old_df = get_empty_refs() +def extract_spectrum(h5_file_name, molecule_id, adduct, rt_min, rt_max, mz_tolerance) -> Spectrum: + if matchms.utils.is_valid_inchi(molecule_id): + mol = Chem.MolFromInchi(molecule_id) + elif matchms.utils.is_valid_smiles(molecule_id): + mol = Chem.MolFromSmiles(molecule_id) else: - logger.info("Reading in existsing references from %s", input_file_name) - old_df = read_msms_refs(input_file_name) - if validate_input_file: - old_pass = valid_refs(old_df) - logger.info("%s of %s references passed validation.", old_pass, len(old_df)) - - sheet1 = ipysheet.sheet(rows=1, columns=len(REFS_TYPES), column_headers=list(REFS_TYPES.keys())) - add_cells_for_last_row(sheet1) - auto_populate = widgets.Button(description="Auto-populate") - add_row = widgets.Button(description="Add row") - export = widgets.Button(description="Export") - - def on_auto_populate_clicked(_): - logger.info("Auto-populate button clicked.") - - def on_add_row_clicked(_): - sheet1.rows += 1 - add_cells_for_last_row(sheet1) - - def on_export_clicked(_): - old_df = read_msms_refs(input_file_name) - to_add_df = to_dataframe(sheet1).dropna() - updated_df = pd.concat([old_df, to_add_df], ignore_index=True) - updated_df.to_csv(output_file_name, sep="\t", index=False) - - auto_populate.on_click(on_auto_populate_clicked) - add_row.on_click(on_add_row_clicked) - export.on_click(on_export_clicked) - return widgets.VBox([sheet1, widgets.HBox([add_row, auto_populate, export])]) + raise ValueError(f"molecule_id '{molecule_id}' is not a valid inchi or smiles string") + h5_df = ma_data.df_container_from_metatlas_file(h5_file_name) + parent_mass = Chem.Descriptors.ExactMolWt(mol) + precursor_mz = cheminfo.get_precursor_mz(parent_mass, adduct) + return extract_most_intense(h5_df, rt_min, rt_max, precursor_mz, mz_tolerance) + + +def sheet_row_to_spectrum(input_sheet, input_defs, row_num) -> Spectrum: + row_dict = row_list_to_dict(input_sheet.cells[0].value[row_num], input_defs) + return extract_spectrum(row_dict["h5_file_name"], row_dict["inchi"], row_dict["adduct"], float(row_dict["rt_min"]), float(row_dict["rt_max"]), float(row_dict["mz_tolerance"])) + + +def row_col_to_cell_num(in_sheet: ipysheet.sheet, row_num: int, col_num: int) -> int: + return in_sheet.columns * row_num + col_num + + +def row_list_to_dict(values: List[Any], input_defs: List[Input]) -> Dict[str, Any]: + return dict(zip([x.identifier for x in input_defs], values)) + + +def 
get_invalid_cells(input_sheet: ipysheet.sheet, input_defs: List[Input]) -> List[Tuple[int, str]]: + bad_cells = [] + for row_num, values in enumerate(input_sheet.cells[0].value): + row_dict = row_list_to_dict(input_defs, values) + for column_num, current_def in enumerate(input_defs): + try: + is_good = current_def.validators(row_dict) + except Exception: + is_good = False + if not is_good: + bad_cells.append((row_num, current_def.label)) + return bad_cells + + +def spectrums_from_sheet(input_sheet): + for row_num in range(input_sheet.rows): + sheet_row_to_spectrum(input_sheet, INPUTS, row_num) + pass + + +def display_inputs_ui(num_rows: int) -> widgets.Box: + """Display spreadsheet for entering input values""" + col_headers = [x.label for x in INPUTS] + input_sheet = ipysheet.sheet(rows=num_rows, columns=len(INPUTS), column_headers=col_headers, column_width=COLUMN_WIDTH, column_resizing=False) + ipysheet.easy.cell_range([['']*len(INPUTS)]*num_rows) + extract = widgets.Button(description="Extract Spectrums") + log_box = widgets.Output() + + def on_extract_clicked(_): + log_box.clear_output() + invalid = get_invalid_cells(input_sheet, [x.validator for x in INPUTS]) + with log_box: + for row_num, col_name in invalid: + logger.error("In row %d, invalid value for '%s'.", row_num+1, col_name) + if len(invalid) > 0: + logger.error("All inputs must pass validation before spectrum extraction") + return + spectrums_from_sheet(input_sheet) + + extract.on_click(on_extract_clicked) + return widgets.VBox([input_sheet, extract, log_box]) diff --git a/metatlas/tools/cheminfo.py b/metatlas/tools/cheminfo.py new file mode 100644 index 00000000..5f1fc5ba --- /dev/null +++ b/metatlas/tools/cheminfo.py @@ -0,0 +1,23 @@ +"""cheminformatics related functions""" + +import matchms +import numpy as np + + +def get_parent_mass(precursor_mz: float, adduct: str) -> float: + """Returns the mass of the input molecule that would result in the supplied precursor_mz and adduct""" + dummy = matchms.Spectrum(mz=np.array([]), + intensities=np.array([]), + metadata={"precursor_mz": precursor_mz, "adduct": adduct}) + updated = matchms.filtering.add_parent_mass(dummy) + return updated.metadata['parent_mass'] + + +def get_precursor_mz(parent_mass: float, adduct: str) -> float: + """For an input molecule with parent_mass that generates adduct, return the resutling precursor_mz""" + adducts = matchms.importing.load_adducts_dict() + if adduct not in adducts: + raise KeyError("Adduct '%s' is not supported") + multiplier = adducts[adduct]["mass_multiplier"] + correction_mass = adducts[adduct]["correction_mass"] + return (parent_mass + correction_mass) / multiplier diff --git a/notebooks/reference/Add_MSMS_Reference.ipynb b/notebooks/reference/Add_MSMS_Reference.ipynb index 8c26f415..4b6cf06a 100644 --- a/notebooks/reference/Add_MSMS_Reference.ipynb +++ b/notebooks/reference/Add_MSMS_Reference.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a07e86e-ff53-4cf6-8f61-fad398f9441c", "metadata": { "tags": [ @@ -36,6 +36,9 @@ "# if True, you won't be able to export a file if input_file_name points to a file with any bad rows\n", "validate_input_file = False\n", "\n", + "# how many spectrum to import\n", + "num_rows_to_add = 5\n", + "\n", "# absolute path of where you want this notebook to store the new MSMS refs\n", "output_file_name = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects/my_msms_refs.tab\"\n", "\n", @@ -50,18 +53,64 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 2, "id": "2106bcab-ecfa-47cb-91e3-682ad57bd4ee", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "metatlas_repo_path = '/src'\n", + "input_file_name = \"/work/short_refs.tab\"\n", + "output_file_name = \"/work/updated_refs.tab\"\n", + "num_rows_to_add = 2" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "b56b1e89-bd9c-4c11-896f-576c14a99c20", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021-10-01 03:50:15 \u001b[32mINFO \u001b[0m Running on git commit: b'a8111d3ca4d874cb29e6b97b4a164e078363ad8f'\n" + ] + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021-10-01 03:50:15 \u001b[32mINFO \u001b[0m NERSC=False\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "27a4ea30b18d4fc3bb2d926b34a7bafa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Sheet(cells=(Cell(column_end=9, column_start=0, row_end=1, row_start=0, squeeze_column=False, s…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# pylint: disable=wrong-import-position,import-error\n", "import logging # noqa: E402\n", @@ -96,11 +145,25 @@ " logging.critical(\"A newer version of metatlas_repo is required to use this notebook.\")\n", " raise ImportError from err\n", "from metatlas.tools import notebook # noqa: E402\n", - "from metatlas.tools import add_msms_ref as amr # noqa: E402\n", + "\n", "\n", "notebook.setup(log_level)\n", - "amr.display_refs_edit_ui(input_file_name, output_file_name, validate_input_file=validate_input_file)" + "from metatlas.tools import add_msms_ref as amr # noqa: E402\n", + "\n", + "\n", + "import os\n", + "os.chdir('/work')\n", + "\n", + "amr.display_inputs_ui(num_rows_to_add)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b08bbc83-e9cb-495b-9689-184e5b8e57c1", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From a1d18550573843708d0eb77fea1f8fe91ab71e9d Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 5 Oct 2021 17:15:20 -0700 Subject: [PATCH 167/177] refactor add_msms_refs, and system level test --- docker/Dockerfile.ci01 | 16 +- docker/local_jupyter.sh | 2 +- docker/requirements.txt | 1 + .../compounds/structure_cleaning.py | 24 +- metatlas/tools/add_msms_ref.py | 486 +++++++++++------- metatlas/tools/cheminfo.py | 116 ++++- notebooks/reference/Add_MSMS_Reference.ipynb | 91 +--- noxfile.py | 1 + pyproject.toml | 1 + tests/system/test_add_msms_ref.py | 34 ++ tests/unit/test_add_msms_ref.py | 102 ++-- tests/unit/test_cheminfo.py | 53 ++ 12 files changed, 592 insertions(+), 335 deletions(-) create mode 100644 tests/system/test_add_msms_ref.py create mode 100644 tests/unit/test_cheminfo.py diff --git a/docker/Dockerfile.ci01 b/docker/Dockerfile.ci01 index a14598f8..7c5e1980 100644 --- a/docker/Dockerfile.ci01 +++ b/docker/Dockerfile.ci01 @@ -1,4 +1,4 @@ -FROM python:3.8-slim-buster +FROM python:3.8-slim-bullseye # https://portal.nersc.gov/cfs/m2650/metatlas/test_data # serves from /global/cfs/cdirs/m2650/www/metatlas/test_data @@ -27,6 +27,13 @@ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG ADD 
$BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5 $H5_DIR/ ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5 $H5_DIR/ +# also get the mzML files, as these are used in matchms within add_msms_refs +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.mzML $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.mzML $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.mzML $H5_DIR/ +ADD $BASE_DATA_URL/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.mzML $H5_DIR/ + + ADD $BASE_DATA_URL/meta_atlas.sqlite3 /work/root_workspace.db RUN mkdir -p /root/.local/share/jupyter/kernels/metatlas-targeted @@ -34,7 +41,10 @@ COPY kernel.json /root/.local/share/jupyter/kernels/metatlas-targeted/kernel.jso WORKDIR /work -RUN apt-get update && apt-get install -y libxrender1 && \ +RUN apt-get update && apt-get install -y \ + libxrender1 \ + nodejs \ + npm && \ rm -rf /var/lib/apt/lists/* -CMD ["/usr/local/bin/jupyter", "nbclassic", "--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] +CMD ["/usr/local/bin/jupyter", "lab", "--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] diff --git a/docker/local_jupyter.sh b/docker/local_jupyter.sh index df632dfa..61379f19 100755 --- a/docker/local_jupyter.sh +++ b/docker/local_jupyter.sh @@ -5,7 +5,7 @@ set -euf -o pipefail SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" REPO_DIR=$(dirname "$SCRIPT_DIR") OUT_DIR="${SCRIPT_DIR}/out" -IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.2' +IMAGE='registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.4' PORT=8888 while [[ "$#" -gt 0 ]]; do diff --git a/docker/requirements.txt b/docker/requirements.txt index a22e324a..4588a851 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -10,6 +10,7 @@ ipysheet==0.5.0 ipywidgets==7.6.3 jupyterlab==3.0.16 matplotlib==3.4.2 +matchms==0.9.2 oauth2client==4.1.3 pandas==1.2.4 papermill==2.3.3 diff --git a/metatlas/interfaces/compounds/structure_cleaning.py b/metatlas/interfaces/compounds/structure_cleaning.py index bc9b805b..013e8311 100644 --- a/metatlas/interfaces/compounds/structure_cleaning.py +++ b/metatlas/interfaces/compounds/structure_cleaning.py @@ -1,10 +1,12 @@ -from __future__ import absolute_import import sys -sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages') + from rdkit import Chem from rdkit.Chem import AllChem from rdkit.Chem import Descriptors from rdkit.Chem import rdMolDescriptors +from rdkit.Chem.rdmolops import SanitizeMol +from rdkit.Chem.inchi import MolToInchi, MolFromInchi + """ contribution from Hans de Winter """ def _InitialiseNeutralisationReactions(): @@ -33,9 +35,9 @@ def _InitialiseNeutralisationReactions(): def neutralizeRadicals(mol): for a in mol.GetAtoms(): if a.GetNumRadicalElectrons()==1 and a.GetFormalCharge()==1: - a.SetNumRadicalElectrons(0) + a.SetNumRadicalElectrons(0) 
a.SetFormalCharge(0) - + _reactions=None def NeutraliseCharges(mol, reactions=None): global _reactions @@ -72,7 +74,7 @@ def desalt(mol): mol = MolFromInchi(mol) SanitizeMol(mol) d = Chem.rdmolops.GetMolFrags(mol) #these are atom indices - if len(d) == 1: #If there are fragments or multiple molecules this will be greater than 1 + if len(d) == 1: #If there are fragments or multiple molecules this will be greater than 1 return mol,False my_smiles=Chem.MolToSmiles(mol,True) parent_atom_count=0; @@ -101,11 +103,11 @@ def desalt_compounds_in_dataframe(x): return c[0] else: return x - + def neutralize_compounds_in_dataframe(x): ''' - df.ROMol = df.ROMol.apply(neutralize_compounds_in_dataframe) + df.ROMol = df.ROMol.apply(neutralize_compounds_in_dataframe) ''' if x: if x.GetNumAtoms()> 0: @@ -117,11 +119,11 @@ def neutralize_compounds_in_dataframe(x): pass if neutral_mol: return neutral_mol - + def calculate_num_radicals_in_dataframe(x): num_radicals = 0.0 if x: - num_radicals = Descriptors.NumRadicalElectrons(x) + num_radicals = Descriptors.NumRadicalElectrons(x) return num_radicals def calculate_formula_in_dataframe(x): @@ -162,10 +164,10 @@ def calculate_inchikey_in_dataframe(x): try: ik = Chem.InchiToInchiKey(x) except: - pass#This fails when can't kekulize mol. Carbo-cations are the culprit usually. + pass#This fails when can't kekulize mol. Carbo-cations are the culprit usually. return ik def calculate_charge_in_dataframe(x): if x: my_charge = Chem.GetFormalCharge(x) - return my_charge \ No newline at end of file + return my_charge diff --git a/metatlas/tools/add_msms_ref.py b/metatlas/tools/add_msms_ref.py index 2d97c7a5..15e53f09 100644 --- a/metatlas/tools/add_msms_ref.py +++ b/metatlas/tools/add_msms_ref.py @@ -1,11 +1,13 @@ """ For minipulating msms_refs files """ +# pylint: disable=too-few-public-methods,missing-function-docstring,too-many-arguments + import json import logging import math import os import uuid -from typing import Any, cast, Dict, Optional, List, Sequence, Tuple, TypedDict +from typing import Any, cast, Dict, Optional, List, Mapping, Sequence, Tuple, TypedDict, Union import ipysheet import ipywidgets as widgets @@ -16,6 +18,7 @@ from pandas.api.types import CategoricalDtype from rdkit import Chem +from rdkit.Chem.Descriptors import ExactMolWt from traitlets import Float, HasTraits, Instance, Int, TraitError, TraitType, Unicode, validate from metatlas.datastructures import metatlas_objects as metob @@ -25,9 +28,9 @@ logger = logging.getLogger(__name__) -NUM_STARTING_ROWS = 10 +COLUMN_WIDTH = 5 -COLUMN_WIDTH = 3 +NEW_REFS_DB_NAME = "NorthernLabAddition:NoDB" POLARITIES = ["negative", "positive"] @@ -84,28 +87,26 @@ "smiles": "string", } -REFS_DEFAULTS = { - "database": "", - "id": "", - "name": "", - "spectrum": "[[],[]]", - "decimal": 4, - "precursor_mz": 0, - "polarity": "positive", - "adduct": "", - "fragmentation_method": FRAG_METHODS[0], - "collision_energy": "0eV", - "instrument": "", - "instrument_type": INSTRUMENT_TYPES[0], - "formula": "", - "exact_mass": 0.0, - "inchi_key": "InChi=", - "inchi": "", - "smiles": "", -} +HELP_TEXT = ( + "Compound Name must be in the synonym list for the corresponding database entry\n" + "Inchi or Smiles cannot be an Inchi Key\n" + "The supported adducts can be found at " + "https://github.com/matchms/matchms/blob/master/matchms/data/known_adducts_table.csv\n" + "Instrument should contain a model name\n" + f"Allowed values for Instrument Type are {', '.join(INSTRUMENT_TYPES)}.\n" + f"Allowed values for Frag. 
Method are {', '.join(FRAG_METHODS)}.\n" + "m/z Tol. is a relative tolerance value in expressed units of parts per million\n" + "File Name should contain an absolute path.\n" + "\n" + "All rows must be filled in. In the parameter block, set num_rows_to_add to change the number of rows." +) + +GROUP_SPECTRUM_COLS = ["rt", "polarity", "precursor_MZ", "precursor_intensity", "collision_energy"] -class Input(): +class Input: + """Properties of an input to the spectrum extraction""" + def __init__(self, identifier, label, basic_type, validator): self.identifier = identifier self.label = label @@ -114,22 +115,62 @@ def __init__(self, identifier, label, basic_type, validator): INPUTS = [ - Input("name", "Name", "text", lambda x: not is_bad_name(x["name"], x["inchi"])), - Input("molecule_id", "Inchi or Smiles", "text", lambda x: inchi_or_smiles_to_molecule(x["molecule_id"]) is not None), - Input("adduct", "Adduct", "text", lambda x: valid_adduct(x["adduct"])), - Input("instrument", "Instrument", "text", lambda x: len(x["instrument"]) > 0,), - Input("instrument_type", "Instrument Type", INSTRUMENT_TYPES, lambda x: x["instrument_type"] in INSTRUMENT_TYPES), - Input("fragmentation_method", "Fragmentation Method", FRAG_METHODS, lambda x: x["fragmentation_method"] in FRAG_METHODS), - Input("mz_tolerance", "m/z Tolerance", "numeric", lambda x: is_pos_number(x["mz_tolerance"])), - Input("rt_min", "Min RT", "numeric", lambda x: is_valid_rt_min(x["rt_min"], x["rt_max"])), - Input("rt_max", "Max RT", "numeric", lambda x: is_valid_rt_max(x["rt_min"], x["rt_max"])), + Input("name", "Compound Name", "text", lambda x: not is_bad_name(x["name"], x["molecule_id"])), + Input( + "molecule_id", + "Inchi or Smiles", + "text", + lambda x: cheminfo.inchi_or_smiles_to_molecule(x["molecule_id"]) is not None, + ), + Input("adduct", "Adduct", "text", lambda x: cheminfo.valid_adduct(x["adduct"])), + Input( + "instrument", + "Instrument", + "text", + lambda x: len(x["instrument"]) > 0, + ), + Input( + "instrument_type", + "Instrument Type", + INSTRUMENT_TYPES, + lambda x: x["instrument_type"] in INSTRUMENT_TYPES, + ), + Input( + "fragmentation_method", + "Frag. Method", + FRAG_METHODS, + lambda x: x["fragmentation_method"] in FRAG_METHODS, + ), + Input("mz_tolerance", "m/z Tol. 
[ppm]", "numeric", lambda x: is_pos_number(x["mz_tolerance"])), + Input("rt_min", "Min RT [min.]", "numeric", lambda x: is_valid_rt_min(x["rt_min"], x["rt_max"])), + Input("rt_max", "Max RT [min.]", "numeric", lambda x: is_valid_rt_max(x["rt_min"], x["rt_max"])), Input("h5_file_name", "File Name (.h5)", "text", lambda x: is_readable_file(x["h5_file_name"])), ] -REFS_V3_FILE_NAME = "/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab" + +class InputDict(TypedDict, total=False): + """Type for holding one row from input sheet""" + name: str + molecule_id: str + adduct: str + instrument: str + instrument_type: str + fragmentation_method: str + h5_file_name: str + mz_tolerance: float + rt_min: float + rt_max: float -def is_number(value): +def to_input_dict(data: Mapping[str, Any]) -> InputDict: + result = InputDict() + for key, key_type in InputDict.__annotations__.items(): # pylint: disable=no-member + if key in data: + result[key] = key_type(data[key]) # type: ignore + return result + + +def is_number(value: Any) -> bool: try: float(value) return True @@ -137,38 +178,15 @@ def is_number(value): return False -def is_pos_number(value): - return is_number(value) and float(value) >= 0 - - -def is_inchi(test_inchi: str) -> bool: - """True if input can be parsed as an inchi string""" - return Chem.inchi.MolFromInchi(test_inchi) is not None - - -def is_valid_inchi_pair(test_inchi: str, test_inchi_key: str) -> bool: - """True if if test_inchi has the inchi key test_inchi_key""" - if not is_inchi(test_inchi): - return False - return test_inchi_key == Chem.inchi.InchiToInchiKey(test_inchi) - - -def is_valid_inchi_smiles_pair(test_inchi: str, test_smiles: str) -> bool: - """ - True if test_inchi and test_smiles have the same structure. 
- Also True if test_smiles is None and test_inchi is a valid inchi - """ - if pd.isna(test_smiles): - return is_inchi(test_inchi) - mol_from_inchi = Chem.inchi.MolFromInchi(test_inchi) - mol_from_smiles = Chem.MolFromSmiles(test_smiles) - if mol_from_inchi is None or mol_from_smiles is None: - return False - return are_equal(mol_from_inchi, mol_from_smiles) +def to_float(value: str) -> Union[float, str]: + try: + return float(value) + except ValueError: + return value -def inchi_or_smiles_to_molecule(molecule_id: str) -> Optional[Chem.rdchem.Mol]: - return Chem.inchi.MolFromInchi(molecule_id) or Chem.MolFromSmiles(molecule_id) +def is_pos_number(value: Any) -> bool: + return is_number(value) and float(value) >= 0 def get_compound(inchi_key: str) -> Optional[metob.Compound]: @@ -182,21 +200,6 @@ def get_compound(inchi_key: str) -> Optional[metob.Compound]: return None -def are_equal(molecule1: Chem.rdchem.Mol, molecule2: Chem.rdchem.Mol) -> bool: - """True if both molecules are substructures of each other""" - return molecule1.HasSubstructMatch(molecule2) and molecule2.HasSubstructMatch(molecule1) - - -def is_synonym(name: str, synonym_string: str) -> bool: - """ - Inputs: - name: string to check for within synonym_string - synonym_string: string with /// between names - Returns True if case insensitive match of name to full name in synonym_string - """ - return name.lower() in [x.lower() for x in synonym_string.split("///")] - - def is_bad_name(name: str, inchi: str) -> bool: """Returns true if the molecule is in the database but name is not a synonym""" if len(name) == 0: @@ -205,7 +208,7 @@ def is_bad_name(name: str, inchi: str) -> bool: compound_result = get_compound(inchi_key) if compound_result is None: return False - return not is_synonym(name, compound_result.name) + return not cheminfo.is_synonym(name, compound_result.name) class Proposal(TypedDict): @@ -226,13 +229,13 @@ def __init__(self, mzs: Sequence[float], intensities: Sequence[float], **kwargs) """required fields are inputs""" with self.hold_trait_notifications(): super().__init__(**kwargs) - self.intensities = intensities - self.mzs = mzs + self.intensities = list(intensities) + self.mzs = list(mzs) def __repr__(self) -> str: """Return representation of data""" nested_list_form = [[f"{m:.5f}" for m in self.mzs], [f"{x:.3f}" for x in self.intensities]] - return str(nested_list_form).replace('\'', '') + return str(nested_list_form).replace("'", "") def __str__(self) -> str: """Return string representation of data""" @@ -263,7 +266,7 @@ def _valid_mzs(self, proposal: Proposal) -> List[float]: def str_to_spectrum(spectrum_str: str) -> Spectrum: """Converts a spectrum string into a Spectrum class instance""" - if spectrum_str is None or spectrum_str == '': + if spectrum_str is None or spectrum_str == "": return Spectrum(mzs=[], intensities=[]) try: decoded = json.loads(spectrum_str) @@ -296,11 +299,6 @@ def _valid_positive(proposal, name): return proposal["value"] -def valid_adduct(value): - adducts = matchms.importing.load_adducts_dict() - return matchms.utils.looks_like_adduct(value) and value in adducts - - def is_readable_file(value): return os.path.isfile(value) and os.access(value, os.R_OK) @@ -342,112 +340,133 @@ class MsmsRef(HasTraits): def __repr__(self) -> str: not_na_values = [] for k in REFS_TYPES: - value = getattr(self, k) if self.trait_has_value(k) else '' - not_na_values.append('' if value is None else str(value)) - return ';'.join(not_na_values) + value = getattr(self, k) if self.trait_has_value(k) else "" + 
not_na_values.append("" if value is None else str(value)) + return ";".join(not_na_values) def __str__(self) -> str: return self.__repr__() @validate("database") - def _valid_database(self, proposal): + def _valid_database(self, proposal: Proposal) -> str: """valid if database string has positive length""" return _valid_not_len_zero(proposal, "database") @validate("id") - def _valid_id(self, proposal): + def _valid_id(self, proposal: Proposal) -> str: """valid if id string has positive length""" return _valid_not_len_zero(proposal, "id") @validate("name") - def _valid_name(self, proposal): + def _valid_name(self, proposal: Proposal) -> str: """valid if name string has positive length""" return _valid_not_len_zero(proposal, "name") @validate("decimal") - def _valid_decimal(self, proposal): + def _valid_decimal(self, proposal: Proposal) -> int: """valid if decimal is positive""" return _valid_positive(proposal, "decimal") @validate("precursor_mz") - def _valid_precursor_mz(self, proposal): + def _valid_precursor_mz(self, proposal: Proposal) -> float: """valid if precursor_mz is positive""" return _valid_positive(proposal, "precursor_mz") @validate("polarity") - def _valid_polarity(self, proposal): + def _valid_polarity(self, proposal: Proposal) -> str: """valid if polarity is in POLARITIES""" return _valid_enum(proposal, "polarity", POLARITIES) @validate("adduct") - def _valid_adduct(self, proposal): + def _valid_adduct(self, proposal: Proposal) -> str: """valid if adduct string has positive length""" return _valid_not_len_zero(proposal, "adduct") @validate("fragmentation_method") - def _valid_fragmentation_method(self, proposal): + def _valid_fragmentation_method(self, proposal: Proposal) -> str: """valid if fragmentation_method in FRAG_METHODS""" return _valid_enum(proposal, "fragmentation_method", FRAG_METHODS) @validate("collision_energy") - def _valid_collision_energy(self, proposal): + def _valid_collision_energy(self, proposal: Proposal) -> str: """valid if collision_energy has positive length""" return _valid_not_len_zero(proposal, "collision_energy") @validate("instrument") - def _valid_instrument(self, proposal): + def _valid_instrument(self, proposal: Proposal) -> str: """valid if instrument has positive length""" return _valid_not_len_zero(proposal, "instrument") @validate("instrument_type") - def _valid_instrument_type(self, proposal): + def _valid_instrument_type(self, proposal: Proposal) -> str: """valid if instrument_type is in INSTRUMENT_TYPES""" return _valid_enum(proposal, "instrument_type", INSTRUMENT_TYPES) @validate("formula") - def _valid_formula(self, proposal): + def _valid_formula(self, proposal: Proposal) -> str: """valid if formula has positive length""" return _valid_not_len_zero(proposal, "formula") @validate("exact_mass") - def _valid_exact_mass(self, proposal): + def _valid_exact_mass(self, proposal: Proposal) -> float: """valid if exact_mass is positive""" return _valid_positive(proposal, "exact_mass") @validate("inchi_key") - def _valid_inchi_key(self, proposal): + def _valid_inchi_key(self, proposal: Proposal) -> str: """valid if inchi_key has positive length""" return _valid_not_len_zero(proposal, "inchi_key") @validate("inchi") - def _valid_inchi(self, proposal): + def _valid_inchi(self, proposal: Proposal) -> str: """valid if inchi matches with inchi_key""" - if not is_inchi(proposal["value"]): + new_value = str(proposal["value"]) + if not matchms.utils.is_valid_inchi(new_value): raise TraitError("not valid inchi") - if not 
is_valid_inchi_pair(proposal["value"], self.inchi_key): + if not cheminfo.is_valid_inchi_pair(new_value, self.inchi_key): raise TraitError("inchi and inchi_key do not represent the same molecule") - return proposal["value"] + if not cheminfo.is_valid_inchi_smiles_pair(new_value, self.smiles): + raise TraitError("inchi and smiles do not represent the same molecule") + return new_value @validate("smiles") - def _valid_smiles(self, proposal): + def _valid_smiles(self, proposal: Proposal) -> str: """valid if smiles matches with inchi""" - if not is_valid_inchi_smiles_pair(self.inchi, proposal["value"]): + new_value = str(proposal["value"]) + if not matchms.utils.is_valid_smiles(new_value): + raise TraitError("Invalid smiles values") + if not cheminfo.is_valid_inchi_smiles_pair(self.inchi, new_value): raise TraitError("inchi and smiles do not represent the same molecule") - return proposal["value"] + return new_value + + @traitlets.default("formula") + def _get_default_formula(self) -> Optional[str]: + """generate formula from inchi""" + mol = Chem.MolFromSmiles(self.smiles) + if mol is None: + return mol + return Chem.rdMolDescriptors.CalcMolFormula(mol) @traitlets.default("smiles") - def _get_default_smiles(self): + def _get_default_smiles(self) -> Optional[str]: """generate smiles from inchi""" - if self.inchi is not None and self.inchi != '': + if self.inchi is not None and self.inchi != "": return Chem.MolToSmiles(Chem.inchi.MolFromInchi(self.inchi)) return None + @traitlets.default("inchi") + def _get_default_inchi(self) -> Optional[str]: + """generate inchi from smiles""" + if self.smiles is not None and self.smiles != "": + return Chem.inchi.MolToInchi(Chem.MolFromSmiles(self.smiles)) + return None + def has_missing_fields(self) -> bool: """Returns True if there are fields with None values, logs an error message for each field missing""" out = False for name in REFS_TYPES: value = getattr(self, name, None) - if value is None or value == '': + if value is None or value == "": out = True logger.error("No '%s' field in %s", name, str(self)) return out @@ -462,15 +481,19 @@ def is_bad(self) -> bool: if self.fragmentation_method not in FRAG_METHODS: logger.error('Invalid fragmentation method "%s" for %s.', self.fragmentation_method, self.name) bad = True - if not is_valid_inchi_pair(self.inchi, self.inchi_key): + if not cheminfo.is_valid_inchi_pair(self.inchi, self.inchi_key): logger.error("Invalid inchi/inchi_key pair for %s.", self.name) bad = True - if not is_valid_inchi_smiles_pair(self.inchi, self.smiles): + if not cheminfo.is_valid_inchi_smiles_pair(self.inchi, self.smiles): logger.error("Invalid inchi/smiles pair for %s.", self.name) bad = True results = metob.retrieve("compounds", username="*", inchi_key=self.inchi_key) if len(results) == 0: - logger.warning("Could not find inchi_key=%s in database (name=%s), so skipping some tests.", self.inchi_key, self.name) + logger.warning( + "Could not find inchi_key=%s in database (name=%s), so skipping some tests.", + self.inchi_key, + self.name, + ) return bad ref_compound = results[0] if self.formula != ref_compound.formula: @@ -489,8 +512,12 @@ def is_bad(self) -> bool: ref_compound.mono_isotopic_molecular_weight, ) bad = True - if not is_synonym(self.name, ref_compound.synonyms): - logger.error("The entry with inchi_key=%s does not contain name '%s' in database.", self.inchi_key, self.name) + if not cheminfo.is_synonym(self.name, ref_compound.synonyms): + logger.error( + "The entry with inchi_key=%s does not contain name '%s' in 
database.", + self.inchi_key, + self.name, + ) bad = True return bad @@ -508,96 +535,185 @@ def get_empty_refs() -> pd.DataFrame: def df_row_to_ref(data: dict) -> MsmsRef: - """ converts a row from df.to_dict(orient='records') to a MsmsRef instance""" + """converts a row from df.to_dict(orient='records') to a MsmsRef instance""" data_minus_na = {k: v for k, v in data.items() if pd.notna(v)} - if 'spectrum' in data_minus_na: - data_minus_na['spectrum'] = str_to_spectrum(data_minus_na['spectrum']) + if "spectrum" in data_minus_na: + data_minus_na["spectrum"] = str_to_spectrum(data_minus_na["spectrum"]) return MsmsRef(**data_minus_na) def get_num_bad_refs(refs_df: pd.DataFrame) -> int: """Return number of rows that fail validation in refs_df. Info on failures to logger"" """ - return sum([0] + [0 if df_row_to_ref(row).is_bad() else 1 for row in refs_df.to_dict(orient='records')]) - - -def in_rt_mz_ranges(rt, rt_min, rt_max, mz, mz_target, mz_tol): - return dp.within_tolerance(mz, mz_target, mz_tol) and (rt_min <= rt <= rt_max) - - -def extract_most_intense(in_df, rt_min, rt_max, mz_target, mz_tol): - group_cols = ['rt', 'polarity', 'precursor_MZ', 'precursor_intensity', 'collision_energy'] - in_tol_df = in_df.groupby(group_cols).filter(lambda x: in_rt_mz_ranges(x.iloc[0]['rt'], rt_min, rt_max, x.iloc[0]['precursor_MZ'], mz_target, mz_tol)) - precursor_intensity_max = in_tol_df['precursor_intensity'].max() - most_intense_df = in_tol_df.groupby(group_cols).filter(lambda x: precursor_intensity_max == x.iloc[0]['precursor_intensity']) - spectrum = Spectrum(tuple(most_intense_df['mz']), tuple(most_intense_df['i'])) - most_intense = most_intense_df.iloc[0] - return (spectrum, most_intense['rt'], most_intense['precursor_MZ'], most_intense['collision_energy']) - - -def extract_spectrum(h5_file_name, molecule_id, adduct, rt_min, rt_max, mz_tolerance) -> Spectrum: - if matchms.utils.is_valid_inchi(molecule_id): - mol = Chem.MolFromInchi(molecule_id) - elif matchms.utils.is_valid_smiles(molecule_id): - mol = Chem.MolFromSmiles(molecule_id) - else: - raise ValueError(f"molecule_id '{molecule_id}' is not a valid inchi or smiles string") - h5_df = ma_data.df_container_from_metatlas_file(h5_file_name) - parent_mass = Chem.Descriptors.ExactMolWt(mol) - precursor_mz = cheminfo.get_precursor_mz(parent_mass, adduct) - return extract_most_intense(h5_df, rt_min, rt_max, precursor_mz, mz_tolerance) - - -def sheet_row_to_spectrum(input_sheet, input_defs, row_num) -> Spectrum: - row_dict = row_list_to_dict(input_sheet.cells[0].value[row_num], input_defs) - return extract_spectrum(row_dict["h5_file_name"], row_dict["inchi"], row_dict["adduct"], float(row_dict["rt_min"]), float(row_dict["rt_max"]), float(row_dict["mz_tolerance"])) - - -def row_col_to_cell_num(in_sheet: ipysheet.sheet, row_num: int, col_num: int) -> int: - return in_sheet.columns * row_num + col_num - - -def row_list_to_dict(values: List[Any], input_defs: List[Input]) -> Dict[str, Any]: - return dict(zip([x.identifier for x in input_defs], values)) + return sum([0] + [1 if df_row_to_ref(row).is_bad() else 0 for row in refs_df.to_dict(orient="records")]) -def get_invalid_cells(input_sheet: ipysheet.sheet, input_defs: List[Input]) -> List[Tuple[int, str]]: +def get_invalid_cells(sheet: ipysheet.sheet, input_defs: List[Input]) -> List[Tuple[int, str]]: bad_cells = [] - for row_num, values in enumerate(input_sheet.cells[0].value): - row_dict = row_list_to_dict(input_defs, values) - for column_num, current_def in enumerate(input_defs): - try: - is_good = 
 
 
-def get_invalid_cells(input_sheet: ipysheet.sheet, input_defs: List[Input]) -> List[Tuple[int, str]]:
+def get_invalid_cells(sheet: ipysheet.sheet, input_defs: List[Input]) -> List[Tuple[int, str]]:
     bad_cells = []
-    for row_num, values in enumerate(input_sheet.cells[0].value):
-        row_dict = row_list_to_dict(input_defs, values)
-        for column_num, current_def in enumerate(input_defs):
-            try:
-                is_good = current_def.validators(row_dict)
-            except Exception:
-                is_good = False
+    for row_num, values in enumerate(sheet.cells[0].value):
+        row_dict = row_list_to_dict(values, input_defs)
+        for current_def in input_defs:
+            is_good = current_def.validator(row_dict)
             if not is_good:
                 bad_cells.append((row_num, current_def.label))
     return bad_cells
 
 
-def spectrums_from_sheet(input_sheet):
-    for row_num in range(input_sheet.rows):
-        sheet_row_to_spectrum(input_sheet, INPUTS, row_num)
-    pass
+def row_list_to_dict(values: List[str], input_defs: List[Input]) -> InputDict:
+    return to_input_dict(
+        {x.identifier: v if x.basic_type != "numeric" else to_float(v) for x, v in zip(input_defs, values)}
+    )
+
+
+def in_rt_mz_ranges(
+    rtime: float, rt_min: float, rt_max: float, m_z: float, mz_target: float, mz_tol: float
+) -> bool:
+    """
+    Inputs:
+        rtime: measured retention time in minutes
+        rt_min: lower bound of passing RT values
+        rt_max: upper bound of passing RT values
+        m_z: measured mass-charge ratio
+        mz_target: passing within mz_tol of this value
+        mz_tol: mz_tolerance in units of ppm
+    """
+    return dp.within_tolerance(m_z, mz_target, mz_tol * 1e-6) and (rt_min <= rtime <= rt_max)
+
+
+def refs_list_to_df(refs: List[MsmsRef]) -> pd.DataFrame:
+    data: Dict[str, List[Any]] = {k: [] for k in REFS_TYPES}
+    for ref in refs:
+        for key in REFS_TYPES:
+            data[key].append(getattr(ref, key))
+    return pd.DataFrame(data=data)
+
+
+def extract_most_intense(
+    h5_file_name: str, molecule_id: str, adduct: str, rt_min: float, rt_max: float, mz_tol: float
+) -> Tuple[Spectrum, float, float, float, str]:
+    """
+    Inputs:
+        molecule_id: either inchi or smiles string
+        mz_tol: mz_tolerance in units of ppm
+    Returns Spectrum, RT, parent_mass, precursor_mz, collision_energy
+    """
+    mol = cheminfo.inchi_or_smiles_to_molecule(molecule_id)
+    parent_mass = ExactMolWt(mol)
+    precursor_mz = cheminfo.get_precursor_mz(parent_mass, adduct)
+    h5_data = ma_data.df_container_from_metatlas_file(h5_file_name)
+    msms_df = h5_data["ms2_pos"] if cheminfo.is_positive_mode(adduct) else h5_data["ms2_neg"]
+    in_tol_df = msms_df.groupby(GROUP_SPECTRUM_COLS).filter(
+        lambda x: in_rt_mz_ranges(
+            x.iloc[0]["rt"], rt_min, rt_max, x.iloc[0]["precursor_MZ"], precursor_mz, mz_tol
+        )
+    )
+    precursor_intensity_max = in_tol_df["precursor_intensity"].max()
+    most_intense_df = in_tol_df.groupby(GROUP_SPECTRUM_COLS).filter(
+        lambda x: precursor_intensity_max == x.iloc[0]["precursor_intensity"]
+    )
+    most_intense = most_intense_df.iloc[0]
+    return (
+        Spectrum(tuple(most_intense_df["mz"]), tuple(most_intense_df["i"])),
+        most_intense["rt"],
+        parent_mass,
+        float(most_intense["precursor_MZ"]),
+        f"{most_intense['collision_energy']:.1f}eV",
+    )
+
+
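Since `extract_most_intense` is the heart of the refactor, a usage sketch may help: it groups the ms2 table by precursor, keeps only groups inside the RT window and ppm tolerance, then returns the spectrum of the most intense surviving precursor. The file path and RT window below are placeholders:

    from metatlas.tools import add_msms_ref as amr

    # Most intense in-window MSMS spectrum for caffeine, [M+H]+ adduct,
    # RT window 1.5-2.5 minutes, 10 ppm precursor m/z tolerance.
    spectrum, rt, parent_mass, precursor_mz, collision_energy = amr.extract_most_intense(
        "/path/to/run.h5",             # placeholder .h5 file
        "Cn1cnc2c1c(=O)n(C)c(=O)n2C",  # caffeine SMILES
        "[M+H]+",
        1.5,
        2.5,
        10.0,
    )
    print(precursor_mz, rt, collision_energy)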
+def build_msms_ref(in_dict: InputDict) -> MsmsRef:
+    """MsmsRef factory"""
+    ref_keys = MsmsRef().class_trait_names()
+    ref_dict = {k: v for k, v in in_dict.items() if k in ref_keys}
+    try:
+        (
+            ref_dict["spectrum"],
+            _,
+            ref_dict["exact_mass"],
+            ref_dict["precursor_mz"],
+            ref_dict["collision_energy"],
+        ) = extract_most_intense(
+            in_dict["h5_file_name"],
+            in_dict["molecule_id"],
+            in_dict["adduct"],
+            in_dict["rt_min"],
+            in_dict["rt_max"],
+            in_dict["mz_tolerance"],
+        )
+    except IndexError as err:
+        logger.error(f"Matching spectrum not found for {in_dict['name']}")
+        raise err
+    ref_dict["polarity"] = (
+        "positive" if cheminfo.is_positive_mode(str(ref_dict["adduct"])) else "negative"
+    )
+    mol = cheminfo.normalize_molecule(cheminfo.inchi_or_smiles_to_molecule(in_dict["molecule_id"]))
+    ref_dict["inchi"] = Chem.inchi.MolToInchi(mol)
+    ref_dict["smiles"] = Chem.MolToSmiles(mol)
+    ref_dict["inchi_key"] = Chem.inchi.InchiToInchiKey(ref_dict["inchi"])
+    ref_dict["database"] = NEW_REFS_DB_NAME
+    ref_dict["id"] = str(uuid.uuid4())
+    return MsmsRef(**ref_dict)
+
+
+def generate_msms_refs(
+    existing_refs_file_name: Optional[str],
+    output_file_name: str,
+    sheet: ipysheet.sheet,
+    validate_existing: bool = False,
+) -> None:
+    """Create a tab-separated file containing both the existing and the new MSMS refs"""
+    refs_df = get_empty_refs() if existing_refs_file_name is None else read_msms_refs(existing_refs_file_name)
+    logger.info(
+        "Number of existing msms reference records not passing validation is %d", get_num_bad_refs(refs_df)
+    )
+    if validate_existing and get_num_bad_refs(refs_df) > 0:
+        logger.error("All existing MSMS references must pass validation before spectrum extraction")
+        return
+    new_refs = [build_msms_ref(row_list_to_dict(row, INPUTS)) for row in sheet.cells[0].value]
+    new_df = refs_list_to_df(new_refs)
+    out_df = pd.concat([refs_df, new_df])
+    out_df.to_csv(output_file_name, sep="\t", index=False)
+    logger.info("New MSMS references file with %d records written to %s.", len(out_df), output_file_name)
+
+
-def display_inputs_ui(num_rows: int) -> widgets.Box:
+def display_inputs_ui(
+    existing_refs_file_name: Optional[str], output_file_name: str, validate_existing: bool, num_rows: int
+) -> widgets.Box:
     """Display spreadsheet for entering input values"""
+    if existing_refs_file_name is not None and not is_readable_file(existing_refs_file_name):
+        logger.error("%s does not exist or is not readable.", existing_refs_file_name)
+        return widgets.Box()
     col_headers = [x.label for x in INPUTS]
-    input_sheet = ipysheet.sheet(rows=num_rows, columns=len(INPUTS), column_headers=col_headers, column_width=COLUMN_WIDTH, column_resizing=False)
-    ipysheet.easy.cell_range([['']*len(INPUTS)]*num_rows)
-    extract = widgets.Button(description="Extract Spectrums")
+    sheet = ipysheet.sheet(
+        rows=num_rows,
+        columns=len(INPUTS),
+        column_headers=col_headers,
+        column_resizing=False,
+        column_width=COLUMN_WIDTH,
+    )
+    ipysheet.easy.cell_range([[""] * len(INPUTS)] * num_rows)
+    extract = widgets.Button(description="Execute")
+    docs = widgets.Button(description="Help")
     log_box = widgets.Output()
 
     def on_extract_clicked(_):
+        """launch msms refs extraction and export"""
         log_box.clear_output()
-        invalid = get_invalid_cells(input_sheet, [x.validator for x in INPUTS])
         with log_box:
+            invalid = get_invalid_cells(sheet, INPUTS)
             for row_num, col_name in invalid:
-                logger.error("In row %d, invalid value for '%s'.", row_num+1, col_name)
+                logger.error("In row %d, invalid value for '%s'.", row_num + 1, col_name)
             if len(invalid) > 0:
                 logger.error("All inputs must pass validation before spectrum extraction")
                 return
-            spectrums_from_sheet(input_sheet)
+            generate_msms_refs(existing_refs_file_name, output_file_name, sheet, validate_existing)
 
     extract.on_click(on_extract_clicked)
-    return widgets.VBox([input_sheet, extract, log_box])
+
+    def on_docs_clicked(_):
+        """show help information below the sheet widget"""
+        log_box.clear_output()
+        with log_box:
+            print(HELP_TEXT)
+
+    docs.on_click(on_docs_clicked)
+    return widgets.VBox([sheet, widgets.HBox([extract, docs]), log_box])
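The widget can also be driven headlessly, which is how the system-level test added below exercises it: render the sheet, write one row of values in the `INPUTS` column order, then call `generate_msms_refs` directly. A sketch with placeholder paths (the adenosine row mirrors the test data):

    import ipysheet

    from metatlas.tools import add_msms_ref as amr

    amr.display_inputs_ui("/path/existing_refs.tab", "/path/updated_refs.tab", False, 1)
    sheet = ipysheet.current()
    # Column order matches amr.INPUTS: name, inchi/smiles, adduct, instrument,
    # instrument type, frag. method, m/z tol [ppm], RT min, RT max, .h5 file.
    sheet.cells[0].value[0] = [
        "adenosine", "Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O",
        "[M+H]+", "ThermoQTOF-3000", "Orbitrap", "CID", "12", "2.9", "3.3",
        "/path/to/run.h5",  # placeholder
    ]
    amr.generate_msms_refs("/path/existing_refs.tab", "/path/updated_refs.tab", sheet, False)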
+""" cheminformatics related functions """ + +import logging import matchms import numpy as np +from rdkit import Chem + +from metatlas.interfaces.compounds import structure_cleaning as cleaning + +logger = logging.getLogger(__name__) + def get_parent_mass(precursor_mz: float, adduct: str) -> float: """Returns the mass of the input molecule that would result in the supplied precursor_mz and adduct""" - dummy = matchms.Spectrum(mz=np.array([]), - intensities=np.array([]), - metadata={"precursor_mz": precursor_mz, "adduct": adduct}) + dummy = matchms.Spectrum( + mz=np.array([]), intensities=np.array([]), metadata={"precursor_mz": precursor_mz, "adduct": adduct} + ) updated = matchms.filtering.add_parent_mass(dummy) - return updated.metadata['parent_mass'] + return updated.metadata["parent_mass"] def get_precursor_mz(parent_mass: float, adduct: str) -> float: @@ -21,3 +29,101 @@ def get_precursor_mz(parent_mass: float, adduct: str) -> float: multiplier = adducts[adduct]["mass_multiplier"] correction_mass = adducts[adduct]["correction_mass"] return (parent_mass + correction_mass) / multiplier + + +def is_positive_mode(adduct: str) -> bool: + """Returns True if the MS mode for an adduct is positive""" + adducts = matchms.importing.load_adducts_dict() + if adduct not in adducts: + raise KeyError("Adduct '%s' is not supported") + return adducts[adduct]["ionmode"] == "positive" + + +def is_valid_inchi_pair(test_inchi: str, test_inchi_key: str) -> bool: + """True if if test_inchi has the inchi key test_inchi_key""" + if not matchms.utils.is_valid_inchi(test_inchi): + return False + return test_inchi_key == Chem.inchi.InchiToInchiKey(test_inchi) + + +def is_valid_inchi_smiles_pair(test_inchi: str, test_smiles: str) -> bool: + """ + True if test_inchi and test_smiles have the same structure. 
+ """ + mol_from_inchi = Chem.inchi.MolFromInchi(test_inchi) + if mol_from_inchi is None: + return False + mol_from_smiles = Chem.MolFromSmiles(test_smiles) + if mol_from_smiles is None: + return False + return are_equal(mol_from_inchi, mol_from_smiles) + + +def inchi_to_smiles(inchi: str) -> str: + """Convert Inchi to smiles""" + out = Chem.MolToSmiles(Chem.inchi.MolFromInchi(inchi)) + if out is None: + raise ValueError(f"'{inchi}' is not a valid Inchi") + return out + + +def smiles_to_inchi(smiles: str) -> str: + """Convert smiles to Inchi""" + out = Chem.inchi.MolFromInchi(Chem.MolFromSmiles(smiles)) + if out is None: + raise ValueError(f"'{smiles}' is not a valid smiles") + return out + + +def inchi_or_smiles_to_molecule(molecule_id: str) -> Chem.rdchem.Mol: + """Convert Inchi or smiles to rdkit Mol""" + out = Chem.inchi.MolFromInchi(molecule_id) or Chem.MolFromSmiles(molecule_id) + if out is None: + raise ValueError(f"'{molecule_id}' is not a valid Inchi or smiles") + return out + + +def inchi_or_smiles_to_inchi(molecule_id: str) -> str: + """Inchi or smiles string to smiles string""" + out = Chem.inchi.MolToInchi(inchi_or_smiles_to_molecule(molecule_id)) + if out is None: + raise ValueError(f"'{molecule_id}' is not a valid Inchi or smiles") + return out + + +def inchi_or_smiles_to_smiles(molecule_id: str) -> str: + """Inchi or smiles string to smiles string""" + out = Chem.MolToSmiles(inchi_or_smiles_to_molecule(molecule_id)) + if out is None: + raise ValueError(f"'{molecule_id}' is not a valid Inchi or smiles") + return out + + +def normalize_molecule(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol: + """Removes salt and neutralizes charges""" + desalted, _ = cleaning.desalt(mol) + return cleaning.NeutraliseCharges(desalted)[0] + + +def are_equal(molecule1: Chem.rdchem.Mol, molecule2: Chem.rdchem.Mol) -> bool: + """True if both molecules are substructures of each other""" + return molecule1.HasSubstructMatch(molecule2) and molecule2.HasSubstructMatch(molecule1) + + +def is_synonym(name: str, synonym_string: str) -> bool: + """ + Inputs: + name: string to check for within synonym_string + synonym_string: string with /// between names + Returns True if case insensitive match of name to full name in synonym_string + """ + return name.lower() in [x.lower() for x in synonym_string.split("///")] + + +def valid_adduct(value: str) -> bool: + """ + True if the value is an adduct listed supported by the matchms package + This is not a comprehensive list, so it will return False for some uncommon adducts + """ + adducts = matchms.importing.load_adducts_dict() + return value in adducts diff --git a/notebooks/reference/Add_MSMS_Reference.ipynb b/notebooks/reference/Add_MSMS_Reference.ipynb index 4b6cf06a..f2fafcc4 100644 --- a/notebooks/reference/Add_MSMS_Reference.ipynb +++ b/notebooks/reference/Add_MSMS_Reference.ipynb @@ -1,18 +1,8 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "d0440386-4864-4a26-a70d-910b4c1e99d4", - "metadata": {}, - "source": [ - "# Parameters\n", - "\n", - "The next code block sets parameters that are used throughout the remainder of the notebook." 
- ] - }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4a07e86e-ff53-4cf6-8f61-fad398f9441c", "metadata": { "tags": [ @@ -30,10 +20,11 @@ "metatlas_repo_path = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas\"\n", "\n", "# absolute path of the input file that contains the msms refs to want to append to\n", + "# or None to generate an msms refs file that only contains new entries\n", "input_file_name = \"/global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab\"\n", "\n", "# should the data validation be run on the input file?\n", - "# if True, you won't be able to export a file if input_file_name points to a file with any bad rows\n", + "# if True, you won't be able to export a file if input_file_name has any rows not passing validation\n", "validate_input_file = False\n", "\n", "# how many spectrum to import\n", @@ -42,10 +33,6 @@ "# absolute path of where you want this notebook to store the new MSMS refs\n", "output_file_name = \"/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects/my_msms_refs.tab\"\n", "\n", - "# maximum number of CPUs to use\n", - "# when running on jupyter.nersc.gov, you are not allowed to set this above 4\n", - "max_cpus = 4\n", - "\n", "# Threshold for how much status information metatlas functions print in the notebook\n", "# levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'\n", "log_level = \"INFO\"" @@ -53,64 +40,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "2106bcab-ecfa-47cb-91e3-682ad57bd4ee", "metadata": {}, "outputs": [], - "source": [ - "metatlas_repo_path = '/src'\n", - "input_file_name = \"/work/short_refs.tab\"\n", - "output_file_name = \"/work/updated_refs.tab\"\n", - "num_rows_to_add = 2" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "b56b1e89-bd9c-4c11-896f-576c14a99c20", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021-10-01 03:50:15 \u001b[32mINFO \u001b[0m Running on git commit: b'a8111d3ca4d874cb29e6b97b4a164e078363ad8f'\n" - ] - }, - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021-10-01 03:50:15 \u001b[32mINFO \u001b[0m NERSC=False\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "27a4ea30b18d4fc3bb2d926b34a7bafa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Sheet(cells=(Cell(column_end=9, column_start=0, row_end=1, row_start=0, squeeze_column=False, s…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# pylint: disable=wrong-import-position,import-error\n", "import logging # noqa: E402\n", @@ -145,25 +86,11 @@ " logging.critical(\"A newer version of metatlas_repo is required to use this notebook.\")\n", " raise ImportError from err\n", "from metatlas.tools import notebook # noqa: E402\n", - "\n", - "\n", - "notebook.setup(log_level)\n", "from metatlas.tools import add_msms_ref as amr # noqa: E402\n", "\n", - "\n", - "import os\n", - "os.chdir('/work')\n", - "\n", - "amr.display_inputs_ui(num_rows_to_add)" + "notebook.setup(log_level)\n", + "amr.display_inputs_ui(input_file_name, output_file_name, validate_input_file, num_rows_to_add)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"b08bbc83-e9cb-495b-9689-184e5b8e57c1", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/noxfile.py b/noxfile.py index cbcb8daa..51cb09a2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -26,6 +26,7 @@ "metatlas/io/write_utils.py", "metatlas/datastructures/metatlas_dataset.py", "metatlas/tools/add_msms_ref.py", + "metatlas/tools/cheminfo.py", "metatlas/tools/environment.py", "metatlas/tools/logging.py", "metatlas/tools/notebook.py", diff --git a/pyproject.toml b/pyproject.toml index 4417e2cc..a9c3d60a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ module = [ "IPython.*", "ipywidgets.*", "labkey.*", + "matchms.*", "matplotlib.*", "metatlas.h5_query.*", "metatlas.helpers.*", diff --git a/tests/system/test_add_msms_ref.py b/tests/system/test_add_msms_ref.py new file mode 100644 index 00000000..fff24110 --- /dev/null +++ b/tests/system/test_add_msms_ref.py @@ -0,0 +1,34 @@ +# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long, duplicate-code + +from . import utils + + +def test_add_msms_ref_by_line01(tmp_path): + image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.4" + expected = {} + expected[ + str(tmp_path / "post_sed.tab") + ] = """database id name spectrum decimal precursor_mz polarity adduct fragmentation_method collision_energy instrument instrument_type formula exact_mass inchi_key inchi\tsmiles +metatlas 8257da2871be46cfabe10827aaf8d288 L-threonic acid [[55.7172, 56.1914, 58.0045, 59.0093, 59.0124, 71.0124, 72.9917, 73.0329, 75.0073, 76.0107, 87.0073, 87.212, 89.023, 90.2418, 117.018, 135.029, 135.125, 135.148, 136.032], [6911.62, 9049.5, 11019.7, 7863.62, 293886.0, 204550.0, 763526.0, 7819.21, 4151310.0, 42595.5, 47822.2, 8750.14, 448975.0, 8417.63, 118314.0, 1445940.0, 8032.76, 9857.3, 38018.3]] 4 135.03 negative C4H8O5 136.037173356 JPIJQSOTBSSVTP-STHAYSLISA-N InChI=1S/C4H8O5/c5-1-2(6)3(7)4(8)9/h2-3,5-7H,1H2,(H,8,9)/t2-,3+/m0/s1 +NorthernLabAddition:NoDB REPLACED_UUID adenosine [[55.01269, 57.02821, 66.78931, 69.02660, 71.00568, 73.02122, 78.05609, 80.87838, 85.02310, 87.03920, 89.87952, 89.94361, 104.36960, 107.86694, 115.03886, 119.03568, 133.04977, 136.06194, 170.63907, 268.10406], [186032.312, 313596.656, 99986.633, 100581.734, 145571.344, 267841.375, 126804.180, 109123.148, 375497.844, 116243.438, 103804.906, 103371.594, 139217.422, 110503.422, 296771.406, 123998.859, 488822.156, 54508756.000, 102044.547, 18234916.000]]\t4 268.1037292480469 positive [M+H]+ CID 23.3eV ThermoQTOF-3000 Orbitrap C10H13N5O4 267.096753896 OIRDTQYFTABQOQ-KQYNXXCUSA-N InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1 Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O""" + command = """head -2 /global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab > /out/short_refs.tab && \ + jq -M '(.cells[] | select(.source[] | contains("display_inputs_ui")).source) \ + += ["\n", + "import ipysheet\n", + "sheet = ipysheet.current()\n", + "input_list = [\\"adenosine\\",\\"InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1\\",\\"[M+H]+\\",\\"ThermoQTOF-3000\\", \\"Orbitrap\\", \\"CID\\", \\"12\\", \\"2.9\\", \\"3.3\\", 
\\"/project/projectdirs/metatlas/raw_data/akuftin/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583/20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\\"]\n", + "sheet.cells[0].value[0] = input_list\n", + "amr.generate_msms_refs(input_file_name, output_file_name, sheet, validate_input_file)"]' \ + /src/notebooks/reference/Add_MSMS_Reference.ipynb > /out/Remove.ipynb && \ + papermill \ + -p metatlas_repo_path /src \ + -p input_file_name /out/short_refs.tab \ + -p output_file_name /out/updated_refs.tab \ + -p num_rows_to_add 1 \ + /out/Remove.ipynb \ + /out/Remove-done.ipynb && \ + sed 's%^NorthernLabAddition:NoDB\t[a-z0-9-]*%NorthernLabAddition:NoDB\tREPLACED_UUID%' < /out/updated_refs.tab > /out/post_sed.tab + """ + utils.exec_docker(image, command, tmp_path) + assert utils.num_files_in(tmp_path) == 5 + utils.assert_files_match(expected) diff --git a/tests/unit/test_add_msms_ref.py b/tests/unit/test_add_msms_ref.py index 6311cef8..17c4f6e1 100644 --- a/tests/unit/test_add_msms_ref.py +++ b/tests/unit/test_add_msms_ref.py @@ -1,12 +1,9 @@ """ unit tests for add_msms_refs module """ # pylint: disable=missing-function-docstring,line-too-long -import json import pytest import traitlets -from rdkit import Chem - from metatlas.tools import add_msms_ref INCHI = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1" @@ -24,7 +21,7 @@ def tests_msms_ref01(mocker, compound): precursor_mz=251.101839276, polarity="negative", adduct="[M-H]+", - fragmentation_method="cid", + fragmentation_method="CID", collision_energy="60eV", instrument="ThermoTOF-3000", instrument_type="LC-ESI-QTOF", @@ -46,7 +43,7 @@ def tests_msms_ref02(mocker, compound): precursor_mz=251.101839276, polarity="negative", adduct="[M-H]+", - fragmentation_method="cid", + fragmentation_method="CID", collision_energy="60eV", instrument="ThermoTOF-3000", instrument_type="LC-ESI-QTOF", @@ -57,46 +54,6 @@ def tests_msms_ref02(mocker, compound): ) -def test_is_inchi01(): - assert add_msms_ref.is_inchi(INCHI) - assert not add_msms_ref.is_inchi("f{INCHI}BLAH") - assert not add_msms_ref.is_inchi("") - assert not add_msms_ref.is_inchi("InChI=") - - -def test_is_valid_inchi_pair(): - assert add_msms_ref.is_valid_inchi_pair(INCHI, INCHI_KEY) - assert not add_msms_ref.is_valid_inchi_pair("", INCHI_KEY) - assert not add_msms_ref.is_valid_inchi_pair(INCHI, "") - assert not add_msms_ref.is_valid_inchi_pair(f"{INCHI}foobar!", INCHI_KEY) - assert not add_msms_ref.is_valid_inchi_pair(INCHI, f"{INCHI_KEY}foobar!") - - -def test_is_valid_inchi_smiles_pair(): - assert add_msms_ref.is_valid_inchi_smiles_pair(INCHI, SMILES) - assert not add_msms_ref.is_valid_inchi_smiles_pair("", SMILES) - assert not add_msms_ref.is_valid_inchi_smiles_pair(INCHI, "") - assert not add_msms_ref.is_valid_inchi_smiles_pair(f"{INCHI}foobar!", SMILES) - assert not add_msms_ref.is_valid_inchi_smiles_pair(INCHI, f"{SMILES}foobar!") - - -def test_are_equal(): - mol1 = Chem.inchi.MolFromInchi(INCHI) - mol2 = Chem.inchi.MolFromInchi("InChI=1S/H2O/h1H2") - assert add_msms_ref.are_equal(mol1, mol1) - assert add_msms_ref.are_equal(mol2, mol2) - assert not add_msms_ref.are_equal(mol1, mol2) - assert not add_msms_ref.are_equal(mol2, mol1) - - -def test_is_synonym(): - assert add_msms_ref.is_synonym("foobar", "FOO///bar///FooZoo///FooBar") - assert add_msms_ref.is_synonym("foobar", "FOOBAR") - assert 
add_msms_ref.is_synonym("FooBar", "foobar///bar///FooZoo///FooBeeear") - assert not add_msms_ref.is_synonym("foobar", "") - assert not add_msms_ref.is_synonym("FooBarz", "foobar///bar///FooZoo///FooBeeear") - - def test_spectrum01(): add_msms_ref.Spectrum(intensities=[1.2, 1, 4], mzs=[123, 145, 256.04]) @@ -116,12 +73,61 @@ def test_spectrum02(): add_msms_ref.Spectrum(intensities=[1, 1], mzs=[123, 22]) -def test_str_to_spectrum(): +def test_str_to_spectrum(caplog): spectrum1 = add_msms_ref.str_to_spectrum("[[123.456,145.789],[1.0,2.2]]") assert spectrum1.mzs == [123.456, 145.789] assert spectrum1.intensities == [1.0, 2.2] spectrum2 = add_msms_ref.str_to_spectrum("[ [123.456, 145.789], [1.0, 2.2] ]") assert spectrum2.mzs == [123.456, 145.789] assert spectrum2.intensities == [1.0, 2.2] - with pytest.raises(json.JSONDecodeError): - add_msms_ref.str_to_spectrum("foobar") + spectrum3 = add_msms_ref.str_to_spectrum("[ [], [] ]") + assert spectrum3.mzs == [] + assert spectrum3.intensities == [] + spectrum4 = add_msms_ref.str_to_spectrum("foobar") + assert spectrum4.mzs == [] + assert spectrum4.intensities == [] + assert "Cannot convert 'foobar' to a Spectrum object, setting to empty spectrum" in caplog.text + + +def test_has_missing_fields01(): + ref = add_msms_ref.MsmsRef( + id="abcdefghijklmnop", + database="my_db", + name="2'-deoxyadenosine", + spectrum=add_msms_ref.Spectrum(intensities=[1, 1.4, 2], mzs=[100, 101, 555]), + decimal=4, + precursor_mz=251.101839276, + polarity="negative", + adduct="[M-H]+", + fragmentation_method="CID", + collision_energy="60eV", + instrument="ThermoTOF-3000", + instrument_type="LC-ESI-QTOF", + formula="C10H13N5O3", + exact_mass=251.101839276, + inchi_key=INCHI_KEY, + inchi=INCHI, + ) + assert not ref.has_missing_fields() + + +def test_has_missing_fields02(caplog): + ref = add_msms_ref.MsmsRef( + database="my_db", + name="2'-deoxyadenosine", + spectrum=add_msms_ref.Spectrum(intensities=[1, 1.4, 2], mzs=[100, 101, 555]), + decimal=4, + precursor_mz=251.101839276, + polarity="negative", + adduct="[M-H]+", + fragmentation_method="CID", + collision_energy="60eV", + instrument="ThermoTOF-3000", + instrument_type="LC-ESI-QTOF", + formula="C10H13N5O3", + exact_mass=251.101839276, + inchi_key=INCHI_KEY, + inchi=INCHI, + ) + assert ref.has_missing_fields() + assert "No 'id' field in" in caplog.text diff --git a/tests/unit/test_cheminfo.py b/tests/unit/test_cheminfo.py new file mode 100644 index 00000000..077b0dd8 --- /dev/null +++ b/tests/unit/test_cheminfo.py @@ -0,0 +1,53 @@ +# pylint: disable=missing-function-docstring, missing-module-docstring, line-too-long + +import matchms + +from rdkit import Chem + +from metatlas.tools import cheminfo + +INCHI = "InChI=1S/C10H13N5O3/c11-9-8-10(13-3-12-9)15(4-14-8)7-1-5(17)6(2-16)18-7/h3-7,16-17H,1-2H2,(H2,11,12,13)/t5-,6+,7+/m0/s1" +INCHI_KEY = "OLXZPDWKRNYJJZ-RRKCRQDMSA-N" +SMILES = "C1[C@@H]([C@@H](CO)O[C@H]1n1cnc2c(N)ncnc12)O" + + +def test_get_parent_mass01(): + adducts = matchms.importing.load_adducts_dict() + original_parent = 100 + for name in adducts: + pre = cheminfo.get_precursor_mz(original_parent, name) + parent = cheminfo.get_parent_mass(pre, name) + assert abs(original_parent - parent) < 1e-7 + + +def test_is_valid_inchi_pair(): + assert cheminfo.is_valid_inchi_pair(INCHI, INCHI_KEY) + assert not cheminfo.is_valid_inchi_pair("", INCHI_KEY) + assert not cheminfo.is_valid_inchi_pair(INCHI, "") + assert not cheminfo.is_valid_inchi_pair(f"{INCHI}foobar!", INCHI_KEY) + assert not cheminfo.is_valid_inchi_pair(INCHI, 
f"{INCHI_KEY}foobar!") + + +def test_is_valid_inchi_smiles_pair(): + assert cheminfo.is_valid_inchi_smiles_pair(INCHI, SMILES) + assert not cheminfo.is_valid_inchi_smiles_pair("", SMILES) + assert not cheminfo.is_valid_inchi_smiles_pair(INCHI, "") + assert not cheminfo.is_valid_inchi_smiles_pair(f"{INCHI}foobar!", SMILES) + assert not cheminfo.is_valid_inchi_smiles_pair(INCHI, f"{SMILES}foobar!") + + +def test_are_equal(): + mol1 = Chem.inchi.MolFromInchi(INCHI) + mol2 = Chem.inchi.MolFromInchi("InChI=1S/H2O/h1H2") + assert cheminfo.are_equal(mol1, mol1) + assert cheminfo.are_equal(mol2, mol2) + assert not cheminfo.are_equal(mol1, mol2) + assert not cheminfo.are_equal(mol2, mol1) + + +def test_is_synonym(): + assert cheminfo.is_synonym("foobar", "FOO///bar///FooZoo///FooBar") + assert cheminfo.is_synonym("foobar", "FOOBAR") + assert cheminfo.is_synonym("FooBar", "foobar///bar///FooZoo///FooBeeear") + assert not cheminfo.is_synonym("foobar", "") + assert not cheminfo.is_synonym("FooBarz", "foobar///bar///FooZoo///FooBeeear") From 372c4291ba165c09cf05eab65dd5bf8190816f78 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 5 Oct 2021 17:17:17 -0700 Subject: [PATCH 168/177] linting fixes, set open()s to utf8 --- docker/Dockerfile.msms_refs | 35 --------------------- metatlas/datastructures/metatlas_dataset.py | 6 ++-- metatlas/io/targeted_output.py | 4 +-- metatlas/io/write_utils.py | 2 +- metatlas/tools/environment.py | 4 +-- metatlas/tools/notebook.py | 2 +- metatlas/tools/predict_rt.py | 2 +- tests/system/utils.py | 2 +- tests/unit/test_environment.py | 12 +++---- 9 files changed, 17 insertions(+), 52 deletions(-) delete mode 100644 docker/Dockerfile.msms_refs diff --git a/docker/Dockerfile.msms_refs b/docker/Dockerfile.msms_refs deleted file mode 100644 index 81bd3f5b..00000000 --- a/docker/Dockerfile.msms_refs +++ /dev/null @@ -1,35 +0,0 @@ -FROM python:3.8-slim-bullseye - -# https://portal.nersc.gov/cfs/m2650/metatlas/test_data -# serves from /global/cfs/cdirs/m2650/www/metatlas/test_data -ARG BASE_DATA_URL=https://portal.nersc.gov/cfs/m2650/metatlas/test_data/ci01 -ARG REFS_DIR=/global/project/projectdirs/metatlas/projects/spectral_libraries - -ENV METATLAS_LOCAL=True - -EXPOSE 8888 - -RUN apt-get update && \ - apt-get install -y \ - libxrender1 \ - nodejs \ - npm && \ - rm -rf /var/lib/apt/lists/* - - -COPY requirements.txt /requirements.txt - -RUN pip install --quiet -r requirements.txt - -RUN mkdir -p /io /src /work $REFS_DIR - -ADD $BASE_DATA_URL/msms_refs_v3.tab $REFS_DIR/ - -ADD $BASE_DATA_URL/meta_atlas.sqlite3 /work/root_workspace.db - -RUN mkdir -p /root/.local/share/jupyter/kernels/metatlas-targeted -COPY kernel.json /root/.local/share/jupyter/kernels/metatlas-targeted/kernel.json - -WORKDIR /work - -CMD ["/usr/local/bin/jupyter", "lab", "--ip=0.0.0.0", "--allow-root", "--ServerApp.token=''", "--ServerApp.root_dir=/"] diff --git a/metatlas/datastructures/metatlas_dataset.py b/metatlas/datastructures/metatlas_dataset.py index 3796b4fa..4c2164a8 100644 --- a/metatlas/datastructures/metatlas_dataset.py +++ b/metatlas/datastructures/metatlas_dataset.py @@ -472,8 +472,8 @@ def store_all_groups(self, exist_ok: bool = False) -> None: try: if overlap: raise ValueError( - "Not saving groups as you have already saved groups with these names: %s." - % ", ".join(overlap), + "Not saving groups as you have already saved groups" + f'with these names: {", ".join(overlap)}.' 
                 )
         except ValueError as err:
             logger.exception(err)
             raise
@@ -1191,7 +1191,7 @@ def quoted_string_list(strings: List[str]) -> str:
 
 def append_inverse(in_list: List[str], polarity: Polarity) -> List[str]:
     """appends short version of the inverse of polarity and returns the list"""
     inverse = {"positive": "NEG", "negative": "POS"}
-    return in_list + [inverse[polarity]] if polarity in inverse.keys() else in_list
+    return in_list + [inverse[polarity]] if polarity in inverse else in_list
 
 
 def remove_items(edit_list: List[str], remove_list: List[str], ignore_case: bool = True) -> List[str]:
diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py
index e518e244..d5aa170c 100644
--- a/metatlas/io/targeted_output.py
+++ b/metatlas/io/targeted_output.py
@@ -253,8 +253,8 @@ def get_spectra_strings(data, max_pre_intensity, min_mz, max_mz, intensity_fract
     mz_list, intensity_list = get_spectra(
         data, max_pre_intensity, min_mz, max_mz, intensity_fraction, scale_intensity
     )
-    mz_str = str(["%.2f" % x for x in mz_list]).replace("'", "")
-    intensity_str = str(["%d" % x for x in intensity_list]).replace("'", "")
+    mz_str = str([f"{x:.2f}" for x in mz_list]).replace("'", "")
+    intensity_str = str([int(x) for x in intensity_list]).replace("'", "")
     spectra_str = str([mz_str, intensity_str]).replace("'", "")
     name = data["identification"].name
     return {"name": name, "spectrum": spectra_str, "mz": mz_str, "intensity": intensity_str}
diff --git a/metatlas/io/write_utils.py b/metatlas/io/write_utils.py
index 2845dffb..595f3550 100644
--- a/metatlas/io/write_utils.py
+++ b/metatlas/io/write_utils.py
@@ -61,7 +61,7 @@ def raise_on_diff(dataframe, file_path, description, **kwargs):
         logger.info("Data in %s is the same as %s.", description, file_path)
     else:
         try:
-            raise ValueError("Data in %s is not the same as %s."
% (description, file_path)) + raise ValueError(f"Data in {description} is not the same as {file_path}.") except ValueError as err: logger.exception(err) raise diff --git a/metatlas/tools/environment.py b/metatlas/tools/environment.py index 4f11451f..3181c319 100644 --- a/metatlas/tools/environment.py +++ b/metatlas/tools/environment.py @@ -139,7 +139,7 @@ def create_notebook_with_parameters(source, dest, parameters): dest: path of destination notebook parameters: dict with name of parameter in key and new value in value """ - with open(source) as source_fh: + with open(source, encoding="utf8") as source_fh: data = json.load(source_fh) eq_pat = re.compile(r"^([^#= ]+)\s*=.+$") param_source = data["cells"][1]["source"] @@ -151,7 +151,7 @@ def create_notebook_with_parameters(source, dest, parameters): new_value = parameters[param_name] out_value = f"'{new_value}'" if isinstance(new_value, str) else new_value param_source[i] = f"{param_name} = {out_value}\n" - with open(dest, "w") as out_fh: + with open(dest, "w", encoding="utf8") as out_fh: json.dump(data, out_fh) diff --git a/metatlas/tools/notebook.py b/metatlas/tools/notebook.py index 145e57de..84a4109a 100644 --- a/metatlas/tools/notebook.py +++ b/metatlas/tools/notebook.py @@ -76,7 +76,7 @@ def create_notebook(input_file_name, output_file_name, parameters, injection_cel parameters: dict where keys are LHS of assignment and values are RHS of assignment injection_cell: zero-indexed number of cell to overwrite with the parameters """ - with open(input_file_name, "r") as in_fh: + with open(input_file_name, "r", encoding="utf8") as in_fh: notebook = json.load(in_fh) notebook["cells"][injection_cell]["source"] = [assignment_string(k, v) for k, v in parameters.items()] with open(output_file_name, "w", encoding="utf-8") as out_fh: diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py index a095f78e..c8c16987 100644 --- a/metatlas/tools/predict_rt.py +++ b/metatlas/tools/predict_rt.py @@ -438,7 +438,7 @@ def write_models(file_name, linear_model, poly_model, groups, atlas): groups: list of groups used in model generation atlas: QC atlas """ - with open(file_name, "w") as out_fh: + with open(file_name, "w", encoding="utf8") as out_fh: for model in [linear_model, poly_model]: out_fh.write(f"{model.sk_model.set_params()}\n") out_fh.write(f"{model}\n") diff --git a/tests/system/utils.py b/tests/system/utils.py index e66d2a9c..10b695c1 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -17,7 +17,7 @@ def assert_files_match(expected): returns None """ for path, contents in expected.items(): - with open(path, "r") as handle: + with open(path, "r", encoding="utf8") as handle: expected_lines = contents.split("\n") num = None for num, line in enumerate(handle.readlines()): diff --git a/tests/unit/test_environment.py b/tests/unit/test_environment.py index 1b7d25c2..8adb9e78 100644 --- a/tests/unit/test_environment.py +++ b/tests/unit/test_environment.py @@ -21,12 +21,12 @@ def test_create_notebook_with_parameters01(): }, ] } - with open("test.json", "w") as out_fh: + with open("test.json", "w", encoding="utf8") as out_fh: json.dump(orig_data, out_fh) environment.create_notebook_with_parameters( "test.json", "out.json", {"param1": 1, "param2": ["foo", "bar"], "param3": "My_Exp_Name"} ) - with open("out.json") as in_fh: + with open("out.json", encoding="utf8") as in_fh: data = json.load(in_fh) assert data["cells"][1]["source"][1] == "param1 = 1\n" assert data["cells"][1]["source"][3] == "param2 = ['foo', 'bar']\n" @@ -48,10 
+48,10 @@ def test_create_notebook_with_parameters02(): }, ] } - with open("test.json", "w") as out_fh: + with open("test.json", "w", encoding="utf8") as out_fh: json.dump(orig_data, out_fh) environment.create_notebook_with_parameters("test.json", "out.json", {}) - with open("out.json") as in_fh: + with open("out.json", encoding="utf8") as in_fh: data = json.load(in_fh) assert data["cells"][1]["source"][1] == "param1 = 0\n" assert data["cells"][1]["source"][3] == "param2 = []\n" @@ -62,10 +62,10 @@ def test_create_notebook_with_parameters03(): orig_data = { "cells": [None, {"source": ["# this is a comment\n", "param1 = True\n", "\n", "param2 = None\n"]}] } - with open("test.json", "w") as out_fh: + with open("test.json", "w", encoding="utf8") as out_fh: json.dump(orig_data, out_fh) environment.create_notebook_with_parameters("test.json", "out.json", {"param1": None, "param2": True}) - with open("out.json") as in_fh: + with open("out.json", encoding="utf8") as in_fh: data = json.load(in_fh) assert data["cells"][1]["source"][1] == "param1 = None\n" assert data["cells"][1]["source"][3] == "param2 = True\n" From aa4f5b86bbf60bbc53371eee4dc2cae146271fd3 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 5 Oct 2021 21:36:11 -0700 Subject: [PATCH 169/177] fix whitespace issue in test_add_msms_ref --- tests/system/test_add_msms_ref.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/test_add_msms_ref.py b/tests/system/test_add_msms_ref.py index fff24110..b061453e 100644 --- a/tests/system/test_add_msms_ref.py +++ b/tests/system/test_add_msms_ref.py @@ -9,7 +9,7 @@ def test_add_msms_ref_by_line01(tmp_path): expected[ str(tmp_path / "post_sed.tab") ] = """database id name spectrum decimal precursor_mz polarity adduct fragmentation_method collision_energy instrument instrument_type formula exact_mass inchi_key inchi\tsmiles -metatlas 8257da2871be46cfabe10827aaf8d288 L-threonic acid [[55.7172, 56.1914, 58.0045, 59.0093, 59.0124, 71.0124, 72.9917, 73.0329, 75.0073, 76.0107, 87.0073, 87.212, 89.023, 90.2418, 117.018, 135.029, 135.125, 135.148, 136.032], [6911.62, 9049.5, 11019.7, 7863.62, 293886.0, 204550.0, 763526.0, 7819.21, 4151310.0, 42595.5, 47822.2, 8750.14, 448975.0, 8417.63, 118314.0, 1445940.0, 8032.76, 9857.3, 38018.3]] 4 135.03 negative C4H8O5 136.037173356 JPIJQSOTBSSVTP-STHAYSLISA-N InChI=1S/C4H8O5/c5-1-2(6)3(7)4(8)9/h2-3,5-7H,1H2,(H,8,9)/t2-,3+/m0/s1 +metatlas 8257da2871be46cfabe10827aaf8d288 L-threonic acid [[55.7172, 56.1914, 58.0045, 59.0093, 59.0124, 71.0124, 72.9917, 73.0329, 75.0073, 76.0107, 87.0073, 87.212, 89.023, 90.2418, 117.018, 135.029, 135.125, 135.148, 136.032], [6911.62, 9049.5, 11019.7, 7863.62, 293886.0, 204550.0, 763526.0, 7819.21, 4151310.0, 42595.5, 47822.2, 8750.14, 448975.0, 8417.63, 118314.0, 1445940.0, 8032.76, 9857.3, 38018.3]] 4 135.03 negative C4H8O5 136.037173356 JPIJQSOTBSSVTP-STHAYSLISA-N InChI=1S/C4H8O5/c5-1-2(6)3(7)4(8)9/h2-3,5-7H,1H2,(H,8,9)/t2-,3+/m0/s1\t NorthernLabAddition:NoDB REPLACED_UUID adenosine [[55.01269, 57.02821, 66.78931, 69.02660, 71.00568, 73.02122, 78.05609, 80.87838, 85.02310, 87.03920, 89.87952, 89.94361, 104.36960, 107.86694, 115.03886, 119.03568, 133.04977, 136.06194, 170.63907, 268.10406], [186032.312, 313596.656, 99986.633, 100581.734, 145571.344, 267841.375, 126804.180, 109123.148, 375497.844, 116243.438, 103804.906, 103371.594, 139217.422, 110503.422, 296771.406, 123998.859, 488822.156, 54508756.000, 102044.547, 18234916.000]]\t4 268.1037292480469 positive [M+H]+ CID 23.3eV ThermoQTOF-3000 Orbitrap 
C10H13N5O4 267.096753896 OIRDTQYFTABQOQ-KQYNXXCUSA-N InChI=1S/C10H13N5O4/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(18)6(17)4(1-16)19-10/h2-4,6-7,10,16-18H,1H2,(H2,11,12,13)/t4-,6-,7-,10-/m1/s1 Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O""" command = """head -2 /global/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v3.tab > /out/short_refs.tab && \ jq -M '(.cells[] | select(.source[] | contains("display_inputs_ui")).source) \ From d8a1ccbf7a00c06152383ea360bc516c854799e8 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 5 Oct 2021 21:36:43 -0700 Subject: [PATCH 170/177] upgrade test dependencies --- noxfile.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/noxfile.py b/noxfile.py index 51cb09a2..ea39bec7 100644 --- a/noxfile.py +++ b/noxfile.py @@ -46,14 +46,14 @@ pytest_deps = [ "attrs==21.2.0", - "coverage==5.5", + "coverage==6.0", "iniconfig==1.1.1", - "packaging==20.9", + "packaging==21.0", "pluggy==0.13.1", "py==1.10.0", "pyparsing==2.4.7", "pytest==6.2.4", - "pytest-cov==2.11.1", + "pytest-cov==3.0.0", "pytest-mock==3.6.1", "toml==0.10.2", ] @@ -68,15 +68,15 @@ ] pylint_deps = [ - "pylint==2.8.2", - "pytest==6.2.4", # so "import pytest" doesn't get reported + "pylint==2.11.1", + "pytest==6.2.5", # so "import pytest" doesn't get reported ] nbqa_deps = [ "nbqa==0.8.1", "tokenize-rt==4.1.0", "importlib-metadata==4.0.1", - "astroid==2.5.6", + "astroid==2.8.0", "wrapt==1.12.1", "lazy_object_proxy==1.6.0", "isort==5.8.0", @@ -84,9 +84,9 @@ flake8_deps = [ "flake8==3.9.2", - "flake8-bugbear==21.4.3", + "flake8-bugbear==21.9.2", "flake8-builtins==1.5.3", - "flake8-comprehensions==3.5.0", + "flake8-comprehensions==3.6.1", ] nox.options.error_on_external_run = True From 28e82d7ec50204e35b99ba4f83931c4e3c1e72db Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Tue, 5 Oct 2021 21:37:27 -0700 Subject: [PATCH 171/177] minor linting and style fixes --- metatlas/tools/add_msms_ref.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/metatlas/tools/add_msms_ref.py b/metatlas/tools/add_msms_ref.py index 15e53f09..aad01003 100644 --- a/metatlas/tools/add_msms_ref.py +++ b/metatlas/tools/add_msms_ref.py @@ -150,6 +150,7 @@ def __init__(self, identifier, label, basic_type, validator): class InputDict(TypedDict, total=False): """Type for holding one row from input sheet""" + name: str molecule_id: str adduct: str @@ -640,11 +641,9 @@ def build_msms_ref(in_dict: InputDict) -> MsmsRef: in_dict["mz_tolerance"], ) except IndexError as err: - logger.error(f"Matching spectrum not found for {in_dict['name']}") + logger.error("Matching spectrum not found for %s.", in_dict["name"]) raise err - ref_dict["polarity"] = ( - "positive" if cheminfo.is_positive_mode(str(ref_dict["adduct"])) else "negative" - ) + ref_dict["polarity"] = "positive" if cheminfo.is_positive_mode(str(ref_dict["adduct"])) else "negative" mol = cheminfo.normalize_molecule(cheminfo.inchi_or_smiles_to_molecule(in_dict["molecule_id"])) ref_dict["inchi"] = Chem.inchi.MolToInchi(mol) ref_dict["smiles"] = Chem.MolToSmiles(mol) From 275ee985324722c5f2cf0b66da0763fe9584ed37 Mon Sep 17 00:00:00 2001 From: Will Holtz Date: Wed, 6 Oct 2021 17:16:43 -0700 Subject: [PATCH 172/177] RT predict system test back to not saving atlases --- tests/system/test_rt_predict.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py index d680f7fc..9e422bec 100644 --- a/tests/system/test_rt_predict.py +++ 
b/tests/system/test_rt_predict.py
@@ -171,8 +171,8 @@ def test_rt_predict_by_line01(tmp_path):
 0070_lysine_unlabeled_positive_M+H147p1128_17p01,17.043212890625,17.035064697265625,17.05360221862793,17.05995750427246,17.047959327697754,17.048407554626465,17.035064697265625,17.05995750427246,0.011024194876641502,0.005512097438320751,0
 0071_ornithine_unlabeled_positive_M+H133p0972_17p04,17.058874130249023,17.06319236755371,17.085508346557617,17.07795524597168,17.071382522583008,17.070573806762695,17.058874130249023,17.085508346557617,0.0124669979531353,0.00623349897656765,0"""
     command = """\
-    jq -M '(.cells[] | select(.source[] | contains("predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path)")).source) \
-    = ["predict_rt.generate_rt_correction_models(ids, max_cpus, metatlas_repo_path, model_only=True)"]' \
+    jq -M '(.cells[] | select(.source[] | contains("predict_rt.generate_outputs(ids, max_cpus, metatlas_repo_path)")).source) \
+    = ["predict_rt.generate_outputs(ids, max_cpus, metatlas_repo_path, save_to_db=False, model_only=True)"]' \
     /src/notebooks/reference/RT_Prediction.ipynb > /out/Remove.ipynb && \
     papermill \
         -p source_atlas HILICz150_ANT20190824_PRD_EMA_Unlab_POS_20201106_505892_root0 \
@@ -184,5 +184,5 @@ def test_rt_predict_by_line01(tmp_path):
         /out/Remove-done.ipynb
     """
     utils.exec_docker(image, command, tmp_path)
-    assert utils.num_files_in(tmp_path) == 49
+    assert utils.num_files_in(tmp_path) == 9
     utils.assert_files_match(expected)

From 781a12cc5bbead27df76626070c3a2bc10519675 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 6 Oct 2021 19:23:45 -0700
Subject: [PATCH 173/177] More verbose assert_files_match()

---
 tests/system/utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/system/utils.py b/tests/system/utils.py
index 10b695c1..8a519dc3 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -22,6 +22,10 @@ def assert_files_match(expected):
             num = None
             for num, line in enumerate(handle.readlines()):
                 clean_line = line.rstrip("\n")
+                if expected_lines[num] != clean_line:
+                    print('Expected line differs from actual:')
+                    print('Expected: "{expected_lines[num]}"')
+                    print('Actual: "{expected_lines[num]}"')
                 assert expected_lines[num] == clean_line
             if num is None and contents == "":
                 continue

From 827c925a2d29b23747edbfb7a7f3fb02c9135437 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 6 Oct 2021 19:32:17 -0700
Subject: [PATCH 174/177] fix syntax of print statements

---
 tests/system/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/system/utils.py b/tests/system/utils.py
index 8a519dc3..25617ac2 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -24,8 +24,8 @@ def assert_files_match(expected):
                 clean_line = line.rstrip("\n")
                 if expected_lines[num] != clean_line:
                     print('Expected line differs from actual:')
-                    print('Expected: "{expected_lines[num]}"')
-                    print('Actual: "{expected_lines[num]}"')
+                    print(f'Expected: "{expected_lines[num]}"')
+                    print(f'Actual: "{expected_lines[num]}"')

From 2326937bbaba54af46e5cb0a77205cfd120df37a Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 6 Oct 2021 20:01:25 -0700
Subject: [PATCH 175/177] add normalized string comparison to system tests

---
 tests/system/utils.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/tests/system/utils.py b/tests/system/utils.py
index 25617ac2..69160cab 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -2,14 +2,24 @@
 
 import os
 import subprocess
+import unicodedata
 
 
-def num_files_in(path):
+def num_files_in(path) -> int:
     """Returns number of files in path. Does not count directories"""
     return int(subprocess.check_output(f"find {str(path)} -type f | wc -l", shell=True, text=True).strip())
 
 
-def assert_files_match(expected):
+def compare_strs(s_1: str, s_2: str) -> bool:
+    """String comparison with unicode normalization"""
+    def norm_str(in_str: str) -> str:
+        """Unicode string normalization"""
+        return unicodedata.normalize('NFD', in_str)
+
+    return norm_str(s_1) == norm_str(s_2)
+
+
+def assert_files_match(expected) -> None:
     """
     Throw assertion error if expected does not contain the same data as files on disk
     inputs:
@@ -22,17 +32,17 @@ def assert_files_match(expected):
             num = None
             for num, line in enumerate(handle.readlines()):
                 clean_line = line.rstrip("\n")
-                if expected_lines[num] != clean_line:
+                if not compare_strs(expected_lines[num], clean_line):
                     print('Expected line differs from actual:')
                     print(f'Expected: "{expected_lines[num]}"')
                     print(f'Actual: "{expected_lines[num]}"')
-                assert expected_lines[num] == clean_line
+                assert compare_strs(expected_lines[num], clean_line)
             if num is None and contents == "":
                 continue
             assert len(expected_lines) == num + 1
 
 
-def exec_docker(image, command, out_path):
+def exec_docker(image, command, out_path) -> None:
     """execute command in image with out_path mounted at /out"""
     subprocess.run(
         [

From 63659bbbd48a61f0474f5e8ddaa8a2ab2c2b0996 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Wed, 6 Oct 2021 20:37:38 -0700
Subject: [PATCH 176/177] fix logic error in assert_files_match

---
 tests/system/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/utils.py b/tests/system/utils.py
index 69160cab..99378ace 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -35,7 +35,7 @@ def assert_files_match(expected) -> None:
                 if not compare_strs(expected_lines[num], clean_line):
                     print('Expected line differs from actual:')
                     print(f'Expected: "{expected_lines[num]}"')
-                    print(f'Actual: "{expected_lines[num]}"')
+                    print(f'Actual: "{clean_line}"')
                 assert compare_strs(expected_lines[num], clean_line)
             if num is None and contents == "":
                 continue

From 7c095590245cfc9636178e6a2b253846dc174e21 Mon Sep 17 00:00:00 2001
From: Will Holtz
Date: Thu, 7 Oct 2021 07:48:48 -0700
Subject: [PATCH 177/177] fix float output format for consistency

---
 metatlas/io/metatlas_get_data_helper_fun.py |   4 +-
 metatlas/io/targeted_output.py              |   6 +-
 metatlas/plots/dill2plots.py                |   4 +-
 metatlas/tools/fastanalysis.py              |   2 +-
 metatlas/tools/predict_rt.py                |   8 +-
 tests/system/test_rt_predict.py             | 285 ++++++++++----------
 tests/system/test_targeted.py               |  46 ++--
 7 files changed, 174 insertions(+), 181 deletions(-)

diff --git a/metatlas/io/metatlas_get_data_helper_fun.py b/metatlas/io/metatlas_get_data_helper_fun.py
index 426e985b..1957e3c2 100644
--- a/metatlas/io/metatlas_get_data_helper_fun.py
+++ b/metatlas/io/metatlas_get_data_helper_fun.py
@@ -771,7 +771,7 @@ def make_data_sources_tables(groups, myatlas, output_loc, polarity=None, overwri
         output_dir = os.path.join(output_loc, f"{prefix}data_sources")
         atlas_path = os.path.join(output_dir, f"{prefix}atlas_metadata.tab")
         write_utils.export_dataframe(metob.to_dataframe([myatlas]), atlas_path, "atlas metadata",
-                                     overwrite, sep='\t')
+                                     overwrite, sep='\t', float_format="%.8e")
         groups_path = os.path.join(output_dir, f"{prefix}groups_metadata.tab")
write_utils.export_dataframe(metob.to_dataframe(groups), groups_path, "groups metadata", overwrite, sep='\t') @@ -779,7 +779,7 @@ def make_data_sources_tables(groups, myatlas, output_loc, polarity=None, overwri atlas_df = make_atlas_df(myatlas) atlas_df['label'] = [cid.name for cid in myatlas.compound_identifications] atlas_df_path = os.path.join(output_dir, myatlas.name+'_originalatlas.tab') - write_utils.export_dataframe(atlas_df, atlas_df_path, "atlas dataframe", overwrite, sep='\t') + write_utils.export_dataframe(atlas_df, atlas_df_path, "atlas dataframe", overwrite, sep='\t', float_format="%.6e") group_path_df = pd.DataFrame(columns=['group_name', 'group_path', 'file_name']) loc_counter = 0 diff --git a/metatlas/io/targeted_output.py b/metatlas/io/targeted_output.py index d5aa170c..b2487bbc 100644 --- a/metatlas/io/targeted_output.py +++ b/metatlas/io/targeted_output.py @@ -26,7 +26,7 @@ def write_atlas_to_spreadsheet(metatlas_dataset, overwrite=False): """Save atlas as csv file. Will not overwrite existing file unless overwrite is True""" out_file_name = os.path.join(metatlas_dataset.ids.output_dir, f"{metatlas_dataset.atlas.name}_export.csv") out_df = dp.export_atlas_to_spreadsheet(metatlas_dataset.atlas) - write_utils.export_dataframe_die_on_diff(out_df, out_file_name, "atlas", overwrite=overwrite) + write_utils.export_dataframe_die_on_diff(out_df, out_file_name, "atlas", overwrite=overwrite, float_format="%.6e") def write_stats_table( @@ -75,7 +75,7 @@ def write_stats_table( min_num_frag_matches, min_relative_frag_intensity, ) - write_utils.export_dataframe_die_on_diff(scores_df, scores_path, "scores", overwrite=overwrite) + write_utils.export_dataframe_die_on_diff(scores_df, scores_path, "scores", overwrite=overwrite, float_format="%.8e") fa.make_stats_table( input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits, @@ -204,7 +204,7 @@ def write_msms_fragment_ions( ) out_df = pd.DataFrame(out) path = os.path.join(data.ids.output_dir, f"spectra_{intensity_fraction:.2f}pct_{int(min_mz)}cut.csv") - write_utils.export_dataframe_die_on_diff(out_df, path, "MSMS fragment ions", overwrite=overwrite) + write_utils.export_dataframe_die_on_diff(out_df, path, "MSMS fragment ions", overwrite=overwrite, float_format="%.8e") return out_df diff --git a/metatlas/plots/dill2plots.py b/metatlas/plots/dill2plots.py index 0c16782e..54aff0d2 100644 --- a/metatlas/plots/dill2plots.py +++ b/metatlas/plots/dill2plots.py @@ -1464,7 +1464,7 @@ def make_output_dataframe(input_fname='', input_dataset=None, include_lcmsruns=N if output_loc: prefix = f"{polarity}_" if polarity != '' else '' df_path = os.path.join(output_loc, f"{prefix}{fieldname}.tab") - write_utils.export_dataframe_die_on_diff(out, df_path, fieldname, overwrite=overwrite, sep="\t") + write_utils.export_dataframe_die_on_diff(out, df_path, fieldname, overwrite=overwrite, sep="\t", float_format="%.9e") return out @@ -2361,7 +2361,7 @@ def no_axis_plot(i): plt.close() logger.debug('Exported identification figures for %s to %s.', compound_names[compound_idx], fig_path) match_path = os.path.join(output_loc, 'MatchingMZs.tab') - write_utils.export_dataframe(match, match_path, 'matching MZs', overwrite, sep='\t') + write_utils.export_dataframe(match, match_path, 'matching MZs', overwrite, sep='\t', float_format="%.12e") def plot_ms1_spectra(polarity = None, mz_min = 5, mz_max = 5, input_fname = '', input_dataset = [], compound_names = [], include_lcmsruns = [], exclude_lcmsruns = [], include_groups = [], exclude_groups = [], output_loc = []): 
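A note on the float_format values threaded through the diffs in this patch: by default, pandas' to_csv() renders floats at full shortest-round-trip precision, so the checked-in expected files churn whenever a value moves in its last few bits. Pinning an explicit printf-style format makes the output byte-stable to diff. A minimal illustrative sketch, not part of the patch (the column name is made up here, though the two values match rows in the test data below):

    import pandas as pd

    df = pd.DataFrame({"rt_peak": [0.7757497429847717, 17.048407554626465]})

    # Default: full double precision, fragile when comparing against expected files.
    print(df.to_csv(index=False))
    # rt_peak
    # 0.7757497429847717
    # 17.048407554626465

    # With float_format, every float is rendered through the same printf format.
    print(df.to_csv(index=False, float_format="%.6e"))
    # rt_peak
    # 7.757497e-01
    # 1.704841e+01
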
diff --git a/metatlas/tools/fastanalysis.py b/metatlas/tools/fastanalysis.py
index ab56747d..73eef647 100644
--- a/metatlas/tools/fastanalysis.py
+++ b/metatlas/tools/fastanalysis.py
@@ -409,7 +409,7 @@ def make_stats_table(input_fname = '', input_dataset = [], msms_hits_df = None,
     if output_loc is not None:
         stats_tables_dir = os.path.join(output_loc, f"{prefix}stats_tables")
         stats_path = os.path.join(stats_tables_dir, f"{prefix}stats_table.tab")
-        write_utils.export_dataframe_die_on_diff(stats_table, stats_path, 'stats table', overwrite, sep='\t')
+        write_utils.export_dataframe_die_on_diff(stats_table, stats_path, 'stats table', overwrite, sep='\t', float_format="%.8e")
         readme_path = os.path.join(stats_tables_dir, f"{prefix}stats_table.readme")
         write_utils.check_existing_file(readme_path, overwrite)
         with open(readme_path, 'w') as readme:
diff --git a/metatlas/tools/predict_rt.py b/metatlas/tools/predict_rt.py
index c8c16987..38ccdf64 100644
--- a/metatlas/tools/predict_rt.py
+++ b/metatlas/tools/predict_rt.py
@@ -240,13 +240,13 @@ def load_runs(files_df, qc_atlas_df, qc_atlas, cpus):
 def save_measured_rts(metatlas_dataset, file_name):
     """Save RT values in csv format file"""
     rts_df = get_rts(metatlas_dataset, include_atlas_rt_peak=False)
-    write_utils.export_dataframe_die_on_diff(rts_df, file_name, "measured RT values")
+    write_utils.export_dataframe_die_on_diff(rts_df, file_name, "measured RT values", float_format="%.6e")
 
 
 def save_rt_peak(metatlas_dataset, file_name):
     """Save peak RT values in tsv format file"""
     rts_df = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname="rt_peak", use_labels=True)
-    write_utils.export_dataframe_die_on_diff(rts_df, file_name, "peak RT values", sep="\t")
+    write_utils.export_dataframe_die_on_diff(rts_df, file_name, "peak RT values", sep="\t", float_format="%.6e")
 
 
 def get_rts(metatlas_dataset, include_atlas_rt_peak=True):
@@ -426,7 +426,7 @@ def save_model_comparison(selected_column, qc_atlas_df, rts_df, linear, poly, fi
     # qc_df["RT Polynomial Pred"] = poly.predict(qc_df["RT Reference"].to_numpy())
     qc_df["RT Diff Linear"] = qc_df["RT Measured"] - qc_df["RT Linear Pred"]
     qc_df["RT Diff Polynomial"] = qc_df["RT Measured"] - qc_df["RT Polynomial Pred"]
-    write_utils.export_dataframe_die_on_diff(qc_df, file_name, "model comparision")
+    write_utils.export_dataframe_die_on_diff(qc_df, file_name, "model comparison", float_format="%.6e")
 
 
 def write_models(file_name, linear_model, poly_model, groups, atlas):
@@ -501,7 +501,7 @@ def create_adjusted_atlases(linear, poly, ids, atlas_indices=None, free_text="",
         prd_atlas_df["rt_min"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt - 0.5)
         prd_atlas_df["rt_max"] = prd_atlas_df["rt_peak"].apply(lambda rt: rt + 0.5)
         write_utils.export_dataframe_die_on_diff(
-            prd_atlas_df, prd_atlas_file_name, "predicted atlas", index=False
+            prd_atlas_df, prd_atlas_file_name, "predicted atlas", index=False, float_format="%.6e"
         )
         if save_to_db:
             dp.make_atlas_from_spreadsheet(
diff --git a/tests/system/test_rt_predict.py b/tests/system/test_rt_predict.py
index 9e422bec..d7fbe003 100644
--- a/tests/system/test_rt_predict.py
+++ b/tests/system/test_rt_predict.py
@@ -22,154 +22,155 @@ def test_rt_predict_by_line01(tmp_path):
     expected[
         str(tmp_path / experiment / "root0/data_QC/RT_Predicted_Model_Comparison.csv")
     ] = """,RT Measured,RT Reference,RT Linear Pred,RT Polynomial Pred,RT Diff Linear,RT Diff Polynomial
-0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,0.7757497429847717,1.068941733,1.062903572946303,1.1269157193507062,-0.28715382996153127,-0.3511659763659345 -0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.2491384744644165,1.224396021,1.2180440647072988,1.277093230335882,0.031094409757117747,-0.027954755871465453 -0002_thymine_unlabeled_positive_M+H127p0502_1p26,1.6553537845611572,1.255231064,1.248816864840618,1.3068938537656585,0.4065369197205393,0.34845993079549875 -0003_benzoic_acid_unlabeled_positive_M+H123p0441_1p27,1.2450578808784485,1.272043637,1.265595500390302,1.323144126044538,-0.020537619511853622,-0.07808624516608953 -0004_2_4-dihydroxypteridine_unlabeled_positive_M+H165p0407_1p27,,1.272194658,1.2657462165429905,1.32329010157373,, -0005_uracil_unlabeled_positive_M+H113p0346_1p39,1.8842174410820007,1.393699506,1.3870057978165864,1.4407671152594659,0.49721164326541434,0.44345032582253485 -0006_thymidine_unlabeled_positive_M+H243p0975_1p60,1.6583104729652405,1.603927488,1.596809418733121,1.6441748822425457,0.06150105423211949,0.014135590722694769 -0007_2-hydroxyphenylacetic_acid_unlabeled_positive_M+H153p0546_1p62,1.3172867894172668,1.616534167,1.6093906501979809,1.656378567437273,-0.29210386078071404,-0.33909177802000623 -0008_deoxyuridine_unlabeled_positive_M+H229p0819_1p88,1.9361981749534607,1.876662419,1.8689938136450273,1.9083427073174741,0.06720436130843344,0.027855467635986564 -0009_acetylcholine_unlabeled_positive_M146p1176_1p96,1.8230673670768738,1.9552864,1.9474590861922034,1.9845558887665662,-0.12439171911532965,-0.1614885216896924 -0010_pyridoxine_unlabeled_positive_M+H170p0812_2p16,2.1844738721847534,2.158057096,2.149820474204603,2.181230854992953,0.03465339798015021,0.0032430171918003126 -0011_salicylic_acid_unlabeled_positive_M+H139p0390_2p20,1.8282268047332764,2.196354854,2.1880409252832096,2.2183969913536457,-0.3598141205499332,-0.3901701866203693 -0012_2deoxyadenosine_unlabeled_positive_M+H252p1091_2p23,2.293710947036743,2.234006981,2.2256170486168254,2.254942673466113,0.06809389841991775,0.038768273570630285 -0013_adenine_unlabeled_positive_M+H136p0618_2p56,2.6646790504455566,2.557601998,2.548558864598047,2.56927787344364,0.11612018584750983,0.09540117700191653 -0014_xanthine_unlabeled_positive_M+H153p0407_2p73,2.7709169387817383,2.725344,2.7159622666788823,2.732395658981744,0.054954672102855984,0.03852127979999409 -0015_ribose_unlabeled_positive_M+H151p0601_2p75,2.793001413345337,2.750702982,2.741270059655543,2.757066003852173,0.05173135368979409,0.035935409493164094 -0016_rhamnose_unlabeled_positive_M+H165p0757_2p80,3.048208475112915,2.796993087,2.787466724577183,2.802106149909774,0.2607417505357321,0.2461023252031409 -0017_uridine_unlabeled_positive_M+H245p0768_2p89,2.920204997062683,2.888931519,2.8792195718979,2.8915891009061414,0.040985425164782985,0.028615896156541698 -0018_adenosine_unlabeled_positive_M+H268p1040_3p09,3.1090638637542725,3.091018609,3.0808987338207197,3.088405611875559,0.0281651299335528,0.020658251878713507 -0019_hypoxanthine_unlabeled_positive_M+H137p0458_3p10,3.1437186002731323,3.102967341,3.092823346401367,3.1000481672384095,0.05089525387176552,0.043670433034722844 -0020_5-methylcytosine_unlabeled_positive_M+H126p0662_4p42,4.466769218444824,4.418371688,4.4055724502287985,4.385470549114132,0.06119676821602571,0.08129866933069252 -0021_2-oxovaleric_acid_unlabeled_positive_M+H117p0546_4p45,3.9647421836853027,4.448129315,4.435270009207811,4.414635292629683,-0.4705278255225078,-0.44989310894438006 
-0022_cytosine_unlabeled_positive_M+H112p0505_4p83,4.878586053848267,4.833663875,4.8200263385363336,4.792830070747619,0.05855971531193305,0.0857559831006478 -0023_lactic_acid_unlabeled_positive_M+H91p0390_5p06,5.1236891746521,5.064398962,5.050295669310254,5.019475738648949,0.07339350534184597,0.10421343600315058 -0024_inosine_unlabeled_positive_M+H269p0880_5p43,5.435921669006348,5.434235961,5.4193861243530295,5.383231720701441,0.01653554465331819,0.052689948304906586 -0025_deoxycytidine_unlabeled_positive_M+H228p0979_5p59,5.63983416557312,5.594117397,5.578944827580915,5.540664965694513,0.060889337992205306,0.09916919987860684 -0026_nicotinic_acid_unlabeled_positive_M+H124p0393_5p63,5.612481355667114,5.631626786,5.61637850104198,5.577615783318941,-0.0038971453748661844,0.034865572348173224 -0027_phenylacetic_acid_unlabeled_positive_M+H137p0597_5p88,6.1643900871276855,5.878913512,5.8631660600806,5.8213702555437035,0.30122402704708584,0.3430198315839821 -0028_2_deoxyguanosine_unlabeled_positive_M+H268p1040_6p87,6.9536755084991455,6.87418691,6.856430423443528,6.8050649676035455,0.09724508505561769,0.1486105408956 -0029_cytidine_unlabeled_positive_M+H244p0928_6p93,6.825943946838379,6.933566273,6.915689924707814,6.863887108093793,-0.08974597786943495,-0.03794316125541375 -0030_N-acetyl-mannosamine_unlabeled_positive_M+Na244p0792_7p15,6.6624157428741455,7.153497474,7.135177177942838,7.081885478010634,-0.4727614350686924,-0.4194697351364889 -0031_betaine_unlabeled_positive_M118p0863_7p91,8.0335111618042,7.905109179,7.8852716978638355,7.828449318591656,0.1482394639403637,0.20506184321254306 -0032_guanosine_unlabeled_positive_M+H284p0989_8p57,8.48360538482666,8.570944541,8.549763020821363,8.49182598096076,-0.0661576359947027,-0.008220596134099978 -0033_phenylalanine_unlabeled_positive_M+H166p0863_8p98,9.09137773513794,8.979305704,8.95729987592511,8.899614864870298,0.13407785921283022,0.19176287026764172 -0034_leucine_unlabeled_positive_M+H132p1019_9p32,9.326712608337402,9.319656306,9.296963454490605,9.240032036011817,0.029749153846797327,0.08668057232558546 -0035_urocanic_acid_unlabeled_positive_M+H139p0502_9p35,8.878215789794922,9.351932178,9.3291741752016,9.272339873977243,-0.450958385406679,-0.39412408418232125 -0036_mannitol_unlabeled_positive_M+H183p0863_9p53,9.116773128509521,9.534507075,9.511380530961908,9.455179262226952,-0.3946074024523867,-0.3384061337174309 -0037_isoleucine_unlabeled_positive_M+H132p1019_9p71,9.326712608337402,9.70543744,9.681965860090859,9.626486134780636,-0.35525325175345657,-0.29977352644323396 -0038_xanthosine_unlabeled_positive_M+H285p0830_9p78,9.507513999938965,9.782678891,9.759051393379414,9.703938612775616,-0.251537393440449,-0.19642461283665114 -0039_tryptophan_unlabeled_positive_M+H205p0972_10p16,10.337260723114014,10.15664925,10.132266864922379,10.079290595303439,0.20499385819163507,0.2579701278105748 -0040_methionine_unlabeled_positive_M+H150p0583_10p44,10.456631660461426,10.4409554,10.41599912145348,10.365046300146744,0.04063253900794628,0.09158536031468145 -0041_1-methyladenosine_unlabeled_positive_M+H282p1197_10p78,11.042783260345459,10.78124768,10.755604495746342,10.707526942600817,0.2871787645991173,0.3352563177446424 -0042_proline_unlabeled_positive_M+H116p0706_10p92,10.849864959716797,10.91977168,10.893848874574285,10.847083243109427,-0.043983914857488315,0.0027817166073695887 -0043_pipecolic_acid_unlabeled_positive_M+H130p0863_10p97,10.991784572601318,10.97482181,10.948787881722849,10.902566344225118,0.04299669087846958,0.08921822837620041 
-0044_valine_unlabeled_positive_M+H118p0863_11p12,11.041275024414062,11.11600911,11.089690184478288,11.044923184166127,-0.048415160064225304,-0.0036481597520641174 -0045_5-oxo-proline_unlabeled_positive_M+H130p0499_11p65,11.50427532196045,11.65330736,11.625903857319027,11.58744978041825,-0.1216285353585782,-0.08317445845780114 -0046_taurine_unlabeled_positive_M+H126p0219_12p16,12.075395107269287,12.15812344,12.129700927876891,12.098300243547898,-0.05430582060760436,-0.022905136278611238 -0047_ectoine_unlabeled_positive_M+H143p0815_12p50,12.36878776550293,12.50349732,12.474377644575776,12.448428911008959,-0.10558987907284667,-0.07964114550602908 -0048_carnitine_unlabeled_positive_M+H161p1046_13p29,13.466909408569336,13.28582682,13.25512795330405,13.243410597608959,0.21178145526528525,0.22349881096037727 -0049_alanine_unlabeled_positive_M+H90p0550_13p41,13.68701457977295,13.40509074,13.37415113006032,13.364832662828071,0.3128634497126299,0.32218191694487786 -0050_sucrose_unlabeled_positive_M+H343p1235_13p45,13.328831672668457,13.44515078,13.414130305839409,13.405631224802201,-0.08529863317095199,-0.07679955213374434 -0051_threonine_unlabeled_positive_M+H120p0655_13p49,13.459657192230225,13.48957226,13.458462117721956,13.45087963624058,0.0011950745082689451,0.008777555989643915 -0052_cis-4-hydroxy-proline_unlabeled_positive_M+H132p0655_13p67,13.243738651275635,13.67383331,13.642351222854888,13.638660690201752,-0.3986125715792532,-0.39492203892611677 -0053_4-guanidinobutanoic_acid_unlabeled_positive_M+H146p0924_13p86,13.88631010055542,13.86132281,13.829462261117298,13.829880536300239,0.056847839438121994,0.056429564255180864 -0054_maltose_unlabeled_positive_M+Na365p1054_14p07,13.663294792175293,14.0677773,14.035500007112862,14.040616184637264,-0.3722052149375692,-0.3773213924619707 -0055_serine_unlabeled_positive_M+H106p0499_14p31,14.328335285186768,14.31261357,14.279842056582272,14.290765198503442,0.0484932286044959,0.037570086683325954 -0056_glutamine_unlabeled_positive_M+H147p0764_14p31,14.320549488067627,14.31275825,14.27998644453475,14.290913093540711,0.04056304353287743,0.02963639452691602 -0057_asparagine_unlabeled_positive_M+H133p0608_14p37,14.360477924346924,14.36808894,14.335205445351729,14.347479873380236,0.025272478995194803,0.012998050966688268 -0058_gamma-Aminobutyric_acid_unlabeled_positive_M+H104p0706_14p39,14.392436504364014,14.38565257,14.352733621836048,14.365438606423183,0.0397028825279655,0.02699789794083074 -0059_alpha-ketoglutaric_acid_unlabeled_positive_M+H147p0288_14p51,,14.50646265,14.473299837551405,14.4890020294111,, -0060_mannosamine_unlabeled_positive_M+H180p0866_14p52,14.69622278213501,14.52081396,14.487622178346628,14.503684552870762,0.20860060378838163,0.19253822926424746 -0061_cysteic_acid_unlabeled_positive_M+H170p0118_14p54,14.559170722961426,14.53906337,14.505834750532143,14.522356409474206,0.05333597242928256,0.03681431348721986 -0062_N-acetyl-aspartic_acid_unlabeled_positive_M+H176p0553_14p82,14.634858131408691,14.82464623,14.790841139927538,14.81473516440485,-0.15598300851884694,-0.1798770329961581 -0063_citrulline_unlabeled_positive_M+H176p1030_15p09,15.141581535339355,15.08943009,15.055090513677682,15.086130811048331,0.08649102166167388,0.055450724291024045 -0064_N-alpha-acetyl-lysine_unlabeled_positive_M+H189p1234_15p13,15.190487384796143,15.12986101,15.095439820807439,15.127597632861947,0.09504756398870384,0.0628897519341951 
-0065_N-acetyl-glutamic_acid_unlabeled_positive_M+H190p0710_15p16,15.118823528289795,15.15757256,15.12309543294764,15.156023222818638,-0.004271904657844772,-0.03719969452884264 -0066_raffinose_unlabeled_positive_M+H505p1763_15p53,15.543988227844238,15.53249857,15.497264626436776,15.540931897821977,0.04672360140746257,0.0030563300222610223 -0067_glutamic_acid_unlabeled_positive_M+H148p0604_15p94,16.006930351257324,15.93538957,15.899342360478306,15.955218576615934,0.10758799077901848,0.05171177464139021 -0068_Aspartic_acid_unlabeled_positive_M+H134p0448_16p13,16.24086856842041,16.13036002,16.093919247877274,16.15595235329566,0.14694932054313625,0.08491621512474978 -0069_arginine_unlabeled_positive_M+H175p1190_16p94,16.976414680480957,16.93991539,16.901840469127567,16.991172768800162,0.07457421135339004,-0.014758088319204887 -0070_lysine_unlabeled_positive_M+H147p1128_17p01,17.048407554626465,17.01131041,16.973091372879324,17.06496535505383,0.07531618174714083,-0.0165578004273641 -0071_ornithine_unlabeled_positive_M+H133p0972_17p04,17.070573806762695,17.03725065,16.998979250542746,17.09178209803637,0.07159455621994937,-0.021208291273673296""" +0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,7.757497e-01,1.068942e+00,1.062904e+00,1.126916e+00,-2.871538e-01,-3.511660e-01 +0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.249138e+00,1.224396e+00,1.218044e+00,1.277093e+00,3.109441e-02,-2.795476e-02 +0002_thymine_unlabeled_positive_M+H127p0502_1p26,1.655354e+00,1.255231e+00,1.248817e+00,1.306894e+00,4.065369e-01,3.484599e-01 +0003_benzoic_acid_unlabeled_positive_M+H123p0441_1p27,1.245058e+00,1.272044e+00,1.265596e+00,1.323144e+00,-2.053762e-02,-7.808625e-02 +0004_2_4-dihydroxypteridine_unlabeled_positive_M+H165p0407_1p27,,1.272195e+00,1.265746e+00,1.323290e+00,, +0005_uracil_unlabeled_positive_M+H113p0346_1p39,1.884217e+00,1.393700e+00,1.387006e+00,1.440767e+00,4.972116e-01,4.434503e-01 +0006_thymidine_unlabeled_positive_M+H243p0975_1p60,1.658310e+00,1.603927e+00,1.596809e+00,1.644175e+00,6.150105e-02,1.413559e-02 +0007_2-hydroxyphenylacetic_acid_unlabeled_positive_M+H153p0546_1p62,1.317287e+00,1.616534e+00,1.609391e+00,1.656379e+00,-2.921039e-01,-3.390918e-01 +0008_deoxyuridine_unlabeled_positive_M+H229p0819_1p88,1.936198e+00,1.876662e+00,1.868994e+00,1.908343e+00,6.720436e-02,2.785547e-02 +0009_acetylcholine_unlabeled_positive_M146p1176_1p96,1.823067e+00,1.955286e+00,1.947459e+00,1.984556e+00,-1.243917e-01,-1.614885e-01 +0010_pyridoxine_unlabeled_positive_M+H170p0812_2p16,2.184474e+00,2.158057e+00,2.149820e+00,2.181231e+00,3.465340e-02,3.243017e-03 +0011_salicylic_acid_unlabeled_positive_M+H139p0390_2p20,1.828227e+00,2.196355e+00,2.188041e+00,2.218397e+00,-3.598141e-01,-3.901702e-01 +0012_2deoxyadenosine_unlabeled_positive_M+H252p1091_2p23,2.293711e+00,2.234007e+00,2.225617e+00,2.254943e+00,6.809390e-02,3.876827e-02 +0013_adenine_unlabeled_positive_M+H136p0618_2p56,2.664679e+00,2.557602e+00,2.548559e+00,2.569278e+00,1.161202e-01,9.540118e-02 +0014_xanthine_unlabeled_positive_M+H153p0407_2p73,2.770917e+00,2.725344e+00,2.715962e+00,2.732396e+00,5.495467e-02,3.852128e-02 +0015_ribose_unlabeled_positive_M+H151p0601_2p75,2.793001e+00,2.750703e+00,2.741270e+00,2.757066e+00,5.173135e-02,3.593541e-02 +0016_rhamnose_unlabeled_positive_M+H165p0757_2p80,3.048208e+00,2.796993e+00,2.787467e+00,2.802106e+00,2.607418e-01,2.461023e-01 +0017_uridine_unlabeled_positive_M+H245p0768_2p89,2.920205e+00,2.888932e+00,2.879220e+00,2.891589e+00,4.098543e-02,2.861590e-02 
+0018_adenosine_unlabeled_positive_M+H268p1040_3p09,3.109064e+00,3.091019e+00,3.080899e+00,3.088406e+00,2.816513e-02,2.065825e-02 +0019_hypoxanthine_unlabeled_positive_M+H137p0458_3p10,3.143719e+00,3.102967e+00,3.092823e+00,3.100048e+00,5.089525e-02,4.367043e-02 +0020_5-methylcytosine_unlabeled_positive_M+H126p0662_4p42,4.466769e+00,4.418372e+00,4.405572e+00,4.385471e+00,6.119677e-02,8.129867e-02 +0021_2-oxovaleric_acid_unlabeled_positive_M+H117p0546_4p45,3.964742e+00,4.448129e+00,4.435270e+00,4.414635e+00,-4.705278e-01,-4.498931e-01 +0022_cytosine_unlabeled_positive_M+H112p0505_4p83,4.878586e+00,4.833664e+00,4.820026e+00,4.792830e+00,5.855972e-02,8.575598e-02 +0023_lactic_acid_unlabeled_positive_M+H91p0390_5p06,5.123689e+00,5.064399e+00,5.050296e+00,5.019476e+00,7.339351e-02,1.042134e-01 +0024_inosine_unlabeled_positive_M+H269p0880_5p43,5.435922e+00,5.434236e+00,5.419386e+00,5.383232e+00,1.653554e-02,5.268995e-02 +0025_deoxycytidine_unlabeled_positive_M+H228p0979_5p59,5.639834e+00,5.594117e+00,5.578945e+00,5.540665e+00,6.088934e-02,9.916920e-02 +0026_nicotinic_acid_unlabeled_positive_M+H124p0393_5p63,5.612481e+00,5.631627e+00,5.616379e+00,5.577616e+00,-3.897145e-03,3.486557e-02 +0027_phenylacetic_acid_unlabeled_positive_M+H137p0597_5p88,6.164390e+00,5.878914e+00,5.863166e+00,5.821370e+00,3.012240e-01,3.430198e-01 +0028_2_deoxyguanosine_unlabeled_positive_M+H268p1040_6p87,6.953676e+00,6.874187e+00,6.856430e+00,6.805065e+00,9.724509e-02,1.486105e-01 +0029_cytidine_unlabeled_positive_M+H244p0928_6p93,6.825944e+00,6.933566e+00,6.915690e+00,6.863887e+00,-8.974598e-02,-3.794316e-02 +0030_N-acetyl-mannosamine_unlabeled_positive_M+Na244p0792_7p15,6.662416e+00,7.153497e+00,7.135177e+00,7.081885e+00,-4.727614e-01,-4.194697e-01 +0031_betaine_unlabeled_positive_M118p0863_7p91,8.033511e+00,7.905109e+00,7.885272e+00,7.828449e+00,1.482395e-01,2.050618e-01 +0032_guanosine_unlabeled_positive_M+H284p0989_8p57,8.483605e+00,8.570945e+00,8.549763e+00,8.491826e+00,-6.615764e-02,-8.220596e-03 +0033_phenylalanine_unlabeled_positive_M+H166p0863_8p98,9.091378e+00,8.979306e+00,8.957300e+00,8.899615e+00,1.340779e-01,1.917629e-01 +0034_leucine_unlabeled_positive_M+H132p1019_9p32,9.326713e+00,9.319656e+00,9.296963e+00,9.240032e+00,2.974915e-02,8.668057e-02 +0035_urocanic_acid_unlabeled_positive_M+H139p0502_9p35,8.878216e+00,9.351932e+00,9.329174e+00,9.272340e+00,-4.509584e-01,-3.941241e-01 +0036_mannitol_unlabeled_positive_M+H183p0863_9p53,9.116773e+00,9.534507e+00,9.511381e+00,9.455179e+00,-3.946074e-01,-3.384061e-01 +0037_isoleucine_unlabeled_positive_M+H132p1019_9p71,9.326713e+00,9.705437e+00,9.681966e+00,9.626486e+00,-3.552533e-01,-2.997735e-01 +0038_xanthosine_unlabeled_positive_M+H285p0830_9p78,9.507514e+00,9.782679e+00,9.759051e+00,9.703939e+00,-2.515374e-01,-1.964246e-01 +0039_tryptophan_unlabeled_positive_M+H205p0972_10p16,1.033726e+01,1.015665e+01,1.013227e+01,1.007929e+01,2.049939e-01,2.579701e-01 +0040_methionine_unlabeled_positive_M+H150p0583_10p44,1.045663e+01,1.044096e+01,1.041600e+01,1.036505e+01,4.063254e-02,9.158536e-02 +0041_1-methyladenosine_unlabeled_positive_M+H282p1197_10p78,1.104278e+01,1.078125e+01,1.075560e+01,1.070753e+01,2.871788e-01,3.352563e-01 +0042_proline_unlabeled_positive_M+H116p0706_10p92,1.084986e+01,1.091977e+01,1.089385e+01,1.084708e+01,-4.398391e-02,2.781717e-03 +0043_pipecolic_acid_unlabeled_positive_M+H130p0863_10p97,1.099178e+01,1.097482e+01,1.094879e+01,1.090257e+01,4.299669e-02,8.921823e-02 
+0044_valine_unlabeled_positive_M+H118p0863_11p12,1.104128e+01,1.111601e+01,1.108969e+01,1.104492e+01,-4.841516e-02,-3.648160e-03 +0045_5-oxo-proline_unlabeled_positive_M+H130p0499_11p65,1.150428e+01,1.165331e+01,1.162590e+01,1.158745e+01,-1.216285e-01,-8.317446e-02 +0046_taurine_unlabeled_positive_M+H126p0219_12p16,1.207540e+01,1.215812e+01,1.212970e+01,1.209830e+01,-5.430582e-02,-2.290514e-02 +0047_ectoine_unlabeled_positive_M+H143p0815_12p50,1.236879e+01,1.250350e+01,1.247438e+01,1.244843e+01,-1.055899e-01,-7.964115e-02 +0048_carnitine_unlabeled_positive_M+H161p1046_13p29,1.346691e+01,1.328583e+01,1.325513e+01,1.324341e+01,2.117815e-01,2.234988e-01 +0049_alanine_unlabeled_positive_M+H90p0550_13p41,1.368701e+01,1.340509e+01,1.337415e+01,1.336483e+01,3.128634e-01,3.221819e-01 +0050_sucrose_unlabeled_positive_M+H343p1235_13p45,1.332883e+01,1.344515e+01,1.341413e+01,1.340563e+01,-8.529863e-02,-7.679955e-02 +0051_threonine_unlabeled_positive_M+H120p0655_13p49,1.345966e+01,1.348957e+01,1.345846e+01,1.345088e+01,1.195075e-03,8.777556e-03 +0052_cis-4-hydroxy-proline_unlabeled_positive_M+H132p0655_13p67,1.324374e+01,1.367383e+01,1.364235e+01,1.363866e+01,-3.986126e-01,-3.949220e-01 +0053_4-guanidinobutanoic_acid_unlabeled_positive_M+H146p0924_13p86,1.388631e+01,1.386132e+01,1.382946e+01,1.382988e+01,5.684784e-02,5.642956e-02 +0054_maltose_unlabeled_positive_M+Na365p1054_14p07,1.366329e+01,1.406778e+01,1.403550e+01,1.404062e+01,-3.722052e-01,-3.773214e-01 +0055_serine_unlabeled_positive_M+H106p0499_14p31,1.432834e+01,1.431261e+01,1.427984e+01,1.429077e+01,4.849323e-02,3.757009e-02 +0056_glutamine_unlabeled_positive_M+H147p0764_14p31,1.432055e+01,1.431276e+01,1.427999e+01,1.429091e+01,4.056304e-02,2.963639e-02 +0057_asparagine_unlabeled_positive_M+H133p0608_14p37,1.436048e+01,1.436809e+01,1.433521e+01,1.434748e+01,2.527248e-02,1.299805e-02 +0058_gamma-Aminobutyric_acid_unlabeled_positive_M+H104p0706_14p39,1.439244e+01,1.438565e+01,1.435273e+01,1.436544e+01,3.970288e-02,2.699790e-02 +0059_alpha-ketoglutaric_acid_unlabeled_positive_M+H147p0288_14p51,,1.450646e+01,1.447330e+01,1.448900e+01,, +0060_mannosamine_unlabeled_positive_M+H180p0866_14p52,1.469622e+01,1.452081e+01,1.448762e+01,1.450368e+01,2.086006e-01,1.925382e-01 +0061_cysteic_acid_unlabeled_positive_M+H170p0118_14p54,1.455917e+01,1.453906e+01,1.450583e+01,1.452236e+01,5.333597e-02,3.681431e-02 +0062_N-acetyl-aspartic_acid_unlabeled_positive_M+H176p0553_14p82,1.463486e+01,1.482465e+01,1.479084e+01,1.481474e+01,-1.559830e-01,-1.798770e-01 +0063_citrulline_unlabeled_positive_M+H176p1030_15p09,1.514158e+01,1.508943e+01,1.505509e+01,1.508613e+01,8.649102e-02,5.545072e-02 +0064_N-alpha-acetyl-lysine_unlabeled_positive_M+H189p1234_15p13,1.519049e+01,1.512986e+01,1.509544e+01,1.512760e+01,9.504756e-02,6.288975e-02 +0065_N-acetyl-glutamic_acid_unlabeled_positive_M+H190p0710_15p16,1.511882e+01,1.515757e+01,1.512310e+01,1.515602e+01,-4.271905e-03,-3.719969e-02 +0066_raffinose_unlabeled_positive_M+H505p1763_15p53,1.554399e+01,1.553250e+01,1.549726e+01,1.554093e+01,4.672360e-02,3.056330e-03 +0067_glutamic_acid_unlabeled_positive_M+H148p0604_15p94,1.600693e+01,1.593539e+01,1.589934e+01,1.595522e+01,1.075880e-01,5.171177e-02 +0068_Aspartic_acid_unlabeled_positive_M+H134p0448_16p13,1.624087e+01,1.613036e+01,1.609392e+01,1.615595e+01,1.469493e-01,8.491622e-02 +0069_arginine_unlabeled_positive_M+H175p1190_16p94,1.697641e+01,1.693992e+01,1.690184e+01,1.699117e+01,7.457421e-02,-1.475809e-02 
+0070_lysine_unlabeled_positive_M+H147p1128_17p01,1.704841e+01,1.701131e+01,1.697309e+01,1.706497e+01,7.531618e-02,-1.655780e-02
+0071_ornithine_unlabeled_positive_M+H133p0972_17p04,1.707057e+01,1.703725e+01,1.699898e+01,1.709178e+01,7.159456e-02,-2.120829e-02"""
 expected[
 str(tmp_path / experiment / "root0/data_QC/QC_Measured_RTs.csv")
 ] = """,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Pre_Rg70to1050-CE102040--QC_Run6.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Pre_Rg70to1050-CE102040--QC_Run7.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_FPS_MS1_0_QC_Post_Rg70to1050-CE102040--QC_Run307.h5,20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_0_QC_Post_Rg70to1050-CE102040--QC_Run308.h5,mean,median,min,max,standard deviation,standard error,#NaNs
-0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,0.760883629322052,0.7883098125457764,0.7816191911697388,0.7698802947998047,0.775173231959343,0.7757497429847717,0.760883629322052,0.7883098125457764,0.012197377682005251,0.006098688841002626,0
-0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.2340805530548096,1.2472213506698608,1.2510555982589722,1.2544312477111816,1.246697187423706,1.2491384744644165,1.2340805530548096,1.2544312477111816,0.008911895485740983,0.004455947742870492,0
-0002_thymine_unlabeled_positive_M+H127p0502_1p26,1.6460527181625366,1.656285285949707,1.6621986627578735,1.6544222831726074,1.6547397375106812,1.6553537845611572,1.6460527181625366,1.6621986627578735,0.0066730644692519535,0.0033365322346259768,0
-0003_benzoic_acid_unlabeled_positive_M+H123p0441_1p27,1.6460527181625366,0.772739052772522,1.7125025987625122,0.8440630435943604,1.2438393533229828,1.2450578808784485,0.772739052772522,1.7125025987625122,0.5043733469961971,0.25218667349809853,0
+0000_4-methoxyphenylacetic_acid_unlabeled_positive_M+H167p0703_1p07,7.608836e-01,7.883098e-01,7.816192e-01,7.698803e-01,7.751732e-01,7.757497e-01,7.608836e-01,7.883098e-01,1.219738e-02,6.098689e-03,0
+0001_nicotinamide_unlabeled_positive_M+H123p0553_1p22,1.234081e+00,1.247221e+00,1.251056e+00,1.254431e+00,1.246697e+00,1.249138e+00,1.234081e+00,1.254431e+00,8.911895e-03,4.455948e-03,0
+0002_thymine_unlabeled_positive_M+H127p0502_1p26,1.646053e+00,1.656285e+00,1.662199e+00,1.654422e+00,1.654740e+00,1.655354e+00,1.646053e+00,1.662199e+00,6.673064e-03,3.336532e-03,0
+0003_benzoic_acid_unlabeled_positive_M+H123p0441_1p27,1.646053e+00,7.727391e-01,1.712503e+00,8.440630e-01,1.243839e+00,1.245058e+00,7.727391e-01,1.712503e+00,5.043733e-01,2.521867e-01,0
 0004_2_4-dihydroxypteridine_unlabeled_positive_M+H165p0407_1p27,,,,,,,,,,,0
-0005_uracil_unlabeled_positive_M+H113p0346_1p39,1.8791532516479492,1.8892816305160522,1.878132700920105,1.8925062417984009,1.8847684562206268,1.8842174410820007,1.878132700920105,1.8925062417984009,0.00720661359839974,0.00360330679919987,0
-0006_thymidine_unlabeled_positive_M+H243p0975_1p60,1.6460527181625366,1.6672121286392212,1.6621986627578735,1.6544222831726074,1.6574714481830597,1.6583104729652405,1.6460527181625366,1.6672121286392212,0.009254048049870576,0.004627024024935288,0
-0007_2-hydroxyphenylacetic_acid_unlabeled_positive_M+H153p0546_1p62,1.4245437383651733,1.2100298404693604,1.6621986627578735,1.1872817277908325,1.37101349234581,1.3172867894172668,1.1872817277908325,1.6621986627578735,0.22160579729537735,0.11080289864768868,0
-0008_deoxyuridine_unlabeled_positive_M+H229p0819_1p88,1.940125823020935,1.9492311477661133,1.910657525062561,1.9322705268859863,1.933071255683899,1.9361981749534607,1.910657525062561,1.9492311477661133,0.016471445184473014,0.008235722592236507,0
-0009_acetylcholine_unlabeled_positive_M146p1176_1p96,1.8015246391296387,1.7716032266616821,1.8446100950241089,1.8571667671203613,1.8187261819839478,1.8230673670768738,1.7716032266616821,1.8571667671203613,0.03942977527526356,0.01971488763763178,0
-0010_pyridoxine_unlabeled_positive_M+H170p0812_2p16,2.140230655670166,2.1597485542297363,2.2091991901397705,2.2232186794281006,2.1830992698669434,2.1844738721847534,2.140230655670166,2.2232186794281006,0.03947043984935922,0.01973521992467961,0
-0011_salicylic_acid_unlabeled_positive_M+H139p0390_2p20,,1.8414007425308228,1.8282268047332764,1.78067147731781,1.816766341527303,1.8282268047332764,1.78067147731781,1.8414007425308228,0.03194554078427394,0.01844376657120873,0
-0012_2deoxyadenosine_unlabeled_positive_M+H252p1091_2p23,2.2475903034210205,2.2757790088653564,2.322835922241211,2.31164288520813,2.2894620299339294,2.293710947036743,2.2475903034210205,2.322835922241211,0.03438155307316169,0.017190776536580844,0
-0013_adenine_unlabeled_positive_M+H136p0618_2p56,2.593029737472534,2.6252453327178955,2.7041127681732178,2.7198987007141113,2.6605716347694397,2.6646790504455566,2.593029737472534,2.7198987007141113,0.06117021758664283,0.030585108793321415,0
-0014_xanthine_unlabeled_positive_M+H153p0407_2p73,2.747974395751953,2.775257110595703,2.7665767669677734,2.788285493850708,2.7695234417915344,2.7709169387817383,2.747974395751953,2.788285493850708,0.01691088242483735,0.008455441212418676,0
-0015_ribose_unlabeled_positive_M+H151p0601_2p75,2.763174295425415,2.769817352294922,2.835042953491211,2.816185474395752,2.796055018901825,2.793001413345337,2.763174295425415,2.835042953491211,0.03509440319693131,0.017547201598465654,0
-0016_rhamnose_unlabeled_positive_M+H165p0757_2p80,3.136334180831909,2.960082769393921,,,3.048208475112915,3.048208475112915,2.960082769393921,3.136334180831909,0.12462856822150174,0.08812570571899414,0
-0017_uridine_unlabeled_positive_M+H245p0768_2p89,2.919912815093994,2.943791389465332,2.917609214782715,2.920497179031372,2.9254526495933533,2.920204997062683,2.917609214782715,2.943791389465332,0.012289227050439883,0.0061446135252199415,0
-0018_adenosine_unlabeled_positive_M+H268p1040_3p09,3.075451612472534,3.078127861022949,3.1469616889953613,3.1399998664855957,3.11013525724411,3.1090638637542725,3.075451612472534,3.1469616889953613,0.03862429972687937,0.019312149863439685,0
-0019_hypoxanthine_unlabeled_positive_M+H137p0458_3p10,3.136334180831909,3.1404755115509033,3.1469616889953613,3.161953926086426,3.14643132686615,3.1437186002731323,3.136334180831909,3.161953926086426,0.011234715018600752,0.005617357509300376,0
-0020_5-methylcytosine_unlabeled_positive_M+H126p0662_4p42,4.396528244018555,4.399042129516602,4.534496307373047,4.545421600341797,4.4688720703125,4.466769218444824,4.396528244018555,4.545421600341797,0.08221155813284872,0.04110577906642436,0
-0021_2-oxovaleric_acid_unlabeled_positive_M+H117p0546_4p45,,,,3.9647421836853027,3.9647421836853027,3.9647421836853027,3.9647421836853027,3.9647421836853027,,,0
-0022_cytosine_unlabeled_positive_M+H112p0505_4p83,4.836014747619629,4.8464555740356445,4.910716533660889,4.921308994293213,4.878623962402344,4.878586053848267,4.836014747619629,4.921308994293213,0.04359776550856559,0.021798882754282795,0
-0023_lactic_acid_unlabeled_positive_M+H91p0390_5p06,,5.1236891746521,,,5.1236891746521,5.1236891746521,5.1236891746521,5.1236891746521,,,0
-0024_inosine_unlabeled_positive_M+H269p0880_5p43,5.432004451751709,5.439838886260986,5.39183235168457,5.441697120666504,5.426343202590942,5.435921669006348,5.39183235168457,5.441697120666504,0.023387495592980652,0.011693747796490326,0
-0025_deoxycytidine_unlabeled_positive_M+H228p0979_5p59,5.601799011230469,5.607846260070801,5.6718220710754395,5.671968460083008,5.638358950614929,5.63983416557312,5.601799011230469,5.671968460083008,0.03880306263547012,0.01940153131773506,0
-0026_nicotinic_acid_unlabeled_positive_M+H124p0393_5p63,5.478054046630859,5.553140640258789,5.6718220710754395,5.700730800628662,5.6009368896484375,5.612481355667114,5.478054046630859,5.700730800628662,0.10387204978316382,0.05193602489158191,0
-0027_phenylacetic_acid_unlabeled_positive_M+H137p0597_5p88,,6.089038848876953,6.223822593688965,6.1643900871276855,6.159083843231201,6.1643900871276855,6.089038848876953,6.223822593688965,0.06754836515123151,0.03899906680338265,0
-0028_2_deoxyguanosine_unlabeled_positive_M+H268p1040_6p87,6.925775527954102,6.942814826965332,6.964536190032959,6.967518329620361,6.9501612186431885,6.9536755084991455,6.925775527954102,6.967518329620361,0.019634497574413202,0.009817248787206601,0
-0029_cytidine_unlabeled_positive_M+H244p0928_6p93,6.8023223876953125,6.832950592041016,6.818937301635742,6.858341693878174,6.828137993812561,6.825943946838379,6.8023223876953125,6.858341693878174,0.023710214782490673,0.011855107391245337,0
-0030_N-acetyl-mannosamine_unlabeled_positive_M+Na244p0792_7p15,6.656971454620361,6.657993316650391,6.6668381690979,6.696530342102051,6.669583320617676,6.6624157428741455,6.656971454620361,6.696530342102051,0.018502839238360106,0.009251419619180053,0
-0031_betaine_unlabeled_positive_M118p0863_7p91,7.914710998535156,7.898166656494141,8.152311325073242,8.210945129394531,8.044033527374268,8.0335111618042,7.898166656494141,8.210945129394531,0.1608156551761606,0.0804078275880803,0
-0032_guanosine_unlabeled_positive_M+H284p0989_8p57,8.49014663696289,8.50051498413086,8.451272010803223,8.47706413269043,8.47974944114685,8.48360538482666,8.451272010803223,8.50051498413086,0.021271924977460343,0.010635962488730171,0
-0033_phenylalanine_unlabeled_positive_M+H166p0863_8p98,9.065049171447754,9.064212799072266,9.132002830505371,9.117706298828125,9.094742774963379,9.09137773513794,9.064212799072266,9.132002830505371,0.03525821268363917,0.017629106341819585,0
-0034_leucine_unlabeled_positive_M+H132p1019_9p32,9.267672538757324,9.295421600341797,9.358003616333008,9.375864028930664,9.324240446090698,9.326712608337402,9.267672538757324,9.375864028930664,0.051105772604218114,0.025552886302109057,0
-0035_urocanic_acid_unlabeled_positive_M+H139p0502_9p35,8.894339561462402,8.865837097167969,8.890594482421875,8.856619834899902,8.876847743988037,8.878215789794922,8.856619834899902,8.894339561462402,0.018487285230656875,0.009243642615328437,0
-0036_mannitol_unlabeled_positive_M+H183p0863_9p53,9.065049171447754,9.164083480834961,9.148656845092773,9.08488941192627,9.11566972732544,9.116773128509521,9.065049171447754,9.164083480834961,0.0481037419227615,0.02405187096138075,0
-0037_isoleucine_unlabeled_positive_M+H132p1019_9p71,9.267672538757324,9.295421600341797,9.358003616333008,9.375864028930664,9.324240446090698,9.326712608337402,9.267672538757324,9.375864028930664,0.051105772604218114,0.025552886302109057,0
-0038_xanthosine_unlabeled_positive_M+H285p0830_9p78,9.526045799255371,9.48938274383545,9.510781288146973,9.504246711730957,9.507614135742188,9.507513999938965,9.48938274383545,9.526045799255371,0.015203949810787694,0.007601974905393847,0
-0039_tryptophan_unlabeled_positive_M+H205p0972_10p16,10.33144474029541,10.327781677246094,10.347268104553223,10.343076705932617,10.337392807006836,10.337260723114014,10.327781677246094,10.347268104553223,0.009266094493898432,0.004633047246949216,0
-0040_methionine_unlabeled_positive_M+H150p0583_10p44,10.443852424621582,10.439347267150879,10.46941089630127,10.474287986755371,10.456724643707275,10.456631660461426,10.439347267150879,10.474287986755371,0.01767370234818388,0.00883685117409194,0
-0041_1-methyladenosine_unlabeled_positive_M+H282p1197_10p78,11.050666809082031,11.034899711608887,,,11.042783260345459,11.042783260345459,11.034899711608887,11.050666809082031,0.011149021542889777,0.007883548736572266,0
-0042_proline_unlabeled_positive_M+H116p0706_10p92,10.827990531921387,10.806012153625488,10.871739387512207,10.887017250061035,10.84818983078003,10.849864959716797,10.806012153625488,10.887017250061035,0.03763472791114847,0.018817363955574234,0
-0043_pipecolic_acid_unlabeled_positive_M+H130p0863_10p97,10.955867767333984,10.943641662597656,11.027701377868652,11.047686576843262,10.993724346160889,10.991784572601318,10.943641662597656,11.047686576843262,0.05166480677489114,0.02583240338744557,0
-0044_valine_unlabeled_positive_M+H118p0863_11p12,11.034863471984863,11.012262344360352,11.061145782470703,11.047686576843262,11.038989543914795,11.041275024414062,11.012262344360352,11.061145782470703,0.020799879441150262,0.010399939720575131,0
-0045_5-oxo-proline_unlabeled_positive_M+H130p0499_11p65,11.499093055725098,11.508075714111328,11.50047492980957,11.510473251342773,11.504529237747192,11.50427532196045,11.499093055725098,11.510473251342773,0.00559458905420339,0.002797294527101695,0
-0046_taurine_unlabeled_positive_M+H126p0219_12p16,12.104499816894531,12.101568222045898,12.045284271240234,12.049221992492676,12.075143575668335,12.075395107269287,12.045284271240234,12.104499816894531,0.03226741065106046,0.01613370532553023,0
-0047_ectoine_unlabeled_positive_M+H143p0815_12p50,12.293835639953613,12.278909683227539,12.455086708068848,12.443739891052246,12.367892980575562,12.36878776550293,12.278909683227539,12.455086708068848,0.09444225193198486,0.04722112596599243,0
-0048_carnitine_unlabeled_positive_M+H161p1046_13p29,,13.355096817016602,,13.57872200012207,13.466909408569336,13.466909408569336,13.355096817016602,13.57872200012207,0.15812688341796033,0.11181259155273438,0
-0049_alanine_unlabeled_positive_M+H90p0550_13p41,,13.68701457977295,,,13.68701457977295,13.68701457977295,13.68701457977295,13.68701457977295,,,0
-0050_sucrose_unlabeled_positive_M+H343p1235_13p45,,,13.34303092956543,13.314632415771484,13.328831672668457,13.328831672668457,13.314632415771484,13.34303092956543,0.02008078167931844,0.014199256896972656,0
-0051_threonine_unlabeled_positive_M+H120p0655_13p49,13.462236404418945,13.455792427062988,13.457077980041504,13.46524429321289,13.460087776184082,13.459657192230225,13.455792427062988,13.46524429321289,0.004423993974227501,0.0022119969871137505,0
-0052_cis-4-hydroxy-proline_unlabeled_positive_M+H132p0655_13p67,13.227526664733887,13.222009658813477,13.259950637817383,13.277363777160645,13.246712684631348,13.243738651275635,13.222009658813477,13.277363777160645,0.026413858243097502,0.013206929121548751,0
-0053_4-guanidinobutanoic_acid_unlabeled_positive_M+H146p0924_13p86,13.855559349060059,13.86020565032959,13.917991638183594,13.91241455078125,13.886542797088623,13.88631010055542,13.855559349060059,13.917991638183594,0.03322647837942044,0.01661323918971022,0
-0054_maltose_unlabeled_positive_M+Na365p1054_14p07,13.650632858276367,13.675956726074219,13.638050079345703,13.978111267089844,13.735687732696533,13.663294792175293,13.638050079345703,13.978111267089844,0.16238268195258865,0.08119134097629432,0
-0055_serine_unlabeled_positive_M+H106p0499_14p31,14.32981014251709,14.33325481414795,14.326860427856445,14.324366569519043,14.328572988510132,14.328335285186768,14.324366569519043,14.33325481414795,0.0038330521372662756,0.0019165260686331378,0
-0056_glutamine_unlabeled_positive_M+H147p0764_14p31,14.314238548278809,14.306297302246094,14.326860427856445,14.336118698120117,14.320878744125366,14.320549488067627,14.306297302246094,14.336118698120117,0.013225573537406289,0.006612786768703144,0
-0057_asparagine_unlabeled_positive_M+H133p0608_14p37,14.36136245727539,14.355669021606445,14.359593391418457,14.37166690826416,14.362072944641113,14.360477924346924,14.355669021606445,14.37166690826416,0.006824156717503251,0.0034120783587516254,0
-0058_gamma-Aminobutyric_acid_unlabeled_positive_M+H104p0706_14p39,14.37686824798584,14.366856575012207,14.408004760742188,14.413251876831055,14.391245365142822,14.392436504364014,14.366856575012207,14.413251876831055,0.022852268153811535,0.011426134076905767,0
+0005_uracil_unlabeled_positive_M+H113p0346_1p39,1.879153e+00,1.889282e+00,1.878133e+00,1.892506e+00,1.884768e+00,1.884217e+00,1.878133e+00,1.892506e+00,7.206614e-03,3.603307e-03,0
+0006_thymidine_unlabeled_positive_M+H243p0975_1p60,1.646053e+00,1.667212e+00,1.662199e+00,1.654422e+00,1.657471e+00,1.658310e+00,1.646053e+00,1.667212e+00,9.254048e-03,4.627024e-03,0
+0007_2-hydroxyphenylacetic_acid_unlabeled_positive_M+H153p0546_1p62,1.424544e+00,1.210030e+00,1.662199e+00,1.187282e+00,1.371013e+00,1.317287e+00,1.187282e+00,1.662199e+00,2.216058e-01,1.108029e-01,0
+0008_deoxyuridine_unlabeled_positive_M+H229p0819_1p88,1.940126e+00,1.949231e+00,1.910658e+00,1.932271e+00,1.933071e+00,1.936198e+00,1.910658e+00,1.949231e+00,1.647145e-02,8.235723e-03,0
+0009_acetylcholine_unlabeled_positive_M146p1176_1p96,1.801525e+00,1.771603e+00,1.844610e+00,1.857167e+00,1.818726e+00,1.823067e+00,1.771603e+00,1.857167e+00,3.942978e-02,1.971489e-02,0
+0010_pyridoxine_unlabeled_positive_M+H170p0812_2p16,2.140231e+00,2.159749e+00,2.209199e+00,2.223219e+00,2.183099e+00,2.184474e+00,2.140231e+00,2.223219e+00,3.947044e-02,1.973522e-02,0
+0011_salicylic_acid_unlabeled_positive_M+H139p0390_2p20,,1.841401e+00,1.828227e+00,1.780671e+00,1.816766e+00,1.828227e+00,1.780671e+00,1.841401e+00,3.194554e-02,1.844377e-02,0
+0012_2deoxyadenosine_unlabeled_positive_M+H252p1091_2p23,2.247590e+00,2.275779e+00,2.322836e+00,2.311643e+00,2.289462e+00,2.293711e+00,2.247590e+00,2.322836e+00,3.438155e-02,1.719078e-02,0
+0013_adenine_unlabeled_positive_M+H136p0618_2p56,2.593030e+00,2.625245e+00,2.704113e+00,2.719899e+00,2.660572e+00,2.664679e+00,2.593030e+00,2.719899e+00,6.117022e-02,3.058511e-02,0
+0014_xanthine_unlabeled_positive_M+H153p0407_2p73,2.747974e+00,2.775257e+00,2.766577e+00,2.788285e+00,2.769523e+00,2.770917e+00,2.747974e+00,2.788285e+00,1.691088e-02,8.455441e-03,0
+0015_ribose_unlabeled_positive_M+H151p0601_2p75,2.763174e+00,2.769817e+00,2.835043e+00,2.816185e+00,2.796055e+00,2.793001e+00,2.763174e+00,2.835043e+00,3.509440e-02,1.754720e-02,0
+0016_rhamnose_unlabeled_positive_M+H165p0757_2p80,3.136334e+00,2.960083e+00,,,3.048208e+00,3.048208e+00,2.960083e+00,3.136334e+00,1.246286e-01,8.812571e-02,0
+0017_uridine_unlabeled_positive_M+H245p0768_2p89,2.919913e+00,2.943791e+00,2.917609e+00,2.920497e+00,2.925453e+00,2.920205e+00,2.917609e+00,2.943791e+00,1.228923e-02,6.144614e-03,0
+0018_adenosine_unlabeled_positive_M+H268p1040_3p09,3.075452e+00,3.078128e+00,3.146962e+00,3.140000e+00,3.110135e+00,3.109064e+00,3.075452e+00,3.146962e+00,3.862430e-02,1.931215e-02,0
+0019_hypoxanthine_unlabeled_positive_M+H137p0458_3p10,3.136334e+00,3.140476e+00,3.146962e+00,3.161954e+00,3.146431e+00,3.143719e+00,3.136334e+00,3.161954e+00,1.123472e-02,5.617358e-03,0
+0020_5-methylcytosine_unlabeled_positive_M+H126p0662_4p42,4.396528e+00,4.399042e+00,4.534496e+00,4.545422e+00,4.468872e+00,4.466769e+00,4.396528e+00,4.545422e+00,8.221156e-02,4.110578e-02,0
+0021_2-oxovaleric_acid_unlabeled_positive_M+H117p0546_4p45,,,,3.964742e+00,3.964742e+00,3.964742e+00,3.964742e+00,3.964742e+00,,,0
+0022_cytosine_unlabeled_positive_M+H112p0505_4p83,4.836015e+00,4.846456e+00,4.910717e+00,4.921309e+00,4.878624e+00,4.878586e+00,4.836015e+00,4.921309e+00,4.359777e-02,2.179888e-02,0
+0023_lactic_acid_unlabeled_positive_M+H91p0390_5p06,,5.123689e+00,,,5.123689e+00,5.123689e+00,5.123689e+00,5.123689e+00,,,0
+0024_inosine_unlabeled_positive_M+H269p0880_5p43,5.432004e+00,5.439839e+00,5.391832e+00,5.441697e+00,5.426343e+00,5.435922e+00,5.391832e+00,5.441697e+00,2.338750e-02,1.169375e-02,0
+0025_deoxycytidine_unlabeled_positive_M+H228p0979_5p59,5.601799e+00,5.607846e+00,5.671822e+00,5.671968e+00,5.638359e+00,5.639834e+00,5.601799e+00,5.671968e+00,3.880306e-02,1.940153e-02,0
+0026_nicotinic_acid_unlabeled_positive_M+H124p0393_5p63,5.478054e+00,5.553141e+00,5.671822e+00,5.700731e+00,5.600937e+00,5.612481e+00,5.478054e+00,5.700731e+00,1.038720e-01,5.193602e-02,0
+0027_phenylacetic_acid_unlabeled_positive_M+H137p0597_5p88,,6.089039e+00,6.223823e+00,6.164390e+00,6.159084e+00,6.164390e+00,6.089039e+00,6.223823e+00,6.754837e-02,3.899907e-02,0
+0028_2_deoxyguanosine_unlabeled_positive_M+H268p1040_6p87,6.925776e+00,6.942815e+00,6.964536e+00,6.967518e+00,6.950161e+00,6.953676e+00,6.925776e+00,6.967518e+00,1.963450e-02,9.817249e-03,0
+0029_cytidine_unlabeled_positive_M+H244p0928_6p93,6.802322e+00,6.832951e+00,6.818937e+00,6.858342e+00,6.828138e+00,6.825944e+00,6.802322e+00,6.858342e+00,2.371021e-02,1.185511e-02,0
+0030_N-acetyl-mannosamine_unlabeled_positive_M+Na244p0792_7p15,6.656971e+00,6.657993e+00,6.666838e+00,6.696530e+00,6.669583e+00,6.662416e+00,6.656971e+00,6.696530e+00,1.850284e-02,9.251420e-03,0
+0031_betaine_unlabeled_positive_M118p0863_7p91,7.914711e+00,7.898167e+00,8.152311e+00,8.210945e+00,8.044034e+00,8.033511e+00,7.898167e+00,8.210945e+00,1.608157e-01,8.040783e-02,0
+0032_guanosine_unlabeled_positive_M+H284p0989_8p57,8.490147e+00,8.500515e+00,8.451272e+00,8.477064e+00,8.479749e+00,8.483605e+00,8.451272e+00,8.500515e+00,2.127192e-02,1.063596e-02,0
+0033_phenylalanine_unlabeled_positive_M+H166p0863_8p98,9.065049e+00,9.064213e+00,9.132003e+00,9.117706e+00,9.094743e+00,9.091378e+00,9.064213e+00,9.132003e+00,3.525821e-02,1.762911e-02,0
+0034_leucine_unlabeled_positive_M+H132p1019_9p32,9.267673e+00,9.295422e+00,9.358004e+00,9.375864e+00,9.324240e+00,9.326713e+00,9.267673e+00,9.375864e+00,5.110577e-02,2.555289e-02,0
+0035_urocanic_acid_unlabeled_positive_M+H139p0502_9p35,8.894340e+00,8.865837e+00,8.890594e+00,8.856620e+00,8.876848e+00,8.878216e+00,8.856620e+00,8.894340e+00,1.848729e-02,9.243643e-03,0
+0036_mannitol_unlabeled_positive_M+H183p0863_9p53,9.065049e+00,9.164083e+00,9.148657e+00,9.084889e+00,9.115670e+00,9.116773e+00,9.065049e+00,9.164083e+00,4.810374e-02,2.405187e-02,0
+0037_isoleucine_unlabeled_positive_M+H132p1019_9p71,9.267673e+00,9.295422e+00,9.358004e+00,9.375864e+00,9.324240e+00,9.326713e+00,9.267673e+00,9.375864e+00,5.110577e-02,2.555289e-02,0
+0038_xanthosine_unlabeled_positive_M+H285p0830_9p78,9.526046e+00,9.489383e+00,9.510781e+00,9.504247e+00,9.507614e+00,9.507514e+00,9.489383e+00,9.526046e+00,1.520395e-02,7.601975e-03,0
+0039_tryptophan_unlabeled_positive_M+H205p0972_10p16,1.033144e+01,1.032778e+01,1.034727e+01,1.034308e+01,1.033739e+01,1.033726e+01,1.032778e+01,1.034727e+01,9.266094e-03,4.633047e-03,0
+0040_methionine_unlabeled_positive_M+H150p0583_10p44,1.044385e+01,1.043935e+01,1.046941e+01,1.047429e+01,1.045672e+01,1.045663e+01,1.043935e+01,1.047429e+01,1.767370e-02,8.836851e-03,0
+0041_1-methyladenosine_unlabeled_positive_M+H282p1197_10p78,1.105067e+01,1.103490e+01,,,1.104278e+01,1.104278e+01,1.103490e+01,1.105067e+01,1.114902e-02,7.883549e-03,0
+0042_proline_unlabeled_positive_M+H116p0706_10p92,1.082799e+01,1.080601e+01,1.087174e+01,1.088702e+01,1.084819e+01,1.084986e+01,1.080601e+01,1.088702e+01,3.763473e-02,1.881736e-02,0
+0043_pipecolic_acid_unlabeled_positive_M+H130p0863_10p97,1.095587e+01,1.094364e+01,1.102770e+01,1.104769e+01,1.099372e+01,1.099178e+01,1.094364e+01,1.104769e+01,5.166481e-02,2.583240e-02,0
+0044_valine_unlabeled_positive_M+H118p0863_11p12,1.103486e+01,1.101226e+01,1.106115e+01,1.104769e+01,1.103899e+01,1.104128e+01,1.101226e+01,1.106115e+01,2.079988e-02,1.039994e-02,0
+0045_5-oxo-proline_unlabeled_positive_M+H130p0499_11p65,1.149909e+01,1.150808e+01,1.150047e+01,1.151047e+01,1.150453e+01,1.150428e+01,1.149909e+01,1.151047e+01,5.594589e-03,2.797295e-03,0
+0046_taurine_unlabeled_positive_M+H126p0219_12p16,1.210450e+01,1.210157e+01,1.204528e+01,1.204922e+01,1.207514e+01,1.207540e+01,1.204528e+01,1.210450e+01,3.226741e-02,1.613371e-02,0
+0047_ectoine_unlabeled_positive_M+H143p0815_12p50,1.229384e+01,1.227891e+01,1.245509e+01,1.244374e+01,1.236789e+01,1.236879e+01,1.227891e+01,1.245509e+01,9.444225e-02,4.722113e-02,0
+0048_carnitine_unlabeled_positive_M+H161p1046_13p29,,1.335510e+01,,1.357872e+01,1.346691e+01,1.346691e+01,1.335510e+01,1.357872e+01,1.581269e-01,1.118126e-01,0
+0049_alanine_unlabeled_positive_M+H90p0550_13p41,,1.368701e+01,,,1.368701e+01,1.368701e+01,1.368701e+01,1.368701e+01,,,0
+0050_sucrose_unlabeled_positive_M+H343p1235_13p45,,,1.334303e+01,1.331463e+01,1.332883e+01,1.332883e+01,1.331463e+01,1.334303e+01,2.008078e-02,1.419926e-02,0
+0051_threonine_unlabeled_positive_M+H120p0655_13p49,1.346224e+01,1.345579e+01,1.345708e+01,1.346524e+01,1.346009e+01,1.345966e+01,1.345579e+01,1.346524e+01,4.423994e-03,2.211997e-03,0
+0052_cis-4-hydroxy-proline_unlabeled_positive_M+H132p0655_13p67,1.322753e+01,1.322201e+01,1.325995e+01,1.327736e+01,1.324671e+01,1.324374e+01,1.322201e+01,1.327736e+01,2.641386e-02,1.320693e-02,0
+0053_4-guanidinobutanoic_acid_unlabeled_positive_M+H146p0924_13p86,1.385556e+01,1.386021e+01,1.391799e+01,1.391241e+01,1.388654e+01,1.388631e+01,1.385556e+01,1.391799e+01,3.322648e-02,1.661324e-02,0
+0054_maltose_unlabeled_positive_M+Na365p1054_14p07,1.365063e+01,1.367596e+01,1.363805e+01,1.397811e+01,1.373569e+01,1.366329e+01,1.363805e+01,1.397811e+01,1.623827e-01,8.119134e-02,0
+0055_serine_unlabeled_positive_M+H106p0499_14p31,1.432981e+01,1.433325e+01,1.432686e+01,1.432437e+01,1.432857e+01,1.432834e+01,1.432437e+01,1.433325e+01,3.833052e-03,1.916526e-03,0
+0056_glutamine_unlabeled_positive_M+H147p0764_14p31,1.431424e+01,1.430630e+01,1.432686e+01,1.433612e+01,1.432088e+01,1.432055e+01,1.430630e+01,1.433612e+01,1.322557e-02,6.612787e-03,0
+0057_asparagine_unlabeled_positive_M+H133p0608_14p37,1.436136e+01,1.435567e+01,1.435959e+01,1.437167e+01,1.436207e+01,1.436048e+01,1.435567e+01,1.437167e+01,6.824157e-03,3.412078e-03,0
+0058_gamma-Aminobutyric_acid_unlabeled_positive_M+H104p0706_14p39,1.437687e+01,1.436686e+01,1.440800e+01,1.441325e+01,1.439125e+01,1.439244e+01,1.436686e+01,1.441325e+01,2.285227e-02,1.142613e-02,0
 0059_alpha-ketoglutaric_acid_unlabeled_positive_M+H147p0288_14p51,,,,,,,,,,,0
-0060_mannosamine_unlabeled_positive_M+H180p0866_14p52,14.704928398132324,14.712645530700684,14.6831636428833,14.687517166137695,14.697063684463501,14.69622278213501,14.6831636428833,14.712645530700684,0.014011838106020863,0.007005919053010431,0
-0061_cysteic_acid_unlabeled_positive_M+H170p0118_14p54,14.579874992370605,14.584083557128906,14.537906646728516,14.538466453552246,14.560082912445068,14.559170722961426,14.537906646728516,14.584083557128906,0.025343082088378755,0.012671541044189378,0
-0062_N-acetyl-aspartic_acid_unlabeled_positive_M+H176p0553_14p82,,14.561990737915039,14.634858131408691,14.699416160583496,14.632088343302408,14.634858131408691,14.561990737915039,14.699416160583496,0.06875456707387731,0.03969546780811925,0
-0063_citrulline_unlabeled_positive_M+H176p1030_15p09,15.128581047058105,15.128291130065918,15.154582023620605,15.162788391113281,15.143560647964478,15.141581535339355,15.128291130065918,15.162788391113281,0.017783170397114904,0.008891585198557452,0
-0064_N-alpha-acetyl-lysine_unlabeled_positive_M+H189p1234_15p13,15.159500122070312,15.161605834960938,15.219368934631348,15.223294258117676,15.190942287445068,15.190487384796143,15.159500122070312,15.223294258117676,0.03513764118653772,0.01756882059326886,0
-0065_N-acetyl-glutamic_acid_unlabeled_positive_M+H190p0710_15p16,15.144072532653809,15.128291130065918,15.105835914611816,15.109355926513672,15.121888875961304,15.118823528289795,15.105835914611816,15.144072532653809,0.017775225003882934,0.008887612501941467,0
-0066_raffinose_unlabeled_positive_M+H505p1763_15p53,15.555009841918945,15.555731773376465,15.516404151916504,15.532966613769531,15.540028095245361,15.543988227844238,15.516404151916504,15.555731773376465,0.018964998104185705,0.009482499052092853,0
-0067_glutamic_acid_unlabeled_positive_M+H148p0604_15p94,16.00358009338379,16.012876510620117,16.00017547607422,16.01028060913086,16.006728172302246,16.006930351257324,16.00017547607422,16.012876510620117,0.005867142979506786,0.002933571489753393,0
-0068_Aspartic_acid_unlabeled_positive_M+H134p0448_16p13,16.245359420776367,16.244237899780273,16.233556747436523,16.237499237060547,16.240163326263428,16.24086856842041,16.233556747436523,16.245359420776367,0.00560790521224537,0.002803952606122685,0
-0069_arginine_unlabeled_positive_M+H175p1190_16p94,16.963918685913086,16.961685180664062,16.988910675048828,16.9918212890625,16.97658395767212,16.976414680480957,16.961685180664062,16.9918212890625,0.01598443924833885,0.007992219624169425,0
-0070_lysine_unlabeled_positive_M+H147p1128_17p01,17.043212890625,17.035064697265625,17.05360221862793,17.05995750427246,17.047959327697754,17.048407554626465,17.035064697265625,17.05995750427246,0.011024194876641502,0.005512097438320751,0
-0071_ornithine_unlabeled_positive_M+H133p0972_17p04,17.058874130249023,17.06319236755371,17.085508346557617,17.07795524597168,17.071382522583008,17.070573806762695,17.058874130249023,17.085508346557617,0.0124669979531353,0.00623349897656765,0"""
+0060_mannosamine_unlabeled_positive_M+H180p0866_14p52,1.470493e+01,1.471265e+01,1.468316e+01,1.468752e+01,1.469706e+01,1.469622e+01,1.468316e+01,1.471265e+01,1.401184e-02,7.005919e-03,0
+0061_cysteic_acid_unlabeled_positive_M+H170p0118_14p54,1.457987e+01,1.458408e+01,1.453791e+01,1.453847e+01,1.456008e+01,1.455917e+01,1.453791e+01,1.458408e+01,2.534308e-02,1.267154e-02,0
+0062_N-acetyl-aspartic_acid_unlabeled_positive_M+H176p0553_14p82,,1.456199e+01,1.463486e+01,1.469942e+01,1.463209e+01,1.463486e+01,1.456199e+01,1.469942e+01,6.875457e-02,3.969547e-02,0
+0063_citrulline_unlabeled_positive_M+H176p1030_15p09,1.512858e+01,1.512829e+01,1.515458e+01,1.516279e+01,1.514356e+01,1.514158e+01,1.512829e+01,1.516279e+01,1.778317e-02,8.891585e-03,0
+0064_N-alpha-acetyl-lysine_unlabeled_positive_M+H189p1234_15p13,1.515950e+01,1.516161e+01,1.521937e+01,1.522329e+01,1.519094e+01,1.519049e+01,1.515950e+01,1.522329e+01,3.513764e-02,1.756882e-02,0
+0065_N-acetyl-glutamic_acid_unlabeled_positive_M+H190p0710_15p16,1.514407e+01,1.512829e+01,1.510584e+01,1.510936e+01,1.512189e+01,1.511882e+01,1.510584e+01,1.514407e+01,1.777523e-02,8.887613e-03,0
+0066_raffinose_unlabeled_positive_M+H505p1763_15p53,1.555501e+01,1.555573e+01,1.551640e+01,1.553297e+01,1.554003e+01,1.554399e+01,1.551640e+01,1.555573e+01,1.896500e-02,9.482499e-03,0
+0067_glutamic_acid_unlabeled_positive_M+H148p0604_15p94,1.600358e+01,1.601288e+01,1.600018e+01,1.601028e+01,1.600673e+01,1.600693e+01,1.600018e+01,1.601288e+01,5.867143e-03,2.933571e-03,0
+0068_Aspartic_acid_unlabeled_positive_M+H134p0448_16p13,1.624536e+01,1.624424e+01,1.623356e+01,1.623750e+01,1.624016e+01,1.624087e+01,1.623356e+01,1.624536e+01,5.607905e-03,2.803953e-03,0
+0069_arginine_unlabeled_positive_M+H175p1190_16p94,1.696392e+01,1.696169e+01,1.698891e+01,1.699182e+01,1.697658e+01,1.697641e+01,1.696169e+01,1.699182e+01,1.598444e-02,7.992220e-03,0
+0070_lysine_unlabeled_positive_M+H147p1128_17p01,1.704321e+01,1.703506e+01,1.705360e+01,1.705996e+01,1.704796e+01,1.704841e+01,1.703506e+01,1.705996e+01,1.102419e-02,5.512097e-03,0
+0071_ornithine_unlabeled_positive_M+H133p0972_17p04,1.705887e+01,1.706319e+01,1.708551e+01,1.707796e+01,1.707138e+01,1.707057e+01,1.705887e+01,1.708551e+01,1.246700e-02,6.233499e-03,0"""
+
 command = """\
jq -M '(.cells[] | select(.source[] | contains("predict_rt.generate_outputs(ids, max_cpus, metatlas_repo_path)")).source) \
= ["predict_rt.generate_outputs(ids, max_cpus, metatlas_repo_path, save_to_db=False, model_only=True)"]' \
diff --git a/tests/system/test_targeted.py b/tests/system/test_targeted.py
index 50a5a6df..ea9dd307 100644
--- a/tests/system/test_targeted.py
+++ b/tests/system/test_targeted.py
@@ -4,39 +4,31 @@
 def test_targeted_by_line01_with_remove(tmp_path):
-    image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.2"
+    image = "registry.spin.nersc.gov/metatlas_test/metatlas_ci01:v1.4.4"
     experiment = "20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583"
     expected = {}
     expected[
         str(tmp_path / experiment /
"root0/FinalEMA-HILIC/POS/POS_data_sheets/POS_peak_height.tab") - ] = "\n".join( - [ - f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4", # noqa: E501 - f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5", # noqa: E501 - "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4", - "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4", - "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1", # noqa: E501 - "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t304761.90625\t416788.03125\t837662.0625\t2359861.25", - "0001_adenine_positive_M+H136p0618_2p52\t1594753.875\t12096485.0\t51774956.0\t91955488.0", - "0002_adenosine_positive_M+H268p1041_3p02\t26611868.0\t119774184.0\t267718880.0\t473905024.0", - ] - ) + ] = """group 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S1 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S2 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S3 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S4 +file 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5 +short groupname POS_Cone-S1 POS_Cone-S2 POS_Cone-S3 POS_Cone-S4 +sample treatment Cone-S1 Cone-S2 Cone-S3 Cone-S4 +short filename 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1 +short samplename POS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1 POS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1 POS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1 POS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1 +0000_2deoxyadenosine_positive_M+H252p1091_2p20 3.047619062e+05 4.167880312e+05 8.376620625e+05 2.359861250e+06 +0001_adenine_positive_M+H136p0618_2p52 1.594753875e+06 1.209648500e+07 5.177495600e+07 9.195548800e+07 +0002_adenosine_positive_M+H268p1041_3p02 2.661186800e+07 1.197741840e+08 2.677188800e+08 4.739050240e+08""" expected[ 
        str(tmp_path / experiment / "root0/FinalEMA-HILIC/POS/POS_data_sheets/POS_rt_peak.tab")
-    ] = "\n".join(
-        [
-            f"group\t{experiment}_POS_MSMS_root0_Cone-S1\t{experiment}_POS_MSMS_root0_Cone-S2\t{experiment}_POS_MSMS_root0_Cone-S3\t{experiment}_POS_MSMS_root0_Cone-S4",
-            f"file\t{experiment}_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5\t{experiment}_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5\t{experiment}_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5\t{experiment}_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5",
-            "short groupname\tPOS_Cone-S1\tPOS_Cone-S2\tPOS_Cone-S3\tPOS_Cone-S4",
-            "sample treatment\tCone-S1\tCone-S2\tCone-S3\tCone-S4",
-            "short filename\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1\t20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1",  # noqa: E501
-            "short samplename\tPOS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1\tPOS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1",  # noqa: E501
-            "0000_2deoxyadenosine_positive_M+H252p1091_2p20\t2.2775044441223145\t2.2806363105773926\t2.2833268642425537\t2.2922415733337402",
-            "0001_adenine_positive_M+H136p0618_2p52\t2.6164748668670654\t2.639369249343872\t2.6182913780212402\t2.657374620437622",
-            "0002_adenosine_positive_M+H268p1041_3p02\t3.098848819732666\t3.1250929832458496\t3.1176068782806396\t3.139331817626953",
-        ]
-    )
+    ] = """group 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S1 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S2 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S3 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_root0_Cone-S4
+file 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_49_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run34.h5 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_57_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run40.h5 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_65_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run16.h5 20201106_JGI-AK_PS-KM_505892_OakGall_final_QE-HF_HILICZ_USHXG01583_POS_MSMS_73_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1_Run31.h5
+short groupname POS_Cone-S1 POS_Cone-S2 POS_Cone-S3 POS_Cone-S4
+sample treatment Cone-S1 Cone-S2 Cone-S3 Cone-S4
+short filename 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1 20201106_PS-KM_OakGall_final_HILICZ_POS_Rg70to1050-CE102040-QlobataAkingi-S1
+short samplename POS_Cone-S1_1_Rg70to1050-CE102040-QlobataAkingi-S1 POS_Cone-S2_1_Rg70to1050-CE102040-QlobataAkingi-S1 POS_Cone-S3_1_Rg70to1050-CE102040-QlobataAkingi-S1 POS_Cone-S4_1_Rg70to1050-CE102040-QlobataAkingi-S1
+0000_2deoxyadenosine_positive_M+H252p1091_2p20 2.277504444e+00 2.280636311e+00 2.283326864e+00 2.292241573e+00
+0001_adenine_positive_M+H136p0618_2p52 2.616474867e+00 2.639369249e+00 2.618291378e+00 2.657374620e+00
+0002_adenosine_positive_M+H268p1041_3p02 3.098848820e+00 3.125092983e+00 3.117606878e+00 3.139331818e+00"""
     command = """\
jq -M '(.cells[] | select(.source[] | contains("compound_idx=0")).source) \
+= ["\\n", \