
Commit

Merge branch 'develop' of github.com:LewisResearchGroup/ms-mint into develop
sorenwacker committed Apr 19, 2023
2 parents 7107a0d + 94e3644 commit 2197a4d
Showing 22 changed files with 341 additions and 386 deletions.
25 changes: 10 additions & 15 deletions README.md
@@ -12,7 +12,7 @@ The `ms-mint` library includes a range of functions for processing LCMS data fro
## Documentation
The code documentation can be accessed [here](https://lewisresearchgroup.github.io/ms-mint/readme.html).

## News
## ms-mint application with graphical user interface (GUI)
MINT has been split into the Python library and the app. This repository contains the Python library. For the app follow [this link](https://github.com/LewisResearchGroup/ms-mint-app).

## Contributions
@@ -24,7 +24,7 @@ The project follows PEP8 standard and uses Black and Flake8 to ensure a consiste

---

# Usage
## Usage

To use the main class of the ms-mint library, you can import it into your code with the following command:
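A minimal sketch (the exact command is collapsed in this diff view; the import path is an assumption inferred from the `ms_mint/Mint.py` module changed below):

```python
# Assumed import path, inferred from ms_mint/Mint.py in this diff;
# the collapsed README lines may show a different one.
from ms_mint.Mint import Mint

mint = Mint(verbose=True)  # verbose flag matches the __init__ signature below
```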

@@ -145,7 +145,6 @@ Then we apply the changes and plot the new peak shapes:

As you can see, the shapes of Xanthine, Succinate, and Citrulline look much better.


## Plotting and data exploration

The `Mint` class has a few convenient methods to visualize and explore the processed data. The results can be viewed directly in JupyterLab or saved to files using `matplotlib` and `seaborn` syntax. The figures are matplotlib objects and can be easily customised.
@@ -158,7 +157,6 @@ The `Mint` class has a few convenient methods to visualize and explore the proce

The method uses seaborn's [sns.relplot()](https://seaborn.pydata.org/generated/seaborn.relplot.html) function, and keyword arguments are passed on to it.
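As a hedged illustration (the `plot` attribute and the `peak_shapes()` method are assumed names, not confirmed by this diff; only the `MintPlotter` import in `Mint.py` and the `sns.relplot()` pass-through are documented here):

```python
# Hypothetical call; mint.plot / peak_shapes() are assumed names.
# Keyword arguments such as col_wrap would be passed on to sns.relplot().
g = mint.plot.peak_shapes(col_wrap=3)
g.savefig("peak_shapes.png")  # sns.relplot() returns a seaborn FacetGrid
```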


## Hierarchical clustering
Mint can be used to cluster the extracted data. Agglomerative hierarchical clustering is a bottom-up technique in which each data point starts as its own cluster; clusters are then merged hierarchically, based on their proximity or similarity, until a single cluster remains or a specified stopping criterion is met. Proximity is usually determined by a distance metric, such as Euclidean distance, and similarity by a linkage function, such as ward or complete linkage. The result is a tree-like representation called a dendrogram, which can be used to determine the number of clusters and the cluster assignments of the data points.

@@ -188,7 +186,6 @@ Before clustering the data can be transformed and scaled. By default `log2p1(x)`
cmap=None # Name of a matplotlib color map
)


![](notebooks/hierarchical_clustering.png)
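For reference, a minimal standalone sketch of the same idea using `scipy` (this is not MINT's internal implementation, just an illustration of agglomerative clustering and dendrograms):

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

data = np.random.rand(10, 5)                     # 10 samples, 5 features
Z = linkage(data, method="ward")                 # ward linkage on Euclidean distances
dendrogram(Z)                                    # tree-like representation
plt.show()
labels = fcluster(Z, t=3, criterion="maxclust")  # cut the tree into 3 clusters
```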

## Principal Components Analysis
@@ -205,8 +202,8 @@ After running the PCA the results can be plotted with:

![](notebooks/pca-pairplot.png)
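A hedged sketch of the same analysis with scikit-learn, built on the `crosstab()` method whose signature appears in the `Mint.py` diff below (MINT ships its own `PrincipalComponentsAnalyser`; this generic version only illustrates the idea, and `"standard"` as a scaler name is an assumption):

```python
from sklearn.decomposition import PCA

# Samples-by-targets matrix from the processed results.
df = mint.crosstab(col_name="peak_max", scaler="standard").fillna(0)
pca = PCA(n_components=3)
components = pca.fit_transform(df)    # one row of scores per MS file
print(pca.explained_variance_ratio_)  # variance captured per component
```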

# FAQ
## What is a target list
## FAQ
### What is a target list
A target list is a pandas dataframe with specific columns.

- **peak_label**: string, Label of the peak (must be unique).
@@ -221,11 +218,11 @@ A target list is a pandas dataframe with specific columns.

The target list can be stored as a CSV or Excel file.
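A minimal sketch of such a dataframe (column names follow the fields above and the parameters listed below; the m/z and retention-time values are illustrative only):

```python
import pandas as pd

targets = pd.DataFrame({
    "peak_label": ["Xanthine", "Succinate"],  # must be unique
    "mz_mean": [151.0256, 117.0193],          # illustrative m/z values
    "rt_min": [1.5, 2.0],                     # start of extraction window
    "rt_max": [2.5, 3.0],                     # end of extraction window
})
targets.to_csv("targets.csv", index=False)    # or targets.to_excel(...)
```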

## What input files can be used
### What input files can be used
`ms_mint` can be used with `mzXML`, `mzML`, and `mzMLb` files, as well as the experimental `.feather` and `.parquet` formats.
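A hedged end-to-end sketch, assuming `ms_files` and `targets` can be assigned directly, as they are used in the `Mint.py` diff below (file paths are placeholders):

```python
from glob import glob

mint.ms_files = glob("data/*.mzML")    # assumed to accept a list of paths
mint.targets = targets                 # target list from the sketch above
mint.run(nthreads=4, mode="standard")  # run() signature shown in Mint.py below
print(mint.results.head())             # long-format results table
```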

## Which properties does ms-mint extract
### Parameters from target list
### Which properties does ms-mint extract
#### Parameters from target list
- **ms_file**: Filename of MS-file
- **peak_label**: From target list
- **mz_mean**: From target list
@@ -239,7 +236,7 @@ The target list can be stored as csv or Excel file.

---

### Results columns
#### Results columns
- **peak_area**: The sum of all intensities in the extraction window
- **peak_area_top3**: The average of the 3 largest intensities in the extraction window
- **peak_n_datapoints**: Number of datapoints
@@ -260,9 +257,7 @@ The target list can be stored as csv or Excel file.
- **ms_file_size**: Size of the MS-file in MB
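These long-format columns can be pivoted into a samples-by-targets matrix with `crosstab()`, whose signature (`col_name="peak_max"`, `apply`, `scaler`) appears in the `Mint.py` diff below; `peak_area_top3` is just one possible choice of value column:

```python
# One row per MS file, one column per peak_label,
# values taken from the chosen results column.
matrix = mint.crosstab(col_name="peak_area_top3")
matrix.to_csv("peak_area_top3_matrix.csv")
```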
---

## Release Notes

# Release Notes

## 0.2.0 Milestones
### 0.2.0 Milestones
- peak_area_top3 comparable to El-Maven PeakAreaTop values

140 changes: 62 additions & 78 deletions ms_mint/Mint.py
@@ -22,7 +22,8 @@
)
from .pca import PrincipalComponentsAnalyser
from .plotting import MintPlotter
#from .filter import Resampler

# from .filter import Resampler
from .chromatogram import Chromatogram, extract_chromatogram_from_ms1

import ms_mint
@@ -45,11 +46,15 @@ class Mint(object):
"""

def __init__(self, verbose: bool = False, progress_callback=None, time_unit="s"):

self._verbose = verbose
def __init__(
self,
verbose: bool = False,
progress_callback: Callable = None,
time_unit: str = "s",
):
self.verbose = verbose
self._version = ms_mint.__version__
self._progress_callback = progress_callback
self.progress_callback = progress_callback
self.reset()
if self.verbose:
print("Mint Version:", self.version, "\n")
@@ -59,24 +64,6 @@ def __init__(self, verbose: bool = False, progress_callback=None, time_unit="s")
self.tqdm = tqdm
self.meta = None

@property
def verbose(self):
"""
Get/set verbosity.
:getter: Get current verbosity.
:return: True or False
:rtype: bool
:setter: Sets verbosity.
:param value: True or False
:type value: bool
"""
return self._verbose

@verbose.setter
def verbose(self, value: bool):
self._verbose = value

@property
def version(self):
"""
@@ -133,11 +120,11 @@ def run(self, nthreads=None, rt_margin=0.5, mode="standard", fn=None, **kwargs):
* 1: Run without multiprocessing on one CPU
* >1: Run with multiprocessing enabled using nthreads threads.
:param mode: Compute mode ('standard' or 'express'), defaults to 'standard'
:type mode: str
* 'standard': calculates peak shapes projected to the RT dimension
* 'express': omits calculation of other features, only peak_areas
:param fn: Output filename to not keep results in memory.
:type fn: str
:type mode: str
:param kwargs: Arguments passed to the processing function.
"""
self._status = "running"
@@ -146,39 +133,57 @@ def run(self, nthreads=None, rt_margin=0.5, mode="standard", fn=None, **kwargs):
return None

targets = self.targets.reset_index()
self._set_rt_min_max(targets, rt_margin)

if "rt" in targets.columns:
ndx = (targets.rt_min.isna()) & (~targets.rt.isna())
targets.loc[ndx, "rt_min"] = targets.loc[ndx, "rt"] - rt_margin
ndx = (targets.rt_max.isna()) & (~targets.rt.isna())
targets.loc[ndx, "rt_max"] = targets.loc[ndx, "rt"] + rt_margin
del ndx

if nthreads is None:
nthreads = min(cpu_count(), self.n_files)
nthreads = self._determine_nthreads(nthreads)

if self.verbose:
print(f"Run MINT with {nthreads} processes:")

start = time.time()
if nthreads > 1:
self.__run_parallel__(nthreads=nthreads, mode=mode, fn=fn, **kwargs)
self._run_parallel(nthreads=nthreads, mode=mode, fn=fn, **kwargs)
else:
results = []
for i, filename in enumerate(self.ms_files):
args = {
"filename": filename,
"targets": targets,
"q": None,
"mode": mode,
"output_fn": None,
}
results.append(process_ms1_files_in_parallel(args))
self.progress = int(100 * (i / self.n_files))
self.results = pd.concat(results).reset_index(drop=True)
self._run_sequential(mode=mode, fn=fn, targets=targets)

self.progress = 100
self._report_runtime(start)

self._status = "done"
assert self.progress == 100
return self

def _set_rt_min_max(self, targets, rt_margin):
if "rt" in targets.columns:
update_rt_min = (targets.rt_min.isna()) & (~targets.rt.isna())
targets.loc[update_rt_min, "rt_min"] = (
targets.loc[update_rt_min, "rt"] - rt_margin
)
update_rt_max = (targets.rt_max.isna()) & (~targets.rt.isna())
targets.loc[update_rt_max, "rt_max"] = (
targets.loc[update_rt_max, "rt"] + rt_margin
)

def _determine_nthreads(self, nthreads):
if nthreads is None:
nthreads = min(cpu_count(), self.n_files)
return nthreads

def _run_sequential(self, mode, fn, targets):
results = []
for i, filename in enumerate(self.ms_files):
args = {
"filename": filename,
"targets": targets,
"q": None,
"mode": mode,
"output_fn": None,
}
results.append(process_ms1_files_in_parallel(args))
self.progress = int(100 * (i / self.n_files))
self.results = pd.concat(results).reset_index(drop=True)

def _report_runtime(self, start):
end = time.time()
self.runtime = end - start
self.runtime_per_file = self.runtime / self.n_files
Expand All @@ -192,11 +197,7 @@ def run(self, nthreads=None, rt_margin=0.5, mode="standard", fn=None, **kwargs):
)
print("Results:", self.results)

self._status = "done"
assert self.progress == 100
return self

def __run_parallel__(
def _run_parallel(
self, nthreads=1, mode="standard", maxtasksperchild=None, fn=None
):
print(f"maxtasksperchild: {maxtasksperchild}")
@@ -221,17 +222,7 @@ def __run_parallel__(
)

results = pool.map_async(process_ms1_files_in_parallel, args)

# monitor progress
while True:
if results.ready():
break
else:
size = q.qsize()
self.progress = 100 * size / self.n_files
time.sleep(1)

self.progress = 100
self._monitor_progress(results, q)

pool.close()
pool.join()
Expand All @@ -240,6 +231,13 @@ def __run_parallel__(
results = results.get()
self.results = pd.concat(results).reset_index(drop=True)

def _monitor_progress(self, results, q):
while not results.ready():
size = q.qsize()
self.progress = 100 * size / self.n_files
time.sleep(1)
self.progress = 100

@property
def status(self):
"""
@@ -398,20 +396,6 @@ def crosstab(self, col_name="peak_max", apply=None, scaler=None):
df = scale_dataframe(df, scaler=scaler)
return df

@property
def progress_callback(self):
"""
Assigns a callback function to update a progress bar.
:getter: Returns the current callback function.
:setter: Sets the callback function.
"""
return self._progress_callback

@progress_callback.setter
def progress_callback(self, func: Callable = None):
self._progress_callback = func

@property
def progress(self):
"""
@@ -501,7 +485,7 @@ def get_chromatograms(self, fns=None, peak_label=None, **kwargs):
peak_label = self.peak_labels

data = []

for fn in self.tqdm(fns):
df = ms_file_to_df(fn)
for label in peak_label:
@@ -517,7 +501,7 @@ def get_chromatograms(self, fns=None, peak_label=None, **kwargs):
chrom_data["ms_file"] = fn
chrom_data["peak_label"] = label
chrom_data["rt_min"] = rt_min
chrom_data["rt_max"] = rt_min
chrom_data["rt_max"] = rt_max
data.append(chrom_data)

data = pd.concat(data).reset_index()
2 changes: 1 addition & 1 deletion ms_mint/_version.py
@@ -87,8 +87,8 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=
assert isinstance(commands, list)
p = None
for c in commands:
dispcmd = str([c] + args)
try:
dispcmd = str([c] + args)
# remember shell=False, so use git.cmd on windows, not just git
p = subprocess.Popen(
[c] + args,
17 changes: 8 additions & 9 deletions ms_mint/chromatogram.py
@@ -6,13 +6,13 @@

from .tools import find_peaks_in_timeseries, gaussian, mz_mean_width_to_min_max
from .io import ms_file_to_df
from .filter import Resampler, Smoother, GaussFilter
from .filters import Resampler, Smoother, GaussFilter
from .matplotlib_tools import plot_peaks


class Chromatogram:
def __init__(
self, scan_times=None, intensities=None, filter=None, expected_rt=None
self, scan_times=None, intensities=None, filters=None, expected_rt=None
):
self.t = np.array([0])
self.x = np.array([0])
@@ -21,10 +21,10 @@ def __init__(
if intensities is not None:
self.x = np.append(self.x, intensities)
self.noise_level = None
if filter is None:
self.filter = [Resampler(), GaussFilter(), Smoother()]
if filters is None:
self.filters = [Resampler(), GaussFilter(), Smoother()]
else:
self.filter = filter
self.filters = filters
self.peaks = None
self.selected_peak_ndxs = None
if expected_rt is None and scan_times is not None:
@@ -43,8 +43,8 @@ def estimate_noise_level(self, window=20):
data = pd.Series(index=self.t, data=self.x)
self.noise_level = data.rolling(window, center=True).std().median()

def apply_filter(self):
for filt in self.filter:
def apply_filters(self):
for filt in self.filters:
self.t, self.x = filt.transform(self.t, self.x)

def find_peaks(self, prominence=None, rel_height=0.9):
@@ -80,7 +80,6 @@ def optimise_peak_times_with_diff(self, rolling_window=20, plot=False):
plt.show()

for ndx, row in peaks.iterrows():

new_rt_min = row.rt_min
new_rt_max = row.rt_max

@@ -130,7 +129,7 @@ def select_peak_with_gaussian_weight(self, expected_rt=None, sigma=50):

@property
def selected_peaks(self):
self.peaks.loc[self.selected_peak_ndxs]
return self.peaks.loc[self.selected_peak_ndxs]

@property
def data(self):