diff --git a/README.md b/README.md
index 902175df..c87d519a 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The `ms-mint` library includes a range of functions for processing LCMS data fro
 ## Documentation
 The code documentation can be accessed [here](https://lewisresearchgroup.github.io/ms-mint/readme.html).
 
-## News
+## ms-mint application with graphical user interface (GUI)
 MINT has been split into the Python library and the app. This repository contains the Python library. For the app follow [this link](https://github.com/LewisResearchGroup/ms-mint-app).
 
 ## Contributions
@@ -24,7 +24,7 @@ The project follows PEP8 standard and uses Black and Flake8 to ensure a consiste
 
 ---
 
-# Usage
+## Usage
 
 To use the main class of the ms-mint library, you can import it into your code with the following command:
 
@@ -145,7 +145,6 @@ Then we apply the changes and plot the new peak shapes:
 
 As you can see, the shapes of Xanthine, Succinate, and Citrulline look much better.
 
-
 ## Plotting and data exploration
 
 The `Mint` class has a few convenient methods to visualize and explore the processed data. The results can be viewed directly in JupyterLab or stored to files using `matplotlib` and `seaborn` syntax. The figures are matplotlib objects and can be easily customised.
@@ -158,7 +157,6 @@ The `Mint` class has a few convenient methods to visualize and explore the proce
 
 The method uses the seaborn [sns.relplot()](https://seaborn.pydata.org/generated/seaborn.relplot.html) function, and keyword arguments are passed on.
 
-
 ## Hierarchical clustering
 
 Mint can be used to cluster the extracted data. Agglomerative hierarchical clustering is a bottom-up clustering technique in which each data point starts as its own cluster; clusters are then merged in a hierarchical manner, based on their proximity or similarity, until a single cluster is formed or a specified stopping criterion is met. The proximity is usually determined by a distance metric, such as Euclidean distance, and the similarity is usually determined by a linkage function, such as ward linkage or complete linkage. The result is a tree-like representation called a dendrogram, which can be used to determine the number of clusters and the cluster assignments of the data points.
@@ -188,7 +186,6 @@ Before clustering the data can be transformed and scaled. By default `log2p1(x)
         cmap=None  # Name of a matplotlib color map
     )
 
-
 ![](notebooks/hierarchical_clustering.png)
 
 ## Principal Components Analysis
@@ -205,8 +202,8 @@ After running the PCA the results can be plotted with:
 
 ![](notebooks/pca-pairplot.png)
 
-# FAQ
-## What is a target list
+## FAQ
+### What is a target list
 A target list is a pandas dataframe with specific columns.
 
 - **peak_label**: string, Label of the peak (must be unique).
@@ -221,11 +218,11 @@ A target list is a pandas dataframe with specific columns.
 
 The target list can be stored as csv or Excel file.
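+
+A minimal target list can be built directly with pandas and written to disk. The values below are purely illustrative; they mirror the fixtures used in the test suite further down in this diff:
+
+```python
+import pandas as pd
+
+targets = pd.DataFrame(
+    {
+        "peak_label": ["A"],
+        "mz_mean": [128.034],
+        "mz_width": [10],
+        "rt": [300.0],
+        "rt_min": [286.0],
+        "rt_max": [330.0],
+        "rt_unit": ["s"],
+        "intensity_threshold": [0],
+        "target_filename": ["unknown"],
+    }
+)
+targets.to_csv("targets.csv", index=False)
+```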
 
-## What input files can be used
+### What input files can be used
 
 `ms_mint` can be used with `mzXML`, `mzML`, `mzMLb` and experimental formats in `.feather` and `.parquet` format.
 
-## Which properties does ms-mint extract
-### Parameters from target list
+### Which properties does ms-mint extract
+#### Parameters from target list
 
 - **ms_file**: Filename of MS-file
 - **peak_label**: From target list
 - **mz_mean**: From target list
@@ -239,7 +236,7 @@ The target list can be stored as csv or Excel file.
 
 ---
 
-### Results columns
+#### Results columns
 
 - **peak_area**: The sum of all intensities in the extraction window
 - **peak_area_top3**: The average of the 3 largest intensities in the extraction window
 - **peak_n_datapoints**: Number of datapoints
@@ -260,9 +257,7 @@ The target list can be stored as csv or Excel file.
 - **ms_file_size**: Size of the MS-file in MB
 
 ---
+## Release Notes
 
-# Release Notes
-
-## 0.2.0 Milestones
+### 0.2.0 Milestones
 
 - peak_area_top3 comparable to El-Maven PeakAreaTop values
-
diff --git a/ms_mint/Mint.py b/ms_mint/Mint.py
index 5be7374d..ff3a32a3 100644
--- a/ms_mint/Mint.py
+++ b/ms_mint/Mint.py
@@ -22,7 +22,8 @@
 )
 from .pca import PrincipalComponentsAnalyser
 from .plotting import MintPlotter
-#from .filter import Resampler
+
+# from .filters import Resampler
 from .chromatogram import Chromatogram, extract_chromatogram_from_ms1
 
 import ms_mint
 
@@ -45,11 +46,15 @@ class Mint(object):
     """
 
-    def __init__(self, verbose: bool = False, progress_callback=None, time_unit="s"):
-
-        self._verbose = verbose
+    def __init__(
+        self,
+        verbose: bool = False,
+        progress_callback: Callable = None,
+        time_unit: str = "s",
+    ):
+        self.verbose = verbose
         self._version = ms_mint.__version__
-        self._progress_callback = progress_callback
+        self.progress_callback = progress_callback
         self.reset()
         if self.verbose:
             print("Mint Version:", self.version, "\n")
@@ -59,24 +64,6 @@ def __init__(self, verbose: bool = False, progress_callback=None, time_unit="s")
         self.tqdm = tqdm
         self.meta = None
 
-    @property
-    def verbose(self):
-        """
-        Get/set verbosity.
-
-        :getter: Get current verbosity.
-        :return: True or False
-        :rtype: bool
-        :setter: Sets verbosity.
-        :param value: True or False
-        :type value: bool
-        """
-        return self._verbose
-
-    @verbose.setter
-    def verbose(self, value: bool):
-        self._verbose = value
-
     @property
     def version(self):
         """
@@ -133,11 +120,11 @@ def run(self, nthreads=None, rt_margin=0.5, mode="standard", fn=None, **kwargs):
             * 1: Run without multiprocessing on one CPU
             * >1: Run with multiprocessing enabled using nthreads threads.
         :param mode: Compute mode ('standard' or 'express'), defaults to 'standard'
+        :type mode: str
             * 'standard': calculates peak shapes projected to RT dimension
             * 'express': omits calculation of other features, only peak_areas
         :param fn: Output filename; if given, results are written to file instead of kept in memory.
         :type fn: str
-        :type mode: str
         :param kwargs: Arguments passed to the processing function.
""" self._status = "running" @@ -146,39 +133,57 @@ def run(self, nthreads=None, rt_margin=0.5, mode="standard", fn=None, **kwargs): return None targets = self.targets.reset_index() + self._set_rt_min_max(targets, rt_margin) - if "rt" in targets.columns: - ndx = (targets.rt_min.isna()) & (~targets.rt.isna()) - targets.loc[ndx, "rt_min"] = targets.loc[ndx, "rt"] - rt_margin - ndx = (targets.rt_max.isna()) & (~targets.rt.isna()) - targets.loc[ndx, "rt_max"] = targets.loc[ndx, "rt"] + rt_margin - del ndx - - if nthreads is None: - nthreads = min(cpu_count(), self.n_files) + nthreads = self._determine_nthreads(nthreads) if self.verbose: print(f"Run MINT with {nthreads} processes:") start = time.time() if nthreads > 1: - self.__run_parallel__(nthreads=nthreads, mode=mode, fn=fn, **kwargs) + self._run_parallel(nthreads=nthreads, mode=mode, fn=fn, **kwargs) else: - results = [] - for i, filename in enumerate(self.ms_files): - args = { - "filename": filename, - "targets": targets, - "q": None, - "mode": mode, - "output_fn": None, - } - results.append(process_ms1_files_in_parallel(args)) - self.progress = int(100 * (i / self.n_files)) - self.results = pd.concat(results).reset_index(drop=True) + self._run_sequential(mode=mode, fn=fn, targets=targets) self.progress = 100 + self._report_runtime(start) + + self._status = "done" + assert self.progress == 100 + return self + def _set_rt_min_max(self, targets, rt_margin): + if "rt" in targets.columns: + update_rt_min = (targets.rt_min.isna()) & (~targets.rt.isna()) + targets.loc[update_rt_min, "rt_min"] = ( + targets.loc[update_rt_min, "rt"] - rt_margin + ) + update_rt_max = (targets.rt_max.isna()) & (~targets.rt.isna()) + targets.loc[update_rt_max, "rt_max"] = ( + targets.loc[update_rt_max, "rt"] + rt_margin + ) + + def _determine_nthreads(self, nthreads): + if nthreads is None: + nthreads = min(cpu_count(), self.n_files) + return nthreads + + def _run_sequential(self, mode, fn, targets): + results = [] + for i, filename in enumerate(self.ms_files): + args = { + "filename": filename, + "targets": targets, + "q": None, + "mode": mode, + "output_fn": None, + } + results.append(process_ms1_files_in_parallel(args)) + self.progress = int(100 * (i / self.n_files)) + self.results = pd.concat(results).reset_index(drop=True) + + def _report_runtime(self, start): end = time.time() self.runtime = end - start self.runtime_per_file = self.runtime / self.n_files @@ -192,11 +197,7 @@ def run(self, nthreads=None, rt_margin=0.5, mode="standard", fn=None, **kwargs): ) print("Results:", self.results) - self._status = "done" - assert self.progress == 100 - return self - - def __run_parallel__( + def _run_parallel( self, nthreads=1, mode="standard", maxtasksperchild=None, fn=None ): print(f"maxtasksperchild: {maxtasksperchild}") @@ -221,17 +222,7 @@ def __run_parallel__( ) results = pool.map_async(process_ms1_files_in_parallel, args) - - # monitor progress - while True: - if results.ready(): - break - else: - size = q.qsize() - self.progress = 100 * size / self.n_files - time.sleep(1) - - self.progress = 100 + self._monitor_progress(results, q) pool.close() pool.join() @@ -240,6 +231,13 @@ def __run_parallel__( results = results.get() self.results = pd.concat(results).reset_index(drop=True) + def _monitor_progress(self, results, q): + while not results.ready(): + size = q.qsize() + self.progress = 100 * size / self.n_files + time.sleep(1) + self.progress = 100 + @property def status(self): """ @@ -398,20 +396,6 @@ def crosstab(self, col_name="peak_max", apply=None, 
scaler=None): df = scale_dataframe(df, scaler=scaler) return df - @property - def progress_callback(self): - """ - Assigns a callback function to update a progress bar. - - :getter: Returns the current callback function. - :setter: Sets the callback function. - """ - return self._progress_callback - - @progress_callback.setter - def progress_callback(self, func: Callable = None): - self._progress_callback = func - @property def progress(self): """ @@ -501,7 +485,7 @@ def get_chromatograms(self, fns=None, peak_label=None, **kwargs): peak_label = self.peak_labels data = [] - + for fn in self.tqdm(fns): df = ms_file_to_df(fn) for label in peak_label: @@ -517,7 +501,7 @@ def get_chromatograms(self, fns=None, peak_label=None, **kwargs): chrom_data["ms_file"] = fn chrom_data["peak_label"] = label chrom_data["rt_min"] = rt_min - chrom_data["rt_max"] = rt_min + chrom_data["rt_max"] = rt_max data.append(chrom_data) data = pd.concat(data).reset_index() diff --git a/ms_mint/_version.py b/ms_mint/_version.py index d6fc4576..d1995468 100644 --- a/ms_mint/_version.py +++ b/ms_mint/_version.py @@ -87,8 +87,8 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= assert isinstance(commands, list) p = None for c in commands: + dispcmd = str([c] + args) try: - dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen( [c] + args, diff --git a/ms_mint/chromatogram.py b/ms_mint/chromatogram.py index 5397a567..74d6abe1 100644 --- a/ms_mint/chromatogram.py +++ b/ms_mint/chromatogram.py @@ -6,13 +6,13 @@ from .tools import find_peaks_in_timeseries, gaussian, mz_mean_width_to_min_max from .io import ms_file_to_df -from .filter import Resampler, Smoother, GaussFilter +from .filters import Resampler, Smoother, GaussFilter from .matplotlib_tools import plot_peaks class Chromatogram: def __init__( - self, scan_times=None, intensities=None, filter=None, expected_rt=None + self, scan_times=None, intensities=None, filters=None, expected_rt=None ): self.t = np.array([0]) self.x = np.array([0]) @@ -21,10 +21,10 @@ def __init__( if intensities is not None: self.x = np.append(self.x, intensities) self.noise_level = None - if filter is None: - self.filter = [Resampler(), GaussFilter(), Smoother()] + if filters is None: + self.filters = [Resampler(), GaussFilter(), Smoother()] else: - self.filter = filter + self.filters = filters self.peaks = None self.selected_peak_ndxs = None if expected_rt is None and scan_times is not None: @@ -43,8 +43,8 @@ def estimate_noise_level(self, window=20): data = pd.Series(index=self.t, data=self.x) self.noise_level = data.rolling(window, center=True).std().median() - def apply_filter(self): - for filt in self.filter: + def apply_filters(self): + for filt in self.filters: self.t, self.x = filt.transform(self.t, self.x) def find_peaks(self, prominence=None, rel_height=0.9): @@ -80,7 +80,6 @@ def optimise_peak_times_with_diff(self, rolling_window=20, plot=False): plt.show() for ndx, row in peaks.iterrows(): - new_rt_min = row.rt_min new_rt_max = row.rt_max @@ -130,7 +129,7 @@ def select_peak_with_gaussian_weight(self, expected_rt=None, sigma=50): @property def selected_peaks(self): - self.peaks.loc[self.selected_peak_ndxs] + return self.peaks.loc[self.selected_peak_ndxs] @property def data(self): diff --git a/ms_mint/filelock.py b/ms_mint/filelock.py index 14d910ca..8d07a6fa 100644 --- a/ms_mint/filelock.py +++ b/ms_mint/filelock.py @@ -50,15 +50,6 @@ except ImportError: fcntl = None - -# Backward compatibility -# 
------------------------------------------------
-try:
-    TimeoutError
-except NameError:
-    TimeoutError = OSError
-
-
 # Data
 # ------------------------------------------------
 __all__ = [
@@ -105,6 +96,7 @@ def __str__(self):
 # Classes
 # ------------------------------------------------
 
+
 # This is a helper class which is returned by :meth:`BaseFileLock.acquire`
 # and wraps the lock to make sure __enter__ is not called twice when entering
 # the with statement.
@@ -310,7 +302,6 @@ def release(self, force=False):
             every case.
         """
         with self._thread_lock:
-
             if self.is_locked:
                 self._lock_counter -= 1
 
diff --git a/ms_mint/filter.py b/ms_mint/filters.py
similarity index 96%
rename from ms_mint/filter.py
rename to ms_mint/filters.py
index df4b7dd9..0e7416c8 100644
--- a/ms_mint/filter.py
+++ b/ms_mint/filters.py
@@ -49,7 +49,7 @@ class Smoother:
     averages.
     """
 
-    def __init__(self, windows=[30, 20]):
+    def __init__(self, windows=None):
         """
         Filter for time series that smoothes the x values by running one or more rolling
@@ -58,6 +58,8 @@
         :param windows: Window sizes of rolling averages applied to time series, defaults to [30, 20]
         :type windows: List[int], optional
         """
+        if windows is None:
+            windows = [30, 20]
         self.windows = windows
         self.name = "smoother"
 
diff --git a/ms_mint/io.py b/ms_mint/io.py
index 9ad55bd8..58ba325d 100644
--- a/ms_mint/io.py
+++ b/ms_mint/io.py
@@ -6,7 +6,9 @@
 import numpy as np
 import io
 import logging
-import pymzml
+
+import pathlib
+from typing import Union, Optional
 
 from pathlib import Path as P
 from datetime import date
@@ -31,7 +33,7 @@
 ]
 
 
-def ms_file_to_df(fn, read_only: bool = False, time_unit="seconds"):
+def ms_file_to_df(fn, read_only: bool = False):
     """
     Read MS file and convert it to a pandas.DataFrame.
 
     :param fn: Filename
     :type fn: str or PosixPath
     :param read_only: Whether or not to convert to dataframe (for testing purposes), defaults to False
     :type read_only: bool, optional
-    :param time_unit: Time unit used in file, defaults to "seconds"
-    :type time_unit: str, ['seconds', 'minutes']
     :return: MS data as DataFrame
     :rtype: pandas.DataFrame
     """
-    assert time_unit in ["minutes", "seconds"]
-
     fn = str(fn)
 
     try:
         if fn.lower().endswith(".mzxml"):
             df = mzxml_to_df(fn, read_only=read_only)
         elif fn.lower().endswith(".mzml"):
-            df = mzml_to_df(fn, read_only=read_only, time_unit=time_unit)
+            df = mzml_to_df(fn, read_only=read_only)
         elif fn.lower().endswith("hdf"):
             df = pd.read_hdf(fn)
         elif fn.lower().endswith(".feather"):
@@ -75,163 +74,138 @@
                 "m/z array": "mz",
             }
         )
+
+    set_dtypes(df)
+
+    return df
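
For orientation, `ms_file_to_df` above is the generic entry point: it dispatches on the file extension and returns one long-format DataFrame. A short usage sketch (the filename is hypothetical):

```python
from ms_mint.io import ms_file_to_df

# Accepts .mzXML, .mzML, .feather, .parquet, and hdf files.
df = ms_file_to_df("example.mzML")  # hypothetical file

# One row per (scan, m/z) pair with the normalized columns:
# scan_id, ms_level, polarity, scan_time, mz, intensity
print(df.head())
```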
 
 
-def mzxml_to_df(fn, read_only=False):
+def mzxml_to_df(
+    fn: Union[str, pathlib.Path],
+    read_only: bool = False,
+    time_unit_in_file: str = "min",
+) -> Optional[pd.DataFrame]:
     """
     Read mzXML file and convert it to pandas.DataFrame.
 
     :param fn: Filename
-    :type fn: str or PosixPath
+    :type fn: Union[str, pathlib.Path]
     :param read_only: Whether or not to convert to dataframe (for testing purposes), defaults to False
     :type read_only: bool, optional
+    :param time_unit_in_file: The time unit used in the mzXML file (either 'sec' or 'min'), defaults to 'min'
+    :type time_unit_in_file: str, optional
     :return: MS data
-    :rtype: pandas.DataFrame
+    :rtype: Optional[pd.DataFrame]
     """
+    assert str(fn).lower().endswith(".mzxml"), fn
+
     with mzxml.MzXML(fn) as ms_data:
         data = [x for x in ms_data]
 
     if read_only:
         return None
 
-    data = list(extract_mzxml(data))
+    data = [_extract_mzxml(x) for x in data]
+    df = pd.json_normalize(data, sep="_")
 
-    df = (
-        pd.DataFrame.from_dict(data)
-        .set_index("retentionTime")
-        .apply(pd.Series.explode)
-        .reset_index()
-    )
+    # Convert retention time to seconds
+    if time_unit_in_file == "min":
+        df["scan_time"] = df["scan_time"].astype(np.float64) * 60.0
 
-    df["retentionTime"] = df["retentionTime"].astype(np.float64) * 60.0
-    df["m/z array"] = df["m/z array"].astype(np.float64)
-    df["intensity array"] = df["intensity array"].astype(np.float64)
-
-    df = df.rename(
-        columns={
-            "num": "scan_id",
-            "msLevel": "ms_level",
-            "retentionTime": "scan_time",
-            "m/z array": "mz",
-            "intensity array": "intensity",
-        }
-    )
-
-    df = df.reset_index(drop=True)
-    df = df[MS_FILE_COLUMNS]
-    return df
+    df = df.explode(["mz", "intensity"])
+    set_dtypes(df)
+    return df.reset_index(drop=True)[MS_FILE_COLUMNS]
 
 
 def _extract_mzxml(data):
-    cols = [
-        "num",
-        "msLevel",
-        "polarity",
-        "retentionTime",
-        "m/z array",
-        "intensity array",
-    ]
-    return {c: data[c] for c in cols}
+    return {
+        "scan_id": data["num"],
+        "ms_level": data["msLevel"],
+        "polarity": data["polarity"],
+        "scan_time": data["retentionTime"],
+        "mz": np.array(data["m/z array"]),
+        "intensity": np.array(data["intensity array"]),
+    }
 
 
-extract_mzxml = np.vectorize(_extract_mzxml)
+def mzml_to_pandas_df_pyteomics(fn, **kwargs):
+    # warnings.warn("mzml_to_pandas_df_pyteomics() is deprecated, use mzml_to_df() instead", DeprecationWarning)
+    return mzml_to_df(fn, **kwargs)
 
 
-def mzml_to_pandas_df_pyteomics(fn, read_only=False):
+def mzml_to_df(fn, time_unit="seconds", read_only=False):
     """
     Reads mzML file and returns a pandas.DataFrame
     using the pyteomics library.
 
     :param fn: Filename
     :type fn: str or PosixPath
     :param read_only: Whether or not to convert to dataframe, defaults to False
     :type read_only: bool, optional
     :return: MS data
     :rtype: pandas.DataFrame
     """
-    slices = []
-    with mzml.MzML(fn) as ms_data:
-
-        while True:
-
-            try:
-                data = ms_data.next()
-            except Exception as e:
-                logging.warning(e)
-                break
-
-            scan = data["scanList"]["scan"][0]
-            if "positive scan" in data.keys():
-                data["polarity"] = "+"
-
-            elif "negative scan" in data.keys():
-                data["polarity"] = "-"
+    assert str(fn).lower().endswith(".mzml"), fn
+
+    if read_only:
+        # Preserve the read_only contract of the old implementation: return None.
+        return None
+
+    # Read mzML file using pyteomics
+    with mzml.read(str(fn)) as reader:
+        # Collect one DataFrame slice per spectrum
+        slices = []
+
+        # Loop through the spectra and extract the data
+        for spectrum in reader:
+            # Extract the scan ID, retention time, m/z values, and intensity values
+            scan_id = int(spectrum["id"].split("scan=")[-1])
+            rt = spectrum["scanList"]["scan"][0]["scan start time"]
+            mz = np.array(spectrum["m/z array"], dtype=np.float64)
+            intensity = np.array(spectrum["intensity array"], dtype=np.float64)
+            if "positive scan" in spectrum.keys():
+                polarity = "+"
+            elif "negative scan" in spectrum.keys():
+                polarity = "-"
             else:
-                data["polarity"] = None
-
-            if "scan start time" in scan.keys():
-                data["scan_time"] = scan["scan start time"]
-
-            elif "scan time" in scan.keys():
-                data["scan_time"] = scan["scan time"]
-            else:
-                logging.error(f"No scan time identified \n{scan}")
-                break
-
-            del data["scanList"]
-            slices.append(pd.DataFrame(data))
+                polarity = None
+            ms_level = spectrum["ms level"]
+            slices.append(
+                pd.DataFrame(
+                    {
+                        "scan_id": scan_id,
+                        "mz": mz,
+                        "intensity": intensity,
+                        "polarity": polarity,
+                        "ms_level": ms_level,
+                        "scan_time": rt,
+                    }
+                )
+            )
 
     df = pd.concat(slices)
-
-    df["scan_id"] = df["id"].apply(lambda x: int(x.split("scan=")[1].split(" ")[0]))
-    df_to_numeric(df)
-    df["intensity array"] = df["intensity array"].astype(int)
-
-    mapping = {
-        "m/z array": "mz",
-        "intensity array": "intensity",
-        "ms level": "ms_level",
-    }
-
-    df = df.rename(columns=mapping)
-    df = df.reset_index(drop=True)
-    df = df[MS_FILE_COLUMNS]
+    df["intensity"] = df["intensity"].astype(int)
+    df = df[MS_FILE_COLUMNS].reset_index(drop=True)
+    set_dtypes(df)
     return df
 
 
-def mzml_to_df(fn, time_unit="seconds", read_only=False):
-    """
-    Reads mzML file and returns a pandas.DataFrame
-    using the mzML library.
- - :param fn: Filename - :type fn: str or PosixPath - :param read_only: Whether or not to convert to dataframe, defaults to False - :type read_only: bool, optional - :return: MS data - :rtype: pandas.DataFrame - """ - with pymzml.run.Reader(fn) as ms_data: - data = [x for x in ms_data] - - if read_only: - return None - - data = list(extract_mzml(data, time_unit=time_unit)) - - df = ( - pd.DataFrame.from_dict(data) - .set_index(["scan_id", "ms_level", "polarity", "scan_time"]) - .apply(pd.Series.explode) - .reset_index() +def set_dtypes(df): + dtypes = dict( + mz=np.float32, + scan_id=np.int64, + ms_level=np.int8, + scan_time=np.float32, + intensity=np.int64, ) - df["mz"] = df["mz"].astype("float64") - df["intensity"] = df["intensity"].astype("float64") - df["scan_time"] = df["scan_time"] * 60.0 + for var, dtype in dtypes.items(): + if not df[var].dtype == dtype: + df[var] = df[var].astype(dtype) + return df @@ -250,7 +224,7 @@ def _extract_mzml(data, time_unit): "polarity": "+" if data["positive scan"] else "-", "scan_time": RT, "mz": peaks[:, 0].astype("float64"), - "intensity": peaks[:, 1].astype("float64"), + "intensity": peaks[:, 1].astype("int64"), } diff --git a/ms_mint/matplotlib_tools.py b/ms_mint/matplotlib_tools.py index 6e72f551..4ac56f2e 100644 --- a/ms_mint/matplotlib_tools.py +++ b/ms_mint/matplotlib_tools.py @@ -192,7 +192,7 @@ def plot_peak_shapes( dfs = [] for peak_label in peak_labels: - for ndx, row in R[ + for _, row in R[ (R.peak_label == peak_label) & (R.peak_n_datapoints > 1) ].iterrows(): peak_rt = [float(i) for i in row.peak_shape_rt.split(",")] @@ -200,7 +200,7 @@ def plot_peak_shapes( ms_file = row.ms_file mz = row.mz_mean rt = row.rt - + df = pd.DataFrame( { "Scan time [s]": peak_rt, @@ -265,7 +265,7 @@ def plot_peaks( ) for i, ( ndx, - (_, rt, rt_span, peak_base_height, peak_height, rt_min, rt_max), + (_, _, _, peak_base_height, _, rt_min, rt_max), ) in enumerate(peaks.iterrows()): if ndx in highlight: plt.axvspan(rt_min, rt_max, color="green", alpha=0.25, label="Selected") @@ -302,7 +302,6 @@ def plot_metabolomics_hist2d( mz_bins=100, **kwargs, ): - if set_dim: plt.figure(figsize=figsize, dpi=dpi) diff --git a/ms_mint/notebook.py b/ms_mint/notebook.py index 7d036722..650b5d7b 100644 --- a/ms_mint/notebook.py +++ b/ms_mint/notebook.py @@ -37,7 +37,6 @@ class Mint(_Mint_): """ def __init__(self, *args, **kwargs): - self.progress_callback = self._set_progress_ super().__init__(progress_callback=self.progress_callback, *args, **kwargs) @@ -107,7 +106,7 @@ def __init__(self, *args, **kwargs): self.tqdm = tqdm def _load_target_from_bytes_(self, value): - for fn, data in value["new"].items(): + for data in value["new"].values(): self.load(io.BytesIO(data["content"])) self._message_(f"{len(self.targets)} targets loaded.") @@ -142,7 +141,7 @@ def display(self): return self.layout def _run_(self, b=None, **kwargs): - self.message(f"Start processing...") + self.message("Start processing...") self.progress = 0 self.run(**kwargs) self.message("...finished processing.") diff --git a/ms_mint/pca.py b/ms_mint/pca.py index 5d9f9d18..53dea66d 100644 --- a/ms_mint/pca.py +++ b/ms_mint/pca.py @@ -167,5 +167,5 @@ def pairplot_plotly(self, df, **kwargs): fig = ff.create_scatterplotmatrix(df, index=color_col, **kwargs) # set the legendgroup equal to the marker color for t in fig.data: - t.legendgroup=t.marker.color + t.legendgroup = t.marker.color return fig diff --git a/ms_mint/plotly_tools.py b/ms_mint/plotly_tools.py index fadcb00e..f783c3e5 100644 --- a/ms_mint/plotly_tools.py 
+++ b/ms_mint/plotly_tools.py
@@ -232,7 +232,6 @@ def plotly_peak_shapes(
     grouped by peak_label.
     """
     mint_results = mint_results.copy()
-
     if fns is not None:
         fns = [P(fn).name for fn in fns]
 
@@ -240,10 +239,8 @@ def plotly_peak_shapes(
 
     mint_results.ms_file = [P(fn).name for fn in mint_results.ms_file]
 
-    logging.warning('TEST')
-
     res = mint_results[mint_results.peak_max > 0]
-    
+
     fns = list(res.ms_file.drop_duplicates())
     labels = list(mint_results.peak_label.drop_duplicates())
 
diff --git a/ms_mint/plotting.py b/ms_mint/plotting.py
index 721b662e..3889fad1 100644
--- a/ms_mint/plotting.py
+++ b/ms_mint/plotting.py
@@ -131,7 +131,7 @@ def hierarchical_clustering(
         if transposed:
             tmp_data = tmp_data.T
 
-        clustered, fig, ndx_x, ndx_y = hierarchical_clustering(
+        _, fig, ndx_x, ndx_y = hierarchical_clustering(
             tmp_data,
             vmin=vmin,
             vmax=vmax,
@@ -281,7 +281,7 @@ def chromatogram(self, fns=None, peak_labels=None, interactive=False, **kwargs):
 
         if isinstance(fns, str):
             fns = [fns]
-        
+
         if fns is not None:
             fns = tuple(fns)
 
@@ -293,7 +293,7 @@ def chromatogram(self, fns=None, peak_labels=None, interactive=False, **kwargs):
 
         if peak_labels is not None:
             peak_labels = tuple(peak_labels)
-        
+
         data = self.mint.get_chromatograms(fns=fns, peak_labels=peak_labels)
 
         if not interactive:
@@ -314,7 +314,7 @@ def chromatogram(self, fns=None, peak_labels=None, interactive=False, **kwargs):
             g = sns.relplot(data=data, **params)
 
             for peak_label, ax in zip(peak_labels, g.axes.flatten()):
-                mz_mean, mz_width, rt_min, rt_max = self.mint.get_target_params(
+                _, _, rt_min, rt_max = self.mint.get_target_params(
                     peak_label
                 )
                 if rt_min is not None and rt_max is not None:
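
The `MintPlotter` methods changed above are reached through a processed `Mint` instance. A sketch of that flow; the `mint.plot` attribute name is an assumption (it is not shown in this diff), and the file path, target list, and peak label are illustrative:

```python
import pandas as pd
from ms_mint.Mint import Mint

mint = Mint()
mint.ms_files = ["data/file-01.mzXML"]     # hypothetical file
mint.targets = pd.read_csv("targets.csv")  # a target list as described in the README
mint.run()

mint.plot.chromatogram(peak_labels=["A"], interactive=False)  # seaborn relplot grid
mint.plot.hierarchical_clustering()                           # clustered heatmap
```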
""" - filename = args["filename"] targets = args["targets"] output_fn = args["output_fn"] @@ -51,6 +50,8 @@ def process_ms1_files_in_parallel(args): logging.error(f"process_ms1_files_in_parallel(): {e}") results = pd.DataFrame() + print(len(results)) + if (output_fn is not None) and (len(results) > 0): append_results(results, output_fn) return None @@ -89,6 +90,7 @@ def process_ms1_file(filename, targets): results["ms_path"] = os.path.dirname(filename) results["ms_file_size"] = os.path.getsize(filename) / 1024 / 1024 results["peak_score"] = score_peaks(results) + print(results) return results[MINT_RESULTS_COLUMNS] @@ -120,8 +122,6 @@ def _process_ms1_from_df_(df, targets): "peak_label", ] array_peaks = targets[peak_cols].values - # if "ms_level" in df.columns: - # df = df[df.ms_level == 1] array_data = df[["scan_time", "mz", "intensity"]].values result = process_ms1_from_numpy(array_data, array_peaks) return result @@ -139,7 +139,7 @@ def process_ms1_from_numpy(array, peaks): :rtype: list """ results = [] - for (mz_mean, mz_width, rt_min, rt_max, intensity_threshold, peak_label) in peaks: + for mz_mean, mz_width, rt_min, rt_max, intensity_threshold, peak_label in peaks: props = _process_ms1_from_numpy( array, mz_mean=mz_mean, @@ -190,6 +190,15 @@ def extract_ms1_properties(array, mz_mean): int_list_to_comma_sep_str = lambda x: ",".join([str(int(i)) for i in x]) projection = pd.DataFrame(array[:, [0, 2]], columns=["rt", "int"]) + + print("DEBUG extract_ms1_properties") + print(array) + print(array.dtype) + print(array.shape) + print(len(projection)) + print(projection) + print(projection.dtypes) + projection["rt"] = projection["rt"].round(2) projection["int"] = projection["int"].astype(int) projection = projection.groupby("rt").max().reset_index().values @@ -252,7 +261,7 @@ def extract_ms1_properties(array, mz_mean): peak_shape_int = int_list_to_comma_sep_str(projection[:, 1]) # Check that peak projection arrays (rt, int) have same number of elements - assert len(peak_shape_rt.split(',')) == len(peak_shape_int.split(',')) + assert len(peak_shape_rt.split(",")) == len(peak_shape_int.split(",")) return dict( peak_area=peak_area, diff --git a/ms_mint/targets.py b/ms_mint/targets.py index 304e4af7..90fd8b7b 100644 --- a/ms_mint/targets.py +++ b/ms_mint/targets.py @@ -190,9 +190,9 @@ def gen_target_grid(masses, dt, rt_max=10, mz_ppm=10, intensity_threshold=0): targets.columns = ["mz_mean", "rt_min"] targets["rt_max"] = targets.rt_min + (1 * dt) targets["peak_label"] = ( - targets.mz_mean.apply(lambda x: "{:.3f}".format(x)) + targets.mz_mean.apply("{:.3f}".format) + "__" - + targets.rt_min.apply(lambda x: "{:2.2f}".format(x)) + + targets.rt_min.apply("{:2.2f}".format) ) targets["mz_width"] = mz_ppm targets["intensity_threshold"] = intensity_threshold @@ -246,14 +246,13 @@ def rt_min_max( minimum_intensity=1e4, plot=False, sigma=20, - filter=None, + filters=None, post_opt=False, post_opt_kwargs=None, rel_height=0.9, height=3, aspect=2, col_wrap=3, - verbose=False, **kwargs, ): """ @@ -272,8 +271,8 @@ def rt_min_max( :type plot: bool, optional :param sigma: Sigma value for peak selection, defaults to 20 :type sigma: float, optional - :param filter: Filter instances to apply in respective order, defaults to None - :type filter: ms_mint.filter.Filter, optional + :param filters: Filter instances to apply in respective order, defaults to None + :type filters: ms_mint.filters.Filter, optional :param post_opt: Optimize retention times after peak selection, defaults to False :type post_opt: bool, optional 
diff --git a/ms_mint/targets.py b/ms_mint/targets.py
index 304e4af7..90fd8b7b 100644
--- a/ms_mint/targets.py
+++ b/ms_mint/targets.py
@@ -190,9 +190,9 @@ def gen_target_grid(masses, dt, rt_max=10, mz_ppm=10, intensity_threshold=0):
     targets.columns = ["mz_mean", "rt_min"]
     targets["rt_max"] = targets.rt_min + (1 * dt)
     targets["peak_label"] = (
-        targets.mz_mean.apply(lambda x: "{:.3f}".format(x))
+        targets.mz_mean.apply("{:.3f}".format)
         + "__"
-        + targets.rt_min.apply(lambda x: "{:2.2f}".format(x))
+        + targets.rt_min.apply("{:2.2f}".format)
     )
     targets["mz_width"] = mz_ppm
     targets["intensity_threshold"] = intensity_threshold
@@ -246,14 +246,13 @@ def rt_min_max(
     minimum_intensity=1e4,
     plot=False,
     sigma=20,
-    filter=None,
+    filters=None,
     post_opt=False,
     post_opt_kwargs=None,
     rel_height=0.9,
     height=3,
     aspect=2,
     col_wrap=3,
-    verbose=False,
     **kwargs,
 ):
     """
@@ -272,8 +271,8 @@ def rt_min_max(
     :type plot: bool, optional
     :param sigma: Sigma value for peak selection, defaults to 20
     :type sigma: float, optional
-    :param filter: Filter instances to apply in respective order, defaults to None
-    :type filter: ms_mint.filter.Filter, optional
+    :param filters: Filter instances to apply in respective order, defaults to None
+    :type filters: ms_mint.filters.Filter, optional
     :param post_opt: Optimize retention times after peak selection, defaults to False
     :type post_opt: bool, optional
     :param post_opt_kwargs: Keyword arguments passed to the post optimization, defaults to None
@@ -291,17 +290,8 @@ def rt_min_max(
     if peak_labels is None:
         peak_labels = targets.peak_label.values
 
-    if verbose:
-        print(targets)
-        print(fns)
-        print(peak_labels)
-
     _targets = targets.set_index("peak_label").copy()
 
-    if verbose:
-        print(_targets)
-
-    print("Reading files...")
     ms1 = pd.concat([ms_file_to_df(fn) for fn in tqdm(fns)]).sort_values(
         ["scan_time", "mz"]
     )
@@ -311,10 +301,7 @@ def rt_min_max(
     fig = plt.figure(figsize=(col_wrap * height * aspect, n_rows * height))
 
     i = 0
-    for (peak_label, row) in tqdm(_targets.iterrows(), total=len(targets)):
-
-        if verbose:
-            print(peak_label)
+    for peak_label, row in tqdm(_targets.iterrows(), total=len(targets)):
         if peak_label not in peak_labels:
             logging.warning(f"{peak_label} not in {peak_labels}")
             continue
@@ -325,7 +312,7 @@ def rt_min_max(
         _slice = extract_chromatogram_from_ms1(ms1, mz).groupby("scan_time").sum()
 
         chrom = Chromatogram(
-            _slice.index, _slice.values, expected_rt=rt, filter=filter
+            _slice.index, _slice.values, expected_rt=rt, filters=filters
         )
 
         if chrom.x.max() < minimum_intensity:
@@ -334,7 +321,7 @@ def rt_min_max(
             )
             continue
 
-        chrom.apply_filter()
+        chrom.apply_filters()
         chrom.find_peaks(rel_height=rel_height)
         chrom.select_peak_with_gaussian_weight(rt, sigma)
 
@@ -351,9 +338,6 @@ def rt_min_max(
         rt_min = chrom.peaks.at[ndx, "rt_min"]
         rt_max = chrom.peaks.at[ndx, "rt_max"]
 
-        if verbose:
-            print(ndx, peak_label, rt_min, rt_max)
-
         _targets.loc[peak_label, ["rt_min", "rt_max"]] = rt_min, rt_max
 
         if plot:
diff --git a/setup.cfg b/setup.cfg
index d34b75b7..aa633717 100755
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,6 +7,8 @@ max-line-length = 160
 [tool:pytest]
 norecursedirs = build dist ms_mint scripts docs notebooks
 python_files = test_*.py filelock.py
+env =
+    MPLBACKEND='Agg'
 
 [versioneer]
 vcs = git
diff --git a/tests/test__chromatogram.py b/tests/test__chromatogram.py
index ffe76ee2..a27d197a 100644
--- a/tests/test__chromatogram.py
+++ b/tests/test__chromatogram.py
@@ -2,6 +2,9 @@
 import pandas as pd
 
 from matplotlib import pyplot as plt
+import matplotlib as mpl
+
+mpl.use("Agg")
 
 from ms_mint.chromatogram import Chromatogram
 from ms_mint.tools import gaussian
@@ -9,15 +12,14 @@
 from paths import TEST_MZML
 
 
-def test__Chromatogram_without_filter_identifies_peaks_correctly():
-
+def test__Chromatogram_without_filters_identifies_peaks_correctly():
     rt_peak_1 = 200
     rt_peak_2 = 500
 
     x = np.arange(1000)
     y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_2, 100) * 5e5
 
-    chrom = Chromatogram(scan_times=x, intensities=y, filter=None, expected_rt=2)
+    chrom = Chromatogram(scan_times=x, intensities=y, filters=None, expected_rt=2)
     chrom.find_peaks()
 
     rt_peak_1_pred, rt_peak_2_pred = chrom.peaks.rt.values
@@ -28,16 +30,15 @@
     ]
 
 
-def test__Chromatogram__with_filter_identifies_peaks_correctly():
-
+def test__Chromatogram__with_filters_identifies_peaks_correctly():
     rt_peak_1 = 200
     rt_peak_2 = 500
 
     x = np.arange(1000)
     y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_2, 100) * 5e5
 
-    chrom = Chromatogram(scan_times=x, intensities=y, filter=[], expected_rt=2)
-    chrom.apply_filter()
+    chrom = Chromatogram(scan_times=x, intensities=y, filters=[], expected_rt=2)
+    chrom.apply_filters()
     chrom.find_peaks()
 
     rt_peak_1_pred, rt_peak_2_pred = chrom.peaks.rt.values
@@ -46,8 +47,7 @@
     assert max_diff < 1
 
 
-def
test__Chromatogram__with_filter_and_opt_identifies_peaks_correctly(): - +def test__Chromatogram__with_filters_and_opt_identifies_peaks_correctly(): rt_peak_1 = 200 rt_peak_2 = 500 @@ -55,8 +55,8 @@ def test__Chromatogram__with_filter_and_opt_identifies_peaks_correctly(): y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_2, 100) * 5e5 - chrom = Chromatogram(scan_times=x, intensities=y, filter=None, expected_rt=2) - chrom.apply_filter() + chrom = Chromatogram(scan_times=x, intensities=y, filters=None, expected_rt=2) + chrom.apply_filters() chrom.find_peaks() chrom.optimise_peak_times_with_diff() rt_peak_1_pred, rt_peak_2_pred = chrom.peaks.rt.values @@ -67,7 +67,6 @@ def test__Chromatogram__with_filter_and_opt_identifies_peaks_correctly(): def test__Chromatogram__select_peak_by_rt(): - rt_peak_1 = 200 rt_peak_2 = 500 @@ -76,7 +75,7 @@ def test__Chromatogram__select_peak_by_rt(): y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_2, 100) * 5e5 chrom = Chromatogram( - scan_times=x, intensities=y, filter=None, expected_rt=rt_peak_1 + scan_times=x, intensities=y, filters=None, expected_rt=rt_peak_1 ) chrom.find_peaks() chrom.select_peak_by_rt() @@ -88,7 +87,6 @@ def test__Chromatogram__select_peak_by_rt(): def test__Chromatogram__select_peak_by_rt_as_argument(): - rt_peak_1 = 200 rt_peak_2 = 500 @@ -97,7 +95,7 @@ def test__Chromatogram__select_peak_by_rt_as_argument(): y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_2, 100) * 5e5 chrom = Chromatogram( - scan_times=x, intensities=y, filter=None, expected_rt=rt_peak_1 + scan_times=x, intensities=y, filters=None, expected_rt=rt_peak_1 ) chrom.find_peaks() chrom.select_peak_by_rt(rt_peak_2) @@ -109,7 +107,6 @@ def test__Chromatogram__select_peak_by_rt_as_argument(): def test__Chromatogram__select_highest(): - rt_peak_1 = 200 rt_peak_2 = 500 @@ -118,7 +115,7 @@ def test__Chromatogram__select_highest(): y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_1, 100) * 5e5 chrom = Chromatogram( - scan_times=x, intensities=y, filter=None, expected_rt=rt_peak_2 + scan_times=x, intensities=y, filters=None, expected_rt=rt_peak_2 ) chrom.find_peaks() chrom.select_peak_by_highest_intensity() @@ -130,7 +127,6 @@ def test__Chromatogram__select_highest(): def test__Chromatogram__select_peak_with_gaussian_prefers_higher_peak(): - higher_peak_rt = 400 higher_peak_in = 5e6 lower__peak_rt = 600 @@ -148,7 +144,7 @@ def test__Chromatogram__select_peak_with_gaussian_prefers_higher_peak(): ) chrom = Chromatogram( - scan_times=x, intensities=y, filter=None, expected_rt=expected_rt + scan_times=x, intensities=y, filters=None, expected_rt=expected_rt ) chrom.find_peaks() chrom.select_peak_with_gaussian_weight() @@ -160,7 +156,6 @@ def test__Chromatogram__select_peak_with_gaussian_prefers_higher_peak(): def test__Chromatogram__plot_runs_without_error_return_figure(): - peak_rt = 400 peak_in = 5e6 @@ -168,7 +163,7 @@ def test__Chromatogram__plot_runs_without_error_return_figure(): y = gaussian(x, peak_rt, 200) * peak_in - chrom = Chromatogram(scan_times=x, intensities=y, filter=None) + chrom = Chromatogram(scan_times=x, intensities=y, filters=None) chrom.find_peaks() fig = chrom.plot() @@ -177,10 +172,9 @@ def test__Chromatogram__plot_runs_without_error_return_figure(): def test__Chromatogram_from_files_runs_through(): - chrom = Chromatogram() chrom.from_file(TEST_MZML, mz_mean=101.024323, mz_width=10) - chrom.apply_filter() + chrom.apply_filters() chrom.find_peaks() chrom.select_peak_by_highest_intensity() peaks = chrom.selected_peak_ndxs @@ 
-192,7 +186,6 @@ def test__Chromatogram_from_files_runs_through(): def test__Chromatogram__optimise_peak_times_with_diff_with_plot(): - rt_peak_1 = 200 rt_peak_2 = 500 @@ -201,14 +194,13 @@ def test__Chromatogram__optimise_peak_times_with_diff_with_plot(): y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_1, 100) * 5e5 chrom = Chromatogram( - scan_times=x, intensities=y, filter=None, expected_rt=rt_peak_2 + scan_times=x, intensities=y, filters=None, expected_rt=rt_peak_2 ) chrom.find_peaks() chrom.optimise_peak_times_with_diff(plot=True) def test__Chromatogram__data(): - rt_peak_1 = 200 rt_peak_2 = 500 @@ -217,7 +209,7 @@ def test__Chromatogram__data(): y = gaussian(x, rt_peak_1, 20) * 5e6 + gaussian(x, rt_peak_1, 100) * 5e5 chrom = Chromatogram( - scan_times=x, intensities=y, filter=None, expected_rt=rt_peak_2 + scan_times=x, intensities=y, filters=None, expected_rt=rt_peak_2 ) data = chrom.data diff --git a/tests/test__io.py b/tests/test__io.py index c24b8c66..9aa827d2 100644 --- a/tests/test__io.py +++ b/tests/test__io.py @@ -39,20 +39,6 @@ def test__ms_file_to_df__mzML(): assert expected_cols == result.columns.to_list(), result.columns -def test__ms_file_to_df__mzML_timeunit_minutes(): - result = ms_file_to_df(TEST_MZML, time_unit="minutes") - expected_cols = [ - "scan_id", - "ms_level", - "polarity", - "scan_time", - "mz", - "intensity", - ] - assert isinstance(result, pd.DataFrame), f"{type(result)} is not a dataframe" - assert expected_cols == result.columns.to_list(), result.columns - - def test__ms_file_to_df__mzXML(): result = ms_file_to_df(TEST_MZXML) expected_cols = [ @@ -67,8 +53,17 @@ def test__ms_file_to_df__mzXML(): assert expected_cols == result.columns.to_list(), result.columns -def test__mzml_to_pandas_df_pyteomics_pos(): - result = mzml_to_pandas_df_pyteomics(TEST_MZML_POS) +def test__mzml_to_pandas_df_pyteomics_positive_mode(): + """Test the mzml_to_pandas_df_pyteomics function with positive mode data. + + This function tests the mzml_to_pandas_df_pyteomics function by checking the output + against expected values for positive mode data. + + Raises: + AssertionError: If any of the assertions fail. 
+ + """ + # Arrange expected_cols = [ "scan_id", "ms_level", @@ -77,6 +72,11 @@ def test__mzml_to_pandas_df_pyteomics_pos(): "mz", "intensity", ] + + # Act + result = mzml_to_pandas_df_pyteomics(TEST_MZML_POS) + + # Assert assert isinstance(result, pd.DataFrame), f"{type(result)} is not a dataframe" assert expected_cols == result.columns.to_list(), result.columns assert all(result.polarity == "+"), f'Polarity should be "+"\n{result}' @@ -158,7 +158,7 @@ def test__convert_ms_file_to_feather(tmpdir): assert df_fea.equals(df), "DataFrames not equal" -def test__convert_ms_file_to_parquet(tmpdir): +def test__convert_mzml_file_to_parquet(tmpdir): print(tmpdir) shutil.copy(TEST_MZML, tmpdir) fn = P(tmpdir) / P(TEST_MZML).name @@ -168,6 +168,18 @@ def test__convert_ms_file_to_parquet(tmpdir): assert fn_out.is_file(), f"File not generated {fn_out}" df = ms_file_to_df(fn) df_fea = ms_file_to_df(fn_out) + + print(df) + print(df.dtypes) + + print(df_fea) + print(df_fea.dtypes) + + print(df == df_fea) + + print(df.scan_time.values[0], df_fea.scan_time.values[0]) + print(df.scan_time.values[-1], df_fea.scan_time.values[-1]) + assert df_fea.equals(df), "DataFrames not equal" diff --git a/tests/test__matplotlib_tools.py b/tests/test__matplotlib_tools.py index 06583d90..7b4b7aa4 100644 --- a/tests/test__matplotlib_tools.py +++ b/tests/test__matplotlib_tools.py @@ -1,6 +1,7 @@ import seaborn as sns import matplotlib as mpl +mpl.use("Agg") from ms_mint import Mint diff --git a/tests/test__plotly_tools.py b/tests/test__plotly_tools.py index 1fcb48c4..d2877aac 100644 --- a/tests/test__plotly_tools.py +++ b/tests/test__plotly_tools.py @@ -1,63 +1,28 @@ import pandas as pd import numpy as np - +import pytest from plotly.graph_objs._figure import Figure - from ms_mint.plotly_tools import ( set_template, plotly_heatmap, ) - -def test__plotly_heatmap(): - N = 10 - data = np.random.uniform(size=(N, N)) + np.arange(N) - N / 2 - df = pd.DataFrame(data) - img = plotly_heatmap(df) - assert isinstance(img, Figure), type(img) - - -def test__plotly_heatmap__transposed(): - N = 10 - data = np.random.uniform(size=(N, N)) + np.arange(N) - N / 2 - df = pd.DataFrame(data) - img = plotly_heatmap(df, transposed=True) - assert isinstance(img, Figure), type(img) - - -def test__plotly_heatmap__normed_by_cols(): - N = 10 - data = np.random.uniform(size=(N, N)) + np.arange(N) - N / 2 - df = pd.DataFrame(data) - img = plotly_heatmap(df, normed_by_cols=True) - assert isinstance(img, Figure), type(img) - - -def test__plotly_heatmap__correlation(): +@pytest.mark.parametrize("transposed,normed_by_cols,correlation,clustered,add_dendrogram", [ + (False, False, False, False, False), + (True, False, False, False, False), + (False, True, False, False, False), + (False, False, True, False, False), + (False, False, False, True, True), + (False, False, True, True, False), +]) +def test__plotly_heatmap(transposed, normed_by_cols, correlation, clustered, add_dendrogram): N = 10 data = np.random.uniform(size=(N, N)) + np.arange(N) - N / 2 df = pd.DataFrame(data) - img = plotly_heatmap(df, correlation=True) + img = plotly_heatmap(df, transposed=transposed, normed_by_cols=normed_by_cols, correlation=correlation, clustered=clustered, add_dendrogram=add_dendrogram) assert isinstance(img, Figure), type(img) - -def test__plotly_heatmap__clustered_with_dendrogram(): - N = 10 - data = np.random.uniform(size=(N, N)) + np.arange(N) - N / 2 - df = pd.DataFrame(data) - img = plotly_heatmap(df, clustered=True, add_dendrogram=True) - assert isinstance(img, 
Figure), type(img) - - -def test__plotly_heatmap__clustered_correlation(): - N = 10 - data = np.random.uniform(size=(N, N)) + np.arange(N) - N / 2 - df = pd.DataFrame(data) - img = plotly_heatmap(df, clustered=True, add_dendrogram=False, correlation=True) - assert isinstance(img, Figure), type(img) - - def test__set_template(): set_template() - assert True + assert True \ No newline at end of file diff --git a/tests/test__processing.py b/tests/test__processing.py index c06a8bb1..1bfede95 100644 --- a/tests/test__processing.py +++ b/tests/test__processing.py @@ -5,6 +5,7 @@ from pathlib import Path as P from multiprocessing import Pool from ms_mint import processing +from ms_mint.io import ms_file_to_df from ms_mint.standards import MINT_RESULTS_COLUMNS from paths import TEST_MZXML @@ -99,7 +100,19 @@ def test__slice_ms1_array(): assert np.array_equal(result, expect) -def test__process_ms1_from_numpy(): +def test_process_ms1_from_numpy(): + """ + Test the processing of MS1 data from a numpy array. + + The function processes an array of MS1 data and returns a list of + tuples containing the peak label, maximum intensity, apex RT, start + RT, end RT, intensity at start RT, intensity at end RT, mz, and + other fields. + + :return: None + :rtype: None + """ + array = np.array([[1, 100, 2], [2, 200, 3], [3, 300, 7]]) targets = [(100, 10, 0.5, 1.5, 0, "A"), (200, 10, 1.5, 2.5, 0, "B")] @@ -127,30 +140,41 @@ def test__extract_chromatogram_from_ms1(): assert result.equals(expected) -def test__run_parallel(tmp_path): +def test_run_parallel(tmp_path): + """ + Test running parallel processing of multiple MS files and storing the results in a CSV file. + :param tmp_path: A temporary directory created by pytest. + :type tmp_path: py.path.local + """ + + # Define test targets targets = pd.DataFrame( { "peak_label": ["A"], "mz_mean": [128.034], "mz_width": [100], "intensity_threshold": [0], - "rt_min": [286 / 60], - "rt_max": [330 / 60], - "rt": [300 / 60], + "rt_min": [0], + "rt_max": [700], + "rt": [150], + "rt_unit": ["s"], "target_filename": ["unknown"], } ) + # Initialize multiprocessing pool pool = Pool(processes=2, maxtasksperchild=None) - N = 4 - fns = [P(tmp_path) / f"File-{i}.mzXML" for i in range(N)] - - args_list = [] + # Generate test MS files + n_files = 4 + fns = [P(tmp_path) / f"File-{i}.mzXML" for i in range(n_files)] for fn in fns: os.symlink(TEST_MZXML, fn) + # Generate list of arguments for parallel processing + args_list = [] + for fn in fns: args = { "filename": fn, "targets": targets, @@ -158,20 +182,36 @@ def test__run_parallel(tmp_path): "mode": None, "output_fn": None, } - args_list.append(args) + # Run parallel processing results = pool.map_async(processing.process_ms1_files_in_parallel, args_list) - pool.close() pool.join() + # Concatenate results result = pd.concat(results.get()) + + # Print info for debugging + print(targets) + print(fns) print(result) + print(result.shape) + print(ms_file_to_df(fn)) + + # Assert that the results have the expected length + assert len(result) == n_files * len(targets) def test__run_parallel_with_output_filename(tmp_path): + """ + Test for running parallel processing of multiple MS files and storing the results in a CSV file. + :param tmp_path: A temporary directory created by pytest. 
+ :type tmp_path: py.path.local + """ + + # Create targets DataFrame targets = pd.DataFrame( { "peak_label": ["A"], @@ -186,15 +226,15 @@ def test__run_parallel_with_output_filename(tmp_path): } ) + # Set up multiprocessing pool pool = Pool(processes=2, maxtasksperchild=None) N = 4 fns = [P(tmp_path) / f"File-{i}.mzXML" for i in range(N)] output_fn = P(tmp_path) / "results.csv" + # Create argument list and symlink test MS files args_list = [] - # Create files in tmp_path and add - # parameters to argument list. for fn in fns: os.symlink(TEST_MZXML, fn) @@ -206,6 +246,11 @@ def test__run_parallel_with_output_filename(tmp_path): "output_fn": output_fn, } + # Print some info for debugging + print(fn) + print(ms_file_to_df(fn).head()) + print(ms_file_to_df(fn).tail()) + print("*" * 80) args_list.append(args) # Prepare output file (only headers) @@ -219,11 +264,16 @@ def test__run_parallel_with_output_filename(tmp_path): returned_results = results.get() + print(returned_results) + # Expect all returned results to be None - assert all([i is None for i in returned_results]), returned_results + assert all(e is None for e in returned_results), returned_results # All results should be stored in the output file assert output_fn.is_file() stored_results = pd.read_csv(output_fn) + print("=" * 80) + print(output_fn) + print(stored_results) assert len(stored_results) == N * len(targets) diff --git a/tests/test__targets.py b/tests/test__targets.py index 1764a6ec..8165f36b 100644 --- a/tests/test__targets.py +++ b/tests/test__targets.py @@ -19,7 +19,7 @@ from paths import ( # TEST_TARGETS_FN_V2_CSV_SEC, - #TEST_TARGETS_FN_V0_XLSX, + # TEST_TARGETS_FN_V0_XLSX, TEST_TARGETS_FN_V0, TEST_TARGETS_FN_V1, ) diff --git a/tests/test__tools.py b/tests/test__tools.py index dccba14d..0d9a5ded 100644 --- a/tests/test__tools.py +++ b/tests/test__tools.py @@ -93,7 +93,6 @@ def test__scale_dataframe_robust(): def test__df_diff(): - df1 = pd.DataFrame( {"A": [1, 1, 0, 0, 2], "B": [1, 0, 0, 1, 2]}, index=[1, 2, 3, 4, 5] ) @@ -119,7 +118,6 @@ def test__df_diff(): def test__is_ms_file(): - pos = [ "file.mzML", "path/file.mzML",