Skip to content

Commit

Permalink
Check time unit in mzML files and convert to second
Browse files Browse the repository at this point in the history
  • Loading branch information
sorenwacker committed Apr 26, 2023
1 parent 2197a4d commit 087cd80
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 19 deletions.
4 changes: 2 additions & 2 deletions images/coverage.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
35 changes: 18 additions & 17 deletions ms_mint/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,17 @@ def ms_file_to_df(fn, read_only: bool = False):
except IndexError as e:
logging.warning(f"{e}: {fn}")
return None
# Compatibility with old schema
if not read_only:
# Compatibility with old schema
df = df.rename(
columns={
"retentionTime": "scan_time",
"intensity array": "intensity",
"m/z array": "mz",
}
)

set_dtypes(df)
# Set datatypes
set_dtypes(df)

# assert df.scan_id.dtype in [np.int32, np.int64], df.scan_id.dtype
# assert df.intensity.dtype == np.int64, df.intensity.dtype
Expand Down Expand Up @@ -139,7 +139,7 @@ def mzml_to_pandas_df_pyteomics(fn, **kwargs):
return mzml_to_df(fn, **kwargs)


def mzml_to_df(fn, time_unit="seconds", read_only=False):
def mzml_to_df(fn, read_only=False):
"""
Reads mzML file and returns a pandas.DataFrame
using the mzML library.
Expand All @@ -154,16 +154,20 @@ def mzml_to_df(fn, time_unit="seconds", read_only=False):

assert str(fn).lower().endswith(".mzml"), fn

# Read mzML file using pyteomics
# Read mzML file using pyteomics
with mzml.read(str(fn)) as reader:
# Initialize empty lists for the slices and the attributes
slices = []

# Loop through the spectra and extract the data
for spectrum in reader:
time_unit = spectrum['scanList']['scan'][0]['scan start time'].unit_info
if read_only:
continue
# Extract the scan ID, retention time, m/z values, and intensity values
scan_id = int(spectrum["id"].split("scan=")[-1])
rt = spectrum["scanList"]["scan"][0]["scan start time"]
if time_unit == 'minute':
rt = rt * 60.
mz = np.array(spectrum["m/z array"], dtype=np.float64)
intensity = np.array(spectrum["intensity array"], dtype=np.float64)
if "positive scan" in spectrum.keys():
Expand All @@ -185,7 +189,8 @@ def mzml_to_df(fn, time_unit="seconds", read_only=False):
}
)
)

if read_only:
return None
df = pd.concat(slices)
df["intensity"] = df["intensity"].astype(int)
df = df[MS_FILE_COLUMNS].reset_index(drop=True)
Expand All @@ -210,13 +215,7 @@ def set_dtypes(df):


def _extract_mzml(data, time_unit):
try:
RT = data.scan_time_in_minutes()
except Exception:
if time_unit == "seconds":
RT = data.scan_time[0]
elif time_unit == "minutes":
RT = data.scan_time[0] * 60.0
RT = data.scan_time_in_minutes() * 60
peaks = data.peaks("centroided")
return {
"scan_id": data["id"],
Expand Down Expand Up @@ -273,6 +272,7 @@ def format_thermo_raw_file_reader_parquet(df):
}
)
)
df["scan_time"] = df["scan_time"]*60
df["polarity"] = None
df["intensity"] = df.intensity.astype(np.float64)
df = df[MS_FILE_COLUMNS]
Expand All @@ -298,10 +298,9 @@ def mzmlb_to_df__pyteomics(fn, read_only=False):
return None

data = list(extract_mzmlb(data))

df = (
pd.DataFrame.from_dict(data)
.set_index("retentionTime")
.set_index(["index", "retentionTime"])
.apply(pd.Series.explode)
.reset_index()
.rename(
Expand All @@ -315,14 +314,16 @@ def mzmlb_to_df__pyteomics(fn, read_only=False):
)
)

# mzMLb starts scan index with 0
df["scan_id"] = df["scan_id"] + 1
df["polarity"] = None
df = df[MS_FILE_COLUMNS]
return df


def _extract_mzmlb(data):
cols = ["index", "ms level", "retentionTime", "m/z array", "intensity array"]
data["retentionTime"] = data["scanList"]["scan"][0]["scan start time"] / 60
data["retentionTime"] = data["scanList"]["scan"][0]["scan start time"] * 60
return {c: data[c] for c in cols}


Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ nbformat>=4.2.0
sphinx
sphinx-rtd-theme
myst-parser
psims
2 changes: 2 additions & 0 deletions tests/test__chromatogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,15 @@ def test__Chromatogram__plot_runs_without_error_return_figure():


def test__Chromatogram_from_files_runs_through():
print(TEST_MZML)
chrom = Chromatogram()
chrom.from_file(TEST_MZML, mz_mean=101.024323, mz_width=10)
chrom.apply_filters()
chrom.find_peaks()
chrom.select_peak_by_highest_intensity()
peaks = chrom.selected_peak_ndxs


print(chrom.data)
print(peaks)

Expand Down

0 comments on commit 087cd80

Please sign in to comment.