diff --git a/CHANGELOG.md b/CHANGELOG.md index d6f07e9..b9a6edd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased +## [1.1.0] - 2024-09-05 ### Added - `Peptidoform`: Add `modified_sequence` property to return the modified sequence in ProForma format, but without charge state. +- `io`: Add support for reading and writing FlashLFQ generic TSV files. ## [1.0.1] - 2024-08-28 diff --git a/README.rst b/README.rst index 257ee33..a488519 100644 --- a/README.rst +++ b/README.rst @@ -89,6 +89,7 @@ Supported file formats ===================================================================================================================== ======================== =============== =============== File format psm_utils tag Read support Write support ===================================================================================================================== ======================== =============== =============== + `FlashLFQ generic TSV `_ ``flashlfq`` ✅ ✅ `ionbot CSV `_ ``ionbot`` ✅ ❌ `OpenMS idXML `_ ``idxml`` ✅ ✅ `MaxQuant msms.txt `_ ``msms`` ✅ ❌ diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst index d858997..b20680b 100644 --- a/docs/source/api/psm_utils.io.rst +++ b/docs/source/api/psm_utils.io.rst @@ -7,6 +7,14 @@ psm_utils.io +psm_utils.io.flashlfq +################## + +.. 
automodule:: psm_utils.io.flashlfq + :members: + :inherited-members: + + psm_utils.io.idxml ################## diff --git a/example_files/example.flashlfq.tsv b/example_files/example.flashlfq.tsv new file mode 100644 index 0000000..e763f7b --- /dev/null +++ b/example_files/example.flashlfq.tsv @@ -0,0 +1,18 @@ +File Name Scan Retention Time Precursor Charge Base Sequence Full Sequence Peptide Monoisotopic Mass Protein Accession +SmallCalibratible_Yeast 24.80555 2 KAPAGGAADAAAK KAPAGGAADAAAK +SmallCalibratible_Yeast 24.95372 2 KAPAAAPAASK KAPAAAPAASK +SmallCalibratible_Yeast 24.77032 2 KQAIETANK KQAIETANK +SmallCalibratible_Yeast 24.17319 2 RVDEGGAQDK RVDEGGAQDK +SmallCalibratible_Yeast 24.26695 2 KDAEPQSDSTTSK KDAEPQSDSTTSK +SmallCalibratible_Yeast 24.10798 2 EKAEAEAEK EKAEAEAEK +SmallCalibratible_Yeast 24.06874 2 EKAEAEAEK EKAEAEAEK +SmallCalibratible_Yeast 24.77398 2 FKEEDEKESQR FKEEDEKESQR +SmallCalibratible_Yeast 24.90638 2 YDHEASSSYK YDHEASSSYK +SmallCalibratible_Yeast 24.40345 3 SKDVTDSATTKK SKDVTDSATTKK +SmallCalibratible_Yeast 24.71679 2 FKEEDEKESQR FKEEDEKESQR +SmallCalibratible_Yeast 24.39968 2 ALKQEGAANK ALKQEGAANK +SmallCalibratible_Yeast 24.67303 2 SKDVTDSATTK SKDVTDSATTK +SmallCalibratible_Yeast 24.45053 2 KLEDHPK KLEDHPK +SmallCalibratible_Yeast 24.77398 1 HIDAGAK HIDAGAK +SmallCalibratible_Yeast 24.9022 2 YLAKEEEKK YLAKEEEKK +SmallCalibratible_Yeast 24.76278 2 YAGEVSHDDK YAGEVSHDDK diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py index b82c8ee..28bd0ee 100644 --- a/psm_utils/__init__.py +++ b/psm_utils/__init__.py @@ -1,6 +1,6 @@ """Common utilities for parsing and handling PSMs, and search engine results.""" -__version__ = "1.0.1" +__version__ = "1.1.0" __all__ = ["Peptidoform", "PSM", "PSMList"] from warnings import filterwarnings diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 79f09ac..0b4bb66 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -8,6 +8,7 @@ from rich.progress import track 
+import psm_utils.io.flashlfq as flashlfq import psm_utils.io.idxml as idxml import psm_utils.io.ionbot as ionbot import psm_utils.io.maxquant as maxquant @@ -28,6 +29,12 @@ from psm_utils.psm_list import PSMList FILETYPES = { + "flashlfq": { + "reader": flashlfq.FlashLFQReader, + "writer": flashlfq.FlashLFQWriter, + "extension": ".tsv", + "filename_pattern": r"^.*\.flashlfq\.tsv$", + }, "ionbot": { "reader": ionbot.IonbotReader, "writer": None, diff --git a/psm_utils/io/flashlfq.py b/psm_utils/io/flashlfq.py new file mode 100644 index 0000000..d91c061 --- /dev/null +++ b/psm_utils/io/flashlfq.py @@ -0,0 +1,228 @@ +""" +Reader and writer for the FlashLFQ generic TSV format. + +See the `FlashLFQ documentation <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_ +for more information on the format. + +Notes +----- +- The FlashLFQ format does not contain the actual spectrum identifier. When reading a FlashLFQ + file, the spectrum identifier is set to the row number in the file. +- The FlashLFQ format does not contain the precursor m/z, but the theoretical monoisotopic mass. + This value is not read into the PSM object, but can be calculated from the peptidoform. +- To read from a FlashLFQ file, the ``Full Sequence`` column is expected to contain a ProForma v2 + compatible peptidoform notation. 
+ +""" + +from __future__ import annotations + +import csv +import logging +from pathlib import Path +from typing import Optional, Union + +import numpy as np + +from psm_utils.io._base_classes import ReaderBase, WriterBase +from psm_utils.io._utils import set_csv_field_size_limit +from psm_utils.io.exceptions import PSMUtilsIOException +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList + +set_csv_field_size_limit() + +LOGGER = logging.getLogger(__name__) + + +class FlashLFQReader(ReaderBase): + """Reader for FlashLFQ TSV format.""" + + required_columns = ["Full Sequence", "Precursor Charge"] + + def __iter__(self): + """Iterate over file and return PSMs one-by-one.""" + with open(self.filename, "rt") as open_file: + reader = csv.DictReader(open_file, delimiter="\t") + if not all(col in reader.fieldnames for col in self.required_columns): + raise PSMUtilsIOException( + f"FlashLFQ TSV file must contain the following columns: {self.required_columns}" + ) + for i, row in enumerate(reader): + yield self._parse_entry(row, spectrum_id=str(i)) + + def _parse_entry(self, entry: dict, spectrum_id) -> PSM: + """Parse single FlashLFQ TSV entry to :py:class:`~psm_utils.psm.PSM`.""" + # Replace empty strings with None + entry = {k: v if v else None for k, v in entry.items()} + + # Parse entry + return PSM( + peptidoform=f"{entry['Full Sequence']}/{entry['Precursor Charge']}", + spectrum_id=spectrum_id, + run=entry.get("File Name"), + retention_time=entry.get("Scan Retention Time"), + protein_list=self._parse_protein_list(entry.get("Protein Accessions")), + ) + + @staticmethod + def _parse_protein_list(protein_accessions: Optional[str]) -> list[str]: + """Parse protein list string to list of protein accessions.""" + if not protein_accessions: + return [] + elif ";" in protein_accessions: # Docs define separator as semicolon + return protein_accessions.split(";") + elif "|" in protein_accessions: # Example file uses pipe + return protein_accessions.split("|") 
+ else: + return [protein_accessions] # Single protein + + +class FlashLFQWriter(WriterBase): + """Reader for FlashLFQ TSV format.""" + + def __init__( + self, + filename: Union[str, Path], + *args, + fdr_threshold: float = 0.01, + only_targets: bool = True, + **kwargs, + ): + """ + Reader for psm_utils TSV format. + + Parameters + ---------- + filename + Path to PSM file. + fdr_threshold + FDR threshold for filtering PSMs. + only_targets + If True, only target PSMs are written to file. If False, both target and decoy PSMs + are written. + + """ + super().__init__(filename, *args, **kwargs) + + self.fdr_threshold = fdr_threshold + self.only_targets = only_targets + + self._open_file = None + self._writer = None + self.fieldnames = None + + def __enter__(self) -> FlashLFQWriter: + if Path(self.filename).is_file(): + # Get fieldnames from existing file + with open(self.filename, "rt") as open_file: + # Get fieldnames + self.fieldnames = open_file.readline().strip().split("\t") + mode = "at" + else: + # Set default fieldnames + self.fieldnames = [ + "File Name", + "Base Sequence", + "Full Sequence", + "Peptide Monoisotope Mass", + "Scan Retention Time", + "Precursor Charge", + "Protein Accessions", + ] + mode = "wt" + + # Open file and writer + self._open_file = open(self.filename, mode, newline="") + self._writer = csv.DictWriter( + self._open_file, + fieldnames=self.fieldnames, + extrasaction="ignore", + delimiter="\t", + ) + + if mode == "wt": + self._writer.writeheader() + + return self + + def __exit__(self, *args, **kwargs) -> None: + self._open_file.close() + self._open_file = None + self._writer = None + + def write_psm(self, psm: PSM): + """ + Write a single PSM to new or existing PSM file. + + Parameters + ---------- + psm + PSM object to write. 
+ + """ + if psm.qvalue and psm.qvalue > self.fdr_threshold: + return + if self.only_targets and psm.is_decoy: + return + + entry = self._psm_to_entry(psm) + try: + self._writer.writerow(entry) + except AttributeError as e: + raise PSMUtilsIOException( + f"`write_psm` method can only be called if `{self.__class__.__qualname__}`" + "is opened in context (i.e., using the `with` statement)." + ) from e + + def write_file(self, psm_list: PSMList): + """ + Write an entire PSMList to a new PSM file. + + Parameters + ---------- + psm_list + PSMList object to write to file. + + """ + # Filter out decoys + if self.only_targets: + # Accept both None and False + target_mask = np.array([not psm.is_decoy for psm in psm_list]) + LOGGER.debug(f"Skipping {~target_mask.sum()} decoy PSMs for FlashLFQ file.") + else: + target_mask = np.ones(len(psm_list), dtype=bool) + + # Filter out PSMs above FDR threshold + if any(psm.qvalue is None for psm in psm_list): + LOGGER.warning( + "Not all PSMs have a q-value. Skipping FDR filtering for FlashLFQ file." 
+ ) + fdr_mask = np.ones(len(psm_list), dtype=bool) + else: + fdr_mask = psm_list["qvalue"] <= self.fdr_threshold + filtered_by_fdr = (~fdr_mask & target_mask).sum() + LOGGER.debug(f"Skipping {filtered_by_fdr} PSMs above FDR threshold for FlashLFQ file.") + + filtered_psm_list = psm_list[target_mask & fdr_mask] + + with open(self.filename, "wt", newline="") as f: + writer = csv.DictWriter( + f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore" + ) + writer.writeheader() + for psm in filtered_psm_list: + writer.writerow(self._psm_to_entry(psm)) + + @staticmethod + def _psm_to_entry(psm: PSM) -> dict: + """Convert :py:class:`~psm_utils.psm.PSM` to FlashLFQ TSV entry.""" + return { + "File Name": psm.run, + "Base Sequence": psm.peptidoform.sequence, + "Full Sequence": psm.peptidoform.modified_sequence, + "Peptide Monoisotope Mass": psm.peptidoform.theoretical_mass, + "Scan Retention Time": psm.retention_time, + "Precursor Charge": psm.peptidoform.precursor_charge, + "Protein Accessions": ";".join(psm.protein_list), + } diff --git a/tests/test_io/test_flashlfq.py b/tests/test_io/test_flashlfq.py new file mode 100644 index 0000000..2e80ba1 --- /dev/null +++ b/tests/test_io/test_flashlfq.py @@ -0,0 +1,109 @@ +from unittest.mock import mock_open, patch + +import pytest + +from psm_utils.io.exceptions import PSMUtilsIOException +from psm_utils.io.flashlfq import FlashLFQReader, FlashLFQWriter +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList + +# Sample data for testing +sample_tsv_data = """File Name\tBase Sequence\tFull Sequence\tPeptide Monoisotope Mass\tScan Retention Time\tPrecursor Charge\tProtein Accessions +sample1.raw\tPEPTIDE\tPEPTIDE\t1000.0\t5.0\t2\tP12345;P67890 +sample2.raw\tPEPTIDE\tPEPTIDE\t1000.0\t10.0\t2\tP23456|P78901 +""" + + +@pytest.fixture +def valid_psm(): + return PSM( + peptidoform="PEPTIDE/2", + spectrum_id="0", + run="sample1.raw", + retention_time=5.0, + protein_list=["P12345", "P67890"], + ) + + 
"""Tests for the FlashLFQ reader and writer in ``psm_utils.io.flashlfq``."""

from unittest.mock import mock_open, patch

import pytest

from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.io.flashlfq import FlashLFQReader, FlashLFQWriter
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList

# Sample data for testing; the two rows use the two supported protein separators (";" and "|")
sample_tsv_data = """File Name\tBase Sequence\tFull Sequence\tPeptide Monoisotope Mass\tScan Retention Time\tPrecursor Charge\tProtein Accessions
sample1.raw\tPEPTIDE\tPEPTIDE\t1000.0\t5.0\t2\tP12345;P67890
sample2.raw\tPEPTIDE\tPEPTIDE\t1000.0\t10.0\t2\tP23456|P78901
"""


@pytest.fixture
def valid_psm():
    """PSM matching the first row of ``sample_tsv_data``."""
    return PSM(
        peptidoform="PEPTIDE/2",
        spectrum_id="0",
        run="sample1.raw",
        retention_time=5.0,
        protein_list=["P12345", "P67890"],
    )


@pytest.fixture
def invalid_psm_entry():
    """Row dictionary with all required columns but without protein accessions."""
    return {
        "Full Sequence": "PEPTIDE",
        "Precursor Charge": "2",
        "File Name": "sample1.raw",
        "Scan Retention Time": "5.0",
        "Protein Accessions": None,
    }


def test_flashlfqreader_parse_entry(valid_psm):
    """A complete row is converted to the expected PSM."""
    reader = FlashLFQReader("dummy_file.tsv")
    entry = {
        "Full Sequence": "PEPTIDE",
        "Precursor Charge": "2",
        "File Name": "sample1.raw",
        "Scan Retention Time": "5.0",
        "Protein Accessions": "P12345;P67890",
    }
    psm = reader._parse_entry(entry, spectrum_id="0")
    assert psm == valid_psm


def test_flashlfqreader_parse_entry_without_proteins(invalid_psm_entry):
    """A row without protein accessions still parses and carries no proteins."""
    reader = FlashLFQReader("dummy_file.tsv")
    psm = reader._parse_entry(invalid_psm_entry, spectrum_id="0")
    assert psm.run == "sample1.raw"
    assert not psm.protein_list


def test_flashlfqreader_iterate_over_file():
    """Rows are yielded in order, numbered from zero, with both separators split."""
    with patch("builtins.open", mock_open(read_data=sample_tsv_data)):
        reader = FlashLFQReader("dummy_file.tsv")
        psms = list(reader)
        assert len(psms) == 2
        assert psms[0].run == "sample1.raw"
        assert psms[1].run == "sample2.raw"
        assert [psm.spectrum_id for psm in psms] == ["0", "1"]
        assert psms[0].protein_list == ["P12345", "P67890"]  # semicolon-separated
        assert psms[1].protein_list == ["P23456", "P78901"]  # pipe-separated


def test_flashlfqreader_invalid_entry_handling():
    """A header without the required ``Full Sequence`` column is rejected."""
    invalid_data = """File Name\tBase Sequence\tPeptide Monoisotope Mass\tScan Retention Time\tPrecursor Charge\tProtein Accessions
sample1.raw\tPEPTIDE\t1000.0\t5.0
sample2.raw\tPEPTIDE\t1000.0\t10.0
sample3.raw\tPEPTIDE\t1000.0\t15.0
"""
    with patch("builtins.open", mock_open(read_data=invalid_data)):
        reader = FlashLFQReader("dummy_file.tsv")
        with pytest.raises(PSMUtilsIOException):
            list(reader)


def test_flashlfqwriter_write_psm(valid_psm):
    """A PSM written in context ends up in the file handle."""
    with patch("builtins.open", mock_open()) as mocked_file:
        with FlashLFQWriter("dummy_file.tsv") as writer:
            writer.write_psm(valid_psm)
        mocked_file().write.assert_called()
        assert "PEPTIDE" in mocked_file().write.call_args[0][0]


def test_flashlfqwriter_write_file(valid_psm):
    """Writing a two-PSM list produces a header and two rows."""
    psm_list = PSMList(psm_list=[valid_psm, valid_psm])
    with patch("builtins.open", mock_open()) as mocked_file:
        with FlashLFQWriter("dummy_file.tsv") as writer:
            writer.write_file(psm_list)
        mocked_file().write.assert_called()
        # The mocked handle is shared by both ``open`` calls: one header written on entering
        # the context, then one header and two entries written by ``write_file``.
        assert mocked_file().write.call_count == 4


def test_flashlfqwriter_existing_file():
    """When the target file exists, its header is reused and not written again."""
    # ``Path.is_file`` must be patched as well; otherwise the writer takes the new-file
    # branch and the header in ``sample_tsv_data`` is never read.
    with patch("builtins.open", mock_open(read_data=sample_tsv_data)) as mocked_file, patch(
        "pathlib.Path.is_file", return_value=True
    ):
        with FlashLFQWriter("dummy_file.tsv") as writer:
            assert writer.fieldnames == [
                "File Name",
                "Base Sequence",
                "Full Sequence",
                "Peptide Monoisotope Mass",
                "Scan Retention Time",
                "Precursor Charge",
                "Protein Accessions",
            ]
        # Appending to an existing file must not emit a second header
        mocked_file().write.assert_not_called()


def test_flashlfqwriter_context_manager():
    """The file handle exists inside the context and is released on exit."""
    with patch("builtins.open", mock_open()):
        with FlashLFQWriter("dummy_file.tsv") as writer:
            assert writer._open_file is not None
        assert writer._open_file is None  # Ensure file is closed after context exit