Skip to content

Commit

Permalink
Merge pull request #46 from msaid-de/feature/pin-to-tsv-convert
Browse files Browse the repository at this point in the history
✨ draft of pin to tsv converter
  • Loading branch information
tkschmidt authored Sep 3, 2024
2 parents b666ef9 + 9197db9 commit 55ad062
Show file tree
Hide file tree
Showing 3 changed files with 268 additions and 0 deletions.
8 changes: 8 additions & 0 deletions mokapot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,18 @@ def _parser():
),
)

parser.add_argument(
"--verify_pin",
type=bool,
default=True,
help="Verify that PIN input files are valid TSVs. If not convert them.",
)

parser.add_argument(
"-d",
"--dest_dir",
type=Path,
default=Path("."),
help=(
"The directory in which to write the result files. Defaults to "
"the current working directory"
Expand Down
17 changes: 17 additions & 0 deletions mokapot/mokapot.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
import sys
import time
import warnings
import shutil
from pathlib import Path

import numpy as np

from . import __version__
from .config import Config
from .parsers.pin import read_pin
from .parsers.pin_to_tsv import is_valid_tsv, pin_to_valid_tsv
from .parsers.fasta import read_fasta
from .brew import brew
from .model import PercolatorModel, load_model
Expand Down Expand Up @@ -55,6 +57,21 @@ def main(main_args=None):
logging.info("Command issued:")
logging.info("%s", " ".join(sys.argv))
logging.info("")

logging.info("Verify PIN format")
logging.info("=================")
if config.verify_pin:
for path_pin in config.psm_files:
with open(path_pin, 'r') as f_pin:
valid_tsv = is_valid_tsv(f_pin)
if not valid_tsv:
logging.info(f"{path_pin} invalid tsv, converting")
path_tsv = f"{path_pin}.tsv"
with open(path_pin, 'r') as f_pin:
with open(path_tsv, 'a') as f_tsv:
pin_to_valid_tsv(f_in=f_pin, f_out=f_tsv)
shutil.move(path_tsv, path_pin)

Check warning on line 73 in mokapot/mokapot.py

View check run for this annotation

Codecov / codecov/patch

mokapot/mokapot.py#L68-L73

Added lines #L68 - L73 were not covered by tests

logging.info("Starting Analysis")
logging.info("=================")

Expand Down
243 changes: 243 additions & 0 deletions mokapot/parsers/pin_to_tsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
from pathlib import Path
from io import StringIO
from typing import TextIO
from unittest.mock import Mock
import argparse

# PIN file specification from
# https://github.com/percolator/percolator/wiki/Interface#tab-delimited-file-format
"""
PSMId <tab> Label <tab> ScanNr <tab> feature1name <tab> ... <tab> featureNname <tab> Peptide <tab> Proteins
DefaultDirection <tab> - <tab> - <tab> feature1weight <tab> ... <tab> featureNweight [optional]
"""

# Minimal PIN snippet used by the doctests in this module. The first data row
# lists two proteins separated by a tab, so it has one more tab-separated
# field than the header line.
EXAMPLE_PIN = """SpecId\tLabel\tScanNr\tExpMass\tPeptide\tProteins
target_0_16619_2_-1\t1\t16619\t750.4149\tK.SEFLVR.E\tsp|Q96QR8|PURB_HUMAN\tsp|Q00577|PURA_HUMAN
target_0_2025_2_-1\t1\t2025\t751.4212\tR.HTALGPR.S\tsp|Q9Y4H4|GPSM3_HUMAN"""
# Header line and the two data rows of EXAMPLE_PIN, obtained by splitting on newlines.
EXAMPLE_HEADER, EXAMPLE_LINE_1, EXAMPLE_LINE_2 = EXAMPLE_PIN.split('\n')

# Default column separator used by the functions in this module.
PIN_SEP = '\t'


def parse_pin_header_columns(
    header: str,
    sep_column: str = PIN_SEP,
) -> "tuple[int, int]":
    """
    Parse the header of a PIN file to get the number of columns and the index
    of the Proteins column.

    Parameters
    ----------
    header : str
        The header line from the PIN file. Surrounding whitespace (including
        the trailing newline) is stripped before splitting.
    sep_column : str, optional
        Column separator (default is PIN_SEP).

    Returns
    -------
    n_col : int
        The total number of columns in the PIN file.
    idx_protein_col : int
        The index of the 'Proteins' column.

    Raises
    ------
    ValueError
        If the header does not contain a column named 'Proteins'.

    Examples
    --------
    >>> n_col, idx_protein_col = parse_pin_header_columns(EXAMPLE_HEADER)
    >>> n_col, idx_protein_col
    (6, 5)
    """
    columns = header.strip().split(sep_column)
    # Validate explicitly instead of with `assert`: assertions are removed
    # when Python runs with -O, which would hide the real cause of the error.
    try:
        idx_protein_col = columns.index("Proteins")
    except ValueError:
        raise ValueError(
            "PIN header does not contain a 'Proteins' column: "
            f"{columns!r}"
        ) from None
    return len(columns), idx_protein_col

Check warning on line 55 in mokapot/parsers/pin_to_tsv.py

View check run for this annotation

Codecov / codecov/patch

mokapot/parsers/pin_to_tsv.py#L51-L55

Added lines #L51 - L55 were not covered by tests


def convert_line_pin_to_tsv(
    line: str,
    idx_protein_col: int,
    n_col: int,
    sep_column: str = "\t",
    sep_protein: str = ":"
) -> str:
    r"""
    Convert a single line from a PIN file format to a TSV format.

    In a PIN file, additional proteins of a PSM are appended as extra
    tab-separated fields. This function merges all protein fields into a
    single column so that the line has exactly ``n_col`` columns.

    Parameters
    ----------
    line : str
        A single line from the PIN file, without its trailing newline.
    idx_protein_col : int
        The index of the first protein column.
    n_col : int
        The total number of columns in the PIN file (excluding additional
        protein columns).
    sep_column : str, optional
        The separator used between columns (default is "\t").
    sep_protein : str, optional
        The separator to use between multiple proteins (default is ":").

    Returns
    -------
    str
        The converted line in TSV format. Lines that do not have more fields
        than ``n_col`` contain nothing to merge and are returned unchanged.

    Examples
    --------
    >>> line = "psm_1\t1\t16619\t750.4149\tK.SEFLVR.E\tprotA\tprotB"
    >>> convert_line_pin_to_tsv(line, idx_protein_col=5, n_col=6)
    'psm_1\t1\t16619\t750.4149\tK.SEFLVR.E\tprotA:protB'
    >>> line = "psm_2\t1\t2025\t751.4212\tR.HTALGPR.S\tprotC"
    >>> convert_line_pin_to_tsv(line, idx_protein_col=5, n_col=6)
    'psm_2\t1\t2025\t751.4212\tR.HTALGPR.S\tprotC'
    """
    elements = line.split(sep=sep_column)  # this contains columns and proteins
    n_extra_proteins = len(elements) - n_col

    # Nothing to merge. This also guards against short or blank lines, for
    # which a negative count would make the slices below overlap and
    # duplicate fields in the output.
    if n_extra_proteins <= 0:
        return line

    idx_prot_end = idx_protein_col + n_extra_proteins + 1
    proteins = sep_protein.join(elements[idx_protein_col:idx_prot_end])
    columns = (
        elements[:idx_protein_col] + [proteins] + elements[idx_prot_end:]
    )
    return sep_column.join(columns)

Check warning on line 104 in mokapot/parsers/pin_to_tsv.py

View check run for this annotation

Codecov / codecov/patch

mokapot/parsers/pin_to_tsv.py#L97-L104

Added lines #L97 - L104 were not covered by tests


def is_valid_tsv(
    f_in: TextIO,
    sep_column: str = PIN_SEP
) -> bool:
    r"""
    Check whether a PIN file is already a valid TSV file.

    This function verifies that:
    1. All rows have the same number of columns as the header row.
    2. The file does not contain a "DefaultDirection" line as the second line.

    The file object is consumed while checking; reopen or rewind it before
    reading it again.

    Parameters
    ----------
    f_in : TextIO
        Input file object to read from. This should be an opened file or
        file-like object that supports iteration.
    sep_column : str, optional
        Column separator (default is PIN_SEP).

    Returns
    -------
    bool
        True if the file is a valid TSV according to the criteria above,
        False otherwise. Input without any header line is reported as
        invalid; a file consisting of only a header line is valid.

    Examples
    --------
    >>> is_valid_tsv(StringIO(EXAMPLE_PIN))
    False
    >>> is_valid_tsv(StringIO("SpecId\tProteins\npsm_1\tprotA\n"))
    True
    >>> is_valid_tsv(StringIO("SpecId\tProteins\n"))
    True
    """
    # Use a default with next() so that empty input does not leak a
    # StopIteration to the caller.
    header = next(f_in, None)
    if header is None:
        return False
    n_col_header = len(header.split(sep_column))

    for idx, line in enumerate(f_in):
        # check for optional DefaultDirection line (second line of the file)
        if idx == 0 and line.startswith("DefaultDirection"):
            return False
        # check if sep_column is really only used for columns
        if len(line.split(sep_column)) != n_col_header:
            return False
    return True


def pin_to_valid_tsv(
    f_in: TextIO,
    f_out: TextIO,
    sep_column: str = PIN_SEP,
    sep_protein: str = ":"
) -> None:
    r"""
    Convert a PIN file to a valid TSV file.

    This assumes that the input file is in PIN format and that the first line
    is a header. It preserves the header in the output file and ignores the
    second line if it starts with "DefaultDirection". Blank lines are skipped.

    Parameters
    ----------
    f_in : TextIO
        Input file object to read from.
    f_out : TextIO
        Output file object to write to.
    sep_column : str, optional
        Column separator (default is PIN_SEP).
    sep_protein : str, optional
        Protein separator (default is ":").

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the input contains no header line. Errors raised while parsing the
        header (e.g. a missing 'Proteins' column) propagate from
        ``parse_pin_header_columns``.

    Examples
    --------
    >>> f_out = StringIO()
    >>> pin_to_valid_tsv(StringIO(EXAMPLE_PIN), f_out)
    >>> rows = f_out.getvalue().splitlines()
    >>> len(rows)
    3
    >>> rows[0] == EXAMPLE_HEADER
    True
    >>> rows[1].split("\t")[-1]
    'sp|Q96QR8|PURB_HUMAN:sp|Q00577|PURA_HUMAN'
    """
    # next() with a default avoids leaking StopIteration on empty input.
    first_line = next(f_in, None)
    if first_line is None:
        raise ValueError("PIN input is empty: expected a header line")

    header: str = first_line.strip()
    # Parse before writing so nothing is emitted for an unusable header.
    n_col, idx_protein_col = parse_pin_header_columns(header, sep_column)
    f_out.write(header + "\n")

    for line_idx, raw_line in enumerate(f_in):
        # Only drop the line terminator. A plain strip() would also remove
        # leading/trailing separators and thereby shift columns whenever the
        # first or last field of a row is empty.
        line = raw_line.rstrip("\r\n")

        # Optionally, the second line of a PIN file might declare
        # DefaultDirection. This is ignored with this conversion.
        # https://github.com/percolator/percolator/wiki/Interface#pintsv-tab-delimited-file-format
        if line_idx == 0 and line.lstrip().startswith("DefaultDirection"):
            continue

        # Blank lines (e.g. at the end of the file) carry no PSM.
        if not line.strip():
            continue

        tsv_line: str = convert_line_pin_to_tsv(
            line,
            n_col=n_col,
            idx_protein_col=idx_protein_col,
            sep_column=sep_column,
            sep_protein=sep_protein
        )
        f_out.write(tsv_line + "\n")

Check warning on line 220 in mokapot/parsers/pin_to_tsv.py

View check run for this annotation

Codecov / codecov/patch

mokapot/parsers/pin_to_tsv.py#L220

Added line #L220 was not covered by tests


def main():
    """
    Command-line entry point: convert a PIN file into a valid TSV file.

    Reads the PIN file given as ``path_in`` and writes the converted TSV to
    ``path_out``. An existing output file is overwritten.
    """
    parser = argparse.ArgumentParser(description="Convert PIN file to valid TSV")
    parser.add_argument("path_in", type=Path, help="Input PIN file path")
    parser.add_argument("path_out", type=Path, help="Output TSV file path")
    parser.add_argument("--sep_column", type=str, default="\t",
                        help="Column separator (default: '\\t')")
    parser.add_argument("--sep_protein", type=str, default=":",
                        help="Protein separator (default: ':')")
    args = parser.parse_args()

    # Writing onto the file that is being read would destroy the input.
    if args.path_in.resolve() == args.path_out.resolve():
        parser.error("path_in and path_out must be different files")

    # Open the output in write mode: appending to an existing file would
    # produce a second header and duplicated rows, i.e. an invalid TSV.
    with open(args.path_in, 'r') as f_in, open(args.path_out, 'w') as f_out:
        pin_to_valid_tsv(
            f_in=f_in,
            f_out=f_out,
            sep_column=args.sep_column,
            sep_protein=args.sep_protein
        )


if __name__ == "__main__":
    main()

Check warning on line 243 in mokapot/parsers/pin_to_tsv.py

View check run for this annotation

Codecov / codecov/patch

mokapot/parsers/pin_to_tsv.py#L243

Added line #L243 was not covered by tests

0 comments on commit 55ad062

Please sign in to comment.