From 9073752fe8f145ccf2af19cee516dc59a72827dd Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 3 Sep 2024 18:12:41 -0400 Subject: [PATCH 1/8] feat: overloaded 'SeqDict.from_sam()' method to support SAM files - in addition to pysam SAM headers and SAM header dicts, SeqDicts can now be initialized from SAM file paths and pysam SAM files - added basic tests for this new functionality --- fgpyo/fasta/sequence_dictionary.py | 34 ++++++++++++++----- tests/fgpyo/fasta/data/sequence.dict | 4 +++ tests/fgpyo/fasta/test_sequence_dictionary.py | 12 ++++--- 3 files changed, 36 insertions(+), 14 deletions(-) create mode 100644 tests/fgpyo/fasta/data/sequence.dict diff --git a/fgpyo/fasta/sequence_dictionary.py b/fgpyo/fasta/sequence_dictionary.py index 56e068bf..519ec11e 100644 --- a/fgpyo/fasta/sequence_dictionary.py +++ b/fgpyo/fasta/sequence_dictionary.py @@ -126,6 +126,7 @@ from dataclasses import field from dataclasses import replace from enum import unique +from pathlib import Path from typing import Any from typing import Dict from typing import Iterator @@ -137,6 +138,8 @@ from typing import Union from typing import overload +from fgpyo import sam + if sys.version_info[0] == 3 and sys.version_info[1] < 11: from strenum import StrEnum else: @@ -214,7 +217,7 @@ def parse(value: str) -> "AlternateLocus": class SequenceMetadata(MutableMapping[Union[Keys, str], str]): """Stores information about a single Sequence (ex. chromosome, contig). - Implements the mutable mapping interface, which provide access to the attributes of this + Implements the mutable mapping interface, which provides access to the attributes of this sequence, including name, length, but not index. When using the mapping interface, for example getting, setting, deleting, as well as iterating over keys, values, and items, the _values_ will always be strings (`str` type). For example, the length will be an `str` when accessing via @@ -446,28 +449,41 @@ def to_sam_header( @staticmethod @overload - def from_sam(header: pysam.AlignmentHeader) -> "SequenceDictionary": ... + def from_sam(data: Path) -> "SequenceDictionary": ... + + @staticmethod + @overload + def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ... + + @staticmethod + @overload + def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ... @staticmethod @overload - def from_sam(header: List[Dict[str, Any]]) -> "SequenceDictionary": ... + def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... @staticmethod def from_sam( - header: Union[pysam.AlignmentHeader, List[Dict[str, Any]]], + data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]], ) -> "SequenceDictionary": """Creates a `SequenceDictionary` from either a `pysam.AlignmentHeader` or from - the list of sequences returned by `pysam.AlignmentHeader#to_dict()["SQ"]`.""" - if isinstance(header, pysam.AlignmentHeader): - return SequenceDictionary.from_sam(header=header.to_dict()["SQ"]) + the list of sequences returned by `pysam.AlignmentHeader.to_dict()["SQ"]`.""" + if isinstance(data, pysam.AlignmentHeader): + return SequenceDictionary.from_sam(data.to_dict()["SQ"]) + if isinstance(data, pysam.AlignmentFile): + return SequenceDictionary.from_sam(data.header.to_dict()["SQ"]) + if isinstance(data, Path): + with sam.reader(data, file_type=sam.SamFileType.SAM) as fh: + return SequenceDictionary.from_sam(fh.header) infos: List[SequenceMetadata] = [ - SequenceMetadata.from_sam(meta=meta, index=index) for index, meta in enumerate(header) + SequenceMetadata.from_sam(meta=meta, index=index) for index, meta in enumerate(data) ] return SequenceDictionary(infos=infos) - # TODO: mypyp doesn't like these + # TODO: mypy doesn't like these # @overload # def __getitem__(self, key: str) -> SequenceMetadata: ... # diff --git a/tests/fgpyo/fasta/data/sequence.dict b/tests/fgpyo/fasta/data/sequence.dict new file mode 100644 index 00000000..1d5edd8a --- /dev/null +++ b/tests/fgpyo/fasta/data/sequence.dict @@ -0,0 +1,4 @@ +@HD VN:1.5 +@SQ SN:chr1 LN:10 +@SQ SN:chr2 LN:20 AN:chr3 +@RG ID:foo diff --git a/tests/fgpyo/fasta/test_sequence_dictionary.py b/tests/fgpyo/fasta/test_sequence_dictionary.py index 493f1d14..91c72c4c 100644 --- a/tests/fgpyo/fasta/test_sequence_dictionary.py +++ b/tests/fgpyo/fasta/test_sequence_dictionary.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Any from typing import Dict from typing import List @@ -11,6 +12,8 @@ from fgpyo.fasta.sequence_dictionary import SequenceDictionary from fgpyo.fasta.sequence_dictionary import SequenceMetadata from fgpyo.fasta.sequence_dictionary import Topology +from fgpyo.sam import SamFileType +from fgpyo.sam import reader def test_alternate_locus_raises_start_gt_end() -> None: @@ -315,10 +318,6 @@ def test_sequence_dictionary_same_as() -> None: assert not this.same_as(that) -# to_sam -# from_sam - - def test_sequence_dictionary_to_and_from_sam() -> None: sd = SequenceDictionary( infos=[ @@ -333,7 +332,10 @@ def test_sequence_dictionary_to_and_from_sam() -> None: header = pysam.AlignmentHeader.from_dict( header_dict={"HD": {"VN": "1.5"}, "SQ": mapping, "RG": [{"ID": "foo"}]} ) - + samfile = Path(__file__).parent / "data" / "sequence.dict" + alignment: pysam.AlignmentFile = reader(samfile, file_type=SamFileType.SAM) + assert SequenceDictionary.from_sam(samfile) == sd + assert SequenceDictionary.from_sam(alignment) == sd assert SequenceDictionary.from_sam(mapping) == sd assert SequenceDictionary.from_sam(header) == sd assert sd.to_sam_header(extra_header={"RG": [{"ID": "foo"}]}) From d97a926203647672a5e3e750277b1ed47c097d76 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 4 Sep 2024 13:13:25 -0400 Subject: [PATCH 2/8] fix(codecov): added 'pragma no cover' to overloaded function --- fgpyo/fasta/sequence_dictionary.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fgpyo/fasta/sequence_dictionary.py b/fgpyo/fasta/sequence_dictionary.py index 519ec11e..5e85b6c2 100644 --- a/fgpyo/fasta/sequence_dictionary.py +++ b/fgpyo/fasta/sequence_dictionary.py @@ -449,19 +449,19 @@ def to_sam_header( @staticmethod @overload - def from_sam(data: Path) -> "SequenceDictionary": ... + def from_sam(data: Path) -> "SequenceDictionary": ... # pragma: no cover @staticmethod @overload - def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ... + def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ... # pragma: no cover @staticmethod @overload - def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ... + def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ... # pragma: no cover @staticmethod @overload - def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... + def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... # pragma: no cover @staticmethod def from_sam( From ecc3931be1b13cf28c615ec131864dd37a5dd373 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 14 Jan 2025 14:04:32 -0500 Subject: [PATCH 3/8] fix: remove no cover pragma, add config to pyproject.toml --- fgpyo/fasta/sequence_dictionary.py | 8 ++++---- pyproject.toml | 5 +++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fgpyo/fasta/sequence_dictionary.py b/fgpyo/fasta/sequence_dictionary.py index 5e85b6c2..519ec11e 100644 --- a/fgpyo/fasta/sequence_dictionary.py +++ b/fgpyo/fasta/sequence_dictionary.py @@ -449,19 +449,19 @@ def to_sam_header( @staticmethod @overload - def from_sam(data: Path) -> "SequenceDictionary": ... # pragma: no cover + def from_sam(data: Path) -> "SequenceDictionary": ... @staticmethod @overload - def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ... # pragma: no cover + def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ... @staticmethod @overload - def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ... # pragma: no cover + def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ... @staticmethod @overload - def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... # pragma: no cover + def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... @staticmethod def from_sam( diff --git a/pyproject.toml b/pyproject.toml index 801f94b7..4bf5e6b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,3 +88,8 @@ warn_unused_configs = true warn_unused_ignores = true enable_error_code = "ignore-without-code" exclude = ["site/", "docs/"] + +[tool.coverage.report] +exclude_lines = + pragma: not covered + @overload From 78a90357fc7dff73bd27cdd29a934b787bff07d7 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 14 Jan 2025 14:09:34 -0500 Subject: [PATCH 4/8] fix: update 'from_sam()' docstring --- fgpyo/fasta/sequence_dictionary.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fgpyo/fasta/sequence_dictionary.py b/fgpyo/fasta/sequence_dictionary.py index 519ec11e..18456d07 100644 --- a/fgpyo/fasta/sequence_dictionary.py +++ b/fgpyo/fasta/sequence_dictionary.py @@ -467,8 +467,17 @@ def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... def from_sam( data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]], ) -> "SequenceDictionary": - """Creates a `SequenceDictionary` from either a `pysam.AlignmentHeader` or from - the list of sequences returned by `pysam.AlignmentHeader.to_dict()["SQ"]`.""" + """Creates a `SequenceDictionary` from a SAM file or its header. + + Args: + data: The input may be any of: + - a path to a SAM file + - an open `pysam.AlignmentFile` + - the `pysam.AlignmentHeader` associated with a `pysam.AlignmentFile` + - the contents of a header's `SQ` fields, as returned by `AlignmentHeader.to_dict()` + Returns: + A `SequenceDictionary` mapping refrence names to their metadata. + """ if isinstance(data, pysam.AlignmentHeader): return SequenceDictionary.from_sam(data.to_dict()["SQ"]) if isinstance(data, pysam.AlignmentFile): From 90b54d800b3e9c9f25ae79db624cb9d2196b4fe2 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 14 Jan 2025 14:23:37 -0500 Subject: [PATCH 5/8] fix: codecov syntax --- pyproject.toml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4bf5e6b3..f1e7c30c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,7 @@ enable_error_code = "ignore-without-code" exclude = ["site/", "docs/"] [tool.coverage.report] -exclude_lines = - pragma: not covered - @overload +exclude_lines = [ + "pragma: not covered", + "@overload" +] From 45f7ecaa020e3b69ad19b942903d834291b36776 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 14 Jan 2025 14:48:29 -0500 Subject: [PATCH 6/8] fix: style, remove test file in favor of SamBuilder --- fgpyo/fasta/sequence_dictionary.py | 30 +++++++++++-------- tests/fgpyo/fasta/data/sequence.dict | 4 --- tests/fgpyo/fasta/test_sequence_dictionary.py | 6 ++-- 3 files changed, 21 insertions(+), 19 deletions(-) delete mode 100644 tests/fgpyo/fasta/data/sequence.dict diff --git a/fgpyo/fasta/sequence_dictionary.py b/fgpyo/fasta/sequence_dictionary.py index 18456d07..2ed9e542 100644 --- a/fgpyo/fasta/sequence_dictionary.py +++ b/fgpyo/fasta/sequence_dictionary.py @@ -478,19 +478,25 @@ def from_sam( Returns: A `SequenceDictionary` mapping refrence names to their metadata. """ + seq_dict: SequenceDictionary if isinstance(data, pysam.AlignmentHeader): - return SequenceDictionary.from_sam(data.to_dict()["SQ"]) - if isinstance(data, pysam.AlignmentFile): - return SequenceDictionary.from_sam(data.header.to_dict()["SQ"]) - if isinstance(data, Path): - with sam.reader(data, file_type=sam.SamFileType.SAM) as fh: - return SequenceDictionary.from_sam(fh.header) - - infos: List[SequenceMetadata] = [ - SequenceMetadata.from_sam(meta=meta, index=index) for index, meta in enumerate(data) - ] - - return SequenceDictionary(infos=infos) + seq_dict = SequenceDictionary.from_sam(data.to_dict()["SQ"]) + elif isinstance(data, pysam.AlignmentFile): + seq_dict = SequenceDictionary.from_sam(data.header.to_dict()["SQ"]) + elif isinstance(data, Path): + with sam.reader(data) as fh: + seq_dict = SequenceDictionary.from_sam(fh.header) + else: # assuming `data` is a `list[dict[str, Any]]` + try: + infos: List[SequenceMetadata] = [ + SequenceMetadata.from_sam(meta=meta, index=index) + for index, meta in enumerate(data) + ] + seq_dict = SequenceDictionary(infos=infos) + except Exception as e: + raise ValueError(f"Could not parse sequence information from data: {data}") from e + + return seq_dict # TODO: mypy doesn't like these # @overload diff --git a/tests/fgpyo/fasta/data/sequence.dict b/tests/fgpyo/fasta/data/sequence.dict deleted file mode 100644 index 1d5edd8a..00000000 --- a/tests/fgpyo/fasta/data/sequence.dict +++ /dev/null @@ -1,4 +0,0 @@ -@HD VN:1.5 -@SQ SN:chr1 LN:10 -@SQ SN:chr2 LN:20 AN:chr3 -@RG ID:foo diff --git a/tests/fgpyo/fasta/test_sequence_dictionary.py b/tests/fgpyo/fasta/test_sequence_dictionary.py index 91c72c4c..318d14ef 100644 --- a/tests/fgpyo/fasta/test_sequence_dictionary.py +++ b/tests/fgpyo/fasta/test_sequence_dictionary.py @@ -12,7 +12,7 @@ from fgpyo.fasta.sequence_dictionary import SequenceDictionary from fgpyo.fasta.sequence_dictionary import SequenceMetadata from fgpyo.fasta.sequence_dictionary import Topology -from fgpyo.sam import SamFileType +from fgpyo.sam import builder from fgpyo.sam import reader @@ -332,8 +332,8 @@ def test_sequence_dictionary_to_and_from_sam() -> None: header = pysam.AlignmentHeader.from_dict( header_dict={"HD": {"VN": "1.5"}, "SQ": mapping, "RG": [{"ID": "foo"}]} ) - samfile = Path(__file__).parent / "data" / "sequence.dict" - alignment: pysam.AlignmentFile = reader(samfile, file_type=SamFileType.SAM) + samfile: Path = builder.SamBuilder(sd=mapping).to_path() + alignment: pysam.AlignmentFile = reader(samfile) assert SequenceDictionary.from_sam(samfile) == sd assert SequenceDictionary.from_sam(alignment) == sd assert SequenceDictionary.from_sam(mapping) == sd From c69320faf52ddab4cbd2d59a17abd4284e06756a Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 14 Jan 2025 15:02:20 -0500 Subject: [PATCH 7/8] fix: added test for invalid input --- tests/fgpyo/fasta/test_sequence_dictionary.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/fgpyo/fasta/test_sequence_dictionary.py b/tests/fgpyo/fasta/test_sequence_dictionary.py index 318d14ef..22fb1b61 100644 --- a/tests/fgpyo/fasta/test_sequence_dictionary.py +++ b/tests/fgpyo/fasta/test_sequence_dictionary.py @@ -339,6 +339,8 @@ def test_sequence_dictionary_to_and_from_sam() -> None: assert SequenceDictionary.from_sam(mapping) == sd assert SequenceDictionary.from_sam(header) == sd assert sd.to_sam_header(extra_header={"RG": [{"ID": "foo"}]}) + with pytest.raises(ValueError): + SequenceDictionary.from_sam([{}]) def test_sequence_dictionary_mapping() -> None: From 8512844cdc96b4ba1390dcc375f507292c68861a Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 16 Jan 2025 09:01:28 -0500 Subject: [PATCH 8/8] fix: close fh and remove obsolete comment --- fgpyo/fasta/sequence_dictionary.py | 7 ------- tests/fgpyo/fasta/test_sequence_dictionary.py | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/fgpyo/fasta/sequence_dictionary.py b/fgpyo/fasta/sequence_dictionary.py index 2ed9e542..0729ea39 100644 --- a/fgpyo/fasta/sequence_dictionary.py +++ b/fgpyo/fasta/sequence_dictionary.py @@ -498,13 +498,6 @@ def from_sam( return seq_dict - # TODO: mypy doesn't like these - # @overload - # def __getitem__(self, key: str) -> SequenceMetadata: ... - # - # @overload - # def __getitem__(self, index: int) -> SequenceMetadata: ... - def __getitem__(self, key: Union[str, int]) -> SequenceMetadata: return self._dict[key] if isinstance(key, str) else self.infos[key] diff --git a/tests/fgpyo/fasta/test_sequence_dictionary.py b/tests/fgpyo/fasta/test_sequence_dictionary.py index 22fb1b61..a5037afe 100644 --- a/tests/fgpyo/fasta/test_sequence_dictionary.py +++ b/tests/fgpyo/fasta/test_sequence_dictionary.py @@ -333,9 +333,9 @@ def test_sequence_dictionary_to_and_from_sam() -> None: header_dict={"HD": {"VN": "1.5"}, "SQ": mapping, "RG": [{"ID": "foo"}]} ) samfile: Path = builder.SamBuilder(sd=mapping).to_path() - alignment: pysam.AlignmentFile = reader(samfile) + with reader(samfile) as alignment_fh: # pysam.AlignmentFile + assert SequenceDictionary.from_sam(alignment_fh) == sd assert SequenceDictionary.from_sam(samfile) == sd - assert SequenceDictionary.from_sam(alignment) == sd assert SequenceDictionary.from_sam(mapping) == sd assert SequenceDictionary.from_sam(header) == sd assert sd.to_sam_header(extra_header={"RG": [{"ID": "foo"}]})