-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: overloaded 'SeqDict.from_sam()' method to support SAM files (#176) #183
base: main
Are you sure you want to change the base?
Changes from 6 commits
9073752
d97a926
ecc3931
78a9035
90b54d8
45f7eca
c69320f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -126,6 +126,7 @@ | |||||||||
from dataclasses import field | ||||||||||
from dataclasses import replace | ||||||||||
from enum import unique | ||||||||||
from pathlib import Path | ||||||||||
from typing import Any | ||||||||||
from typing import Dict | ||||||||||
from typing import Iterator | ||||||||||
|
@@ -137,6 +138,8 @@ | |||||||||
from typing import Union | ||||||||||
from typing import overload | ||||||||||
|
||||||||||
from fgpyo import sam | ||||||||||
|
||||||||||
if sys.version_info[0] == 3 and sys.version_info[1] < 11: | ||||||||||
from strenum import StrEnum | ||||||||||
else: | ||||||||||
|
@@ -214,7 +217,7 @@ | |||||||||
class SequenceMetadata(MutableMapping[Union[Keys, str], str]): | ||||||||||
"""Stores information about a single Sequence (ex. chromosome, contig). | ||||||||||
|
||||||||||
Implements the mutable mapping interface, which provide access to the attributes of this | ||||||||||
Implements the mutable mapping interface, which provides access to the attributes of this | ||||||||||
sequence, including name, length, but not index. When using the mapping interface, for example | ||||||||||
getting, setting, deleting, as well as iterating over keys, values, and items, the _values_ will | ||||||||||
always be strings (`str` type). For example, the length will be an `str` when accessing via | ||||||||||
|
@@ -446,28 +449,56 @@ | |||||||||
|
||||||||||
@staticmethod | ||||||||||
@overload | ||||||||||
def from_sam(header: pysam.AlignmentHeader) -> "SequenceDictionary": ... | ||||||||||
def from_sam(data: Path) -> "SequenceDictionary": ... | ||||||||||
|
||||||||||
@staticmethod | ||||||||||
@overload | ||||||||||
def from_sam(header: List[Dict[str, Any]]) -> "SequenceDictionary": ... | ||||||||||
def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ... | ||||||||||
|
||||||||||
@staticmethod | ||||||||||
def from_sam( | ||||||||||
header: Union[pysam.AlignmentHeader, List[Dict[str, Any]]], | ||||||||||
) -> "SequenceDictionary": | ||||||||||
"""Creates a `SequenceDictionary` from either a `pysam.AlignmentHeader` or from | ||||||||||
the list of sequences returned by `pysam.AlignmentHeader#to_dict()["SQ"]`.""" | ||||||||||
if isinstance(header, pysam.AlignmentHeader): | ||||||||||
return SequenceDictionary.from_sam(header=header.to_dict()["SQ"]) | ||||||||||
@overload | ||||||||||
def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ... | ||||||||||
|
||||||||||
infos: List[SequenceMetadata] = [ | ||||||||||
SequenceMetadata.from_sam(meta=meta, index=index) for index, meta in enumerate(header) | ||||||||||
] | ||||||||||
@staticmethod | ||||||||||
@overload | ||||||||||
def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... | ||||||||||
|
||||||||||
return SequenceDictionary(infos=infos) | ||||||||||
@staticmethod | ||||||||||
def from_sam( | ||||||||||
data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]], | ||||||||||
) -> "SequenceDictionary": | ||||||||||
"""Creates a `SequenceDictionary` from a SAM file or its header. | ||||||||||
|
||||||||||
# TODO: mypyp doesn't like these | ||||||||||
Args: | ||||||||||
data: The input may be any of: | ||||||||||
- a path to a SAM file | ||||||||||
- an open `pysam.AlignmentFile` | ||||||||||
- the `pysam.AlignmentHeader` associated with a `pysam.AlignmentFile` | ||||||||||
- the contents of a header's `SQ` fields, as returned by `AlignmentHeader.to_dict()` | ||||||||||
Returns: | ||||||||||
A `SequenceDictionary` mapping refrence names to their metadata. | ||||||||||
""" | ||||||||||
seq_dict: SequenceDictionary | ||||||||||
if isinstance(data, pysam.AlignmentHeader): | ||||||||||
seq_dict = SequenceDictionary.from_sam(data.to_dict()["SQ"]) | ||||||||||
elif isinstance(data, pysam.AlignmentFile): | ||||||||||
seq_dict = SequenceDictionary.from_sam(data.header.to_dict()["SQ"]) | ||||||||||
elif isinstance(data, Path): | ||||||||||
with sam.reader(data) as fh: | ||||||||||
seq_dict = SequenceDictionary.from_sam(fh.header) | ||||||||||
else: # assuming `data` is a `list[dict[str, Any]]` | ||||||||||
try: | ||||||||||
infos: List[SequenceMetadata] = [ | ||||||||||
SequenceMetadata.from_sam(meta=meta, index=index) | ||||||||||
for index, meta in enumerate(data) | ||||||||||
] | ||||||||||
seq_dict = SequenceDictionary(infos=infos) | ||||||||||
except Exception as e: | ||||||||||
raise ValueError(f"Could not parse sequence information from data: {data}") from e | ||||||||||
Comment on lines
+496
to
+497
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Avoid including raw data in exception messages. Including the entire Apply this diff to adjust the error message: -except Exception as e:
- raise ValueError(f"Could not parse sequence information from data: {data}") from e
+except Exception as e:
+ raise ValueError("Could not parse sequence information from provided data.") from e 📝 Committable suggestion
Suggested change
|
||||||||||
|
||||||||||
return seq_dict | ||||||||||
|
||||||||||
# TODO: mypy doesn't like these | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FIXME |
||||||||||
# @overload | ||||||||||
# def __getitem__(self, key: str) -> SequenceMetadata: ... | ||||||||||
# | ||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -88,3 +88,9 @@ warn_unused_configs = true | |
warn_unused_ignores = true | ||
enable_error_code = "ignore-without-code" | ||
exclude = ["site/", "docs/"] | ||
|
||
[tool.coverage.report] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! |
||
exclude_lines = [ | ||
"pragma: not covered", | ||
"@overload" | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from pathlib import Path | ||
from typing import Any | ||
from typing import Dict | ||
from typing import List | ||
|
@@ -11,6 +12,8 @@ | |
from fgpyo.fasta.sequence_dictionary import SequenceDictionary | ||
from fgpyo.fasta.sequence_dictionary import SequenceMetadata | ||
from fgpyo.fasta.sequence_dictionary import Topology | ||
from fgpyo.sam import builder | ||
from fgpyo.sam import reader | ||
|
||
|
||
def test_alternate_locus_raises_start_gt_end() -> None: | ||
|
@@ -315,10 +318,6 @@ def test_sequence_dictionary_same_as() -> None: | |
assert not this.same_as(that) | ||
|
||
|
||
# to_sam | ||
# from_sam | ||
|
||
|
||
def test_sequence_dictionary_to_and_from_sam() -> None: | ||
sd = SequenceDictionary( | ||
infos=[ | ||
|
@@ -333,7 +332,10 @@ def test_sequence_dictionary_to_and_from_sam() -> None: | |
header = pysam.AlignmentHeader.from_dict( | ||
header_dict={"HD": {"VN": "1.5"}, "SQ": mapping, "RG": [{"ID": "foo"}]} | ||
) | ||
|
||
samfile: Path = builder.SamBuilder(sd=mapping).to_path() | ||
alignment: pysam.AlignmentFile = reader(samfile) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use with a |
||
assert SequenceDictionary.from_sam(samfile) == sd | ||
assert SequenceDictionary.from_sam(alignment) == sd | ||
assert SequenceDictionary.from_sam(mapping) == sd | ||
assert SequenceDictionary.from_sam(header) == sd | ||
assert sd.to_sam_header(extra_header={"RG": [{"ID": "foo"}]}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is begging for a structural pattern (class pattern) but I think our min python version is 3.9 :/