Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: overloaded 'SeqDict.from_sam()' method to support SAM files (#176) #183

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 46 additions & 15 deletions fgpyo/fasta/sequence_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
from dataclasses import field
from dataclasses import replace
from enum import unique
from pathlib import Path
from typing import Any
from typing import Dict
from typing import Iterator
Expand All @@ -137,6 +138,8 @@
from typing import Union
from typing import overload

from fgpyo import sam

if sys.version_info[0] == 3 and sys.version_info[1] < 11:
from strenum import StrEnum
else:
Expand Down Expand Up @@ -214,7 +217,7 @@
class SequenceMetadata(MutableMapping[Union[Keys, str], str]):
"""Stores information about a single Sequence (ex. chromosome, contig).

Implements the mutable mapping interface, which provide access to the attributes of this
Implements the mutable mapping interface, which provides access to the attributes of this
sequence, including name, length, but not index. When using the mapping interface, for example
getting, setting, deleting, as well as iterating over keys, values, and items, the _values_ will
always be strings (`str` type). For example, the length will be an `str` when accessing via
Expand Down Expand Up @@ -446,28 +449,56 @@

@staticmethod
@overload
def from_sam(header: pysam.AlignmentHeader) -> "SequenceDictionary": ...
def from_sam(data: Path) -> "SequenceDictionary": ...

@staticmethod
@overload
def from_sam(header: List[Dict[str, Any]]) -> "SequenceDictionary": ...
def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ...

@staticmethod
def from_sam(
header: Union[pysam.AlignmentHeader, List[Dict[str, Any]]],
) -> "SequenceDictionary":
"""Creates a `SequenceDictionary` from either a `pysam.AlignmentHeader` or from
the list of sequences returned by `pysam.AlignmentHeader#to_dict()["SQ"]`."""
if isinstance(header, pysam.AlignmentHeader):
return SequenceDictionary.from_sam(header=header.to_dict()["SQ"])
@overload
def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ...

infos: List[SequenceMetadata] = [
SequenceMetadata.from_sam(meta=meta, index=index) for index, meta in enumerate(header)
]
@staticmethod
@overload
def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ...

return SequenceDictionary(infos=infos)
@staticmethod
def from_sam(
data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]],
) -> "SequenceDictionary":
"""Creates a `SequenceDictionary` from a SAM file or its header.

# TODO: mypyp doesn't like these
Args:
data: The input may be any of:
- a path to a SAM file
- an open `pysam.AlignmentFile`
- the `pysam.AlignmentHeader` associated with a `pysam.AlignmentFile`
- the contents of a header's `SQ` fields, as returned by `AlignmentHeader.to_dict()`
Returns:
A `SequenceDictionary` mapping refrence names to their metadata.
"""
seq_dict: SequenceDictionary
if isinstance(data, pysam.AlignmentHeader):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is begging for a structural pattern (class pattern) but I think our min python version is 3.9 :/

seq_dict = SequenceDictionary.from_sam(data.to_dict()["SQ"])
elif isinstance(data, pysam.AlignmentFile):
seq_dict = SequenceDictionary.from_sam(data.header.to_dict()["SQ"])
elif isinstance(data, Path):
with sam.reader(data) as fh:
seq_dict = SequenceDictionary.from_sam(fh.header)
else: # assuming `data` is a `list[dict[str, Any]]`
try:
infos: List[SequenceMetadata] = [
SequenceMetadata.from_sam(meta=meta, index=index)
for index, meta in enumerate(data)
]
seq_dict = SequenceDictionary(infos=infos)
except Exception as e:
raise ValueError(f"Could not parse sequence information from data: {data}") from e

Check warning on line 497 in fgpyo/fasta/sequence_dictionary.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/fasta/sequence_dictionary.py#L496-L497

Added lines #L496 - L497 were not covered by tests
Comment on lines +496 to +497
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Avoid including raw data in exception messages.

Including the entire data object in the error message may lead to verbose logs or expose sensitive information. Consider providing a more general error message.

Apply this diff to adjust the error message:

-except Exception as e:
-    raise ValueError(f"Could not parse sequence information from data: {data}") from e
+except Exception as e:
+    raise ValueError("Could not parse sequence information from provided data.") from e
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
except Exception as e:
raise ValueError(f"Could not parse sequence information from data: {data}") from e
except Exception as e:
raise ValueError("Could not parse sequence information from provided data.") from e


return seq_dict

# TODO: mypy doesn't like these
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FIXME

# @overload
# def __getitem__(self, key: str) -> SequenceMetadata: ...
#
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,9 @@ warn_unused_configs = true
warn_unused_ignores = true
enable_error_code = "ignore-without-code"
exclude = ["site/", "docs/"]

[tool.coverage.report]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

exclude_lines = [
"pragma: not covered",
"@overload"
]
12 changes: 7 additions & 5 deletions tests/fgpyo/fasta/test_sequence_dictionary.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
Expand All @@ -11,6 +12,8 @@
from fgpyo.fasta.sequence_dictionary import SequenceDictionary
from fgpyo.fasta.sequence_dictionary import SequenceMetadata
from fgpyo.fasta.sequence_dictionary import Topology
from fgpyo.sam import builder
from fgpyo.sam import reader


def test_alternate_locus_raises_start_gt_end() -> None:
Expand Down Expand Up @@ -315,10 +318,6 @@ def test_sequence_dictionary_same_as() -> None:
assert not this.same_as(that)


# to_sam
# from_sam


def test_sequence_dictionary_to_and_from_sam() -> None:
sd = SequenceDictionary(
infos=[
Expand All @@ -333,7 +332,10 @@ def test_sequence_dictionary_to_and_from_sam() -> None:
header = pysam.AlignmentHeader.from_dict(
header_dict={"HD": {"VN": "1.5"}, "SQ": mapping, "RG": [{"ID": "foo"}]}
)

samfile: Path = builder.SamBuilder(sd=mapping).to_path()
alignment: pysam.AlignmentFile = reader(samfile)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use with a with statement so it automatically closes.

assert SequenceDictionary.from_sam(samfile) == sd
assert SequenceDictionary.from_sam(alignment) == sd
assert SequenceDictionary.from_sam(mapping) == sd
assert SequenceDictionary.from_sam(header) == sd
assert sd.to_sam_header(extra_header={"RG": [{"ID": "foo"}]})
Expand Down
Loading