Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate set_pair_info and _set_mate_info for set_mate_info #202

Merged
merged 20 commits into from
Dec 28, 2024
Merged
Changes from 9 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 172 additions & 21 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
@@ -162,6 +162,7 @@
from pathlib import Path
from typing import IO
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import Iterator
@@ -176,6 +177,7 @@
from pysam import AlignedSegment
from pysam import AlignmentFile as SamFile
from pysam import AlignmentHeader as SamHeader
from typing_extensions import deprecated

import fgpyo.io
from fgpyo.collections import PeekableIterator
@@ -265,7 +267,8 @@
assert file_type is not None, "Must specify file_type when writing to standard output"
path = sys.stdout
else:
file_type = file_type or SamFileType.from_path(path)
if file_type is None:
file_type = SamFileType.from_path(path)
path = str(path)
elif not isinstance(path, _IOClasses): # type: ignore[unreachable]
open_type = "reading" if open_for_reading else "writing"
@@ -606,6 +609,110 @@
return self.elements[index]


@enum.unique
class PairOrientation(enum.Enum):
"""Enumerations of read pair orientations."""

FR = "FR"
"""A pair orientation for forward-reverse reads ("innie")."""

RF = "RF"
"""A pair orientation for reverse-forward reads ("outie")."""

TANDEM = "TANDEM"
"""A pair orientation for tandem (forward-forward or reverse-reverse) reads."""

@classmethod
def build(
cls, r1: AlignedSegment, r2: Optional[AlignedSegment] = None
) -> Optional["PairOrientation"]:
"""Returns the pair orientation if both reads are mapped to the same reference sequence.

Args:
r1: The first read in the template.
r2: The second read in the template. If undefined, mate data set upon R1 will be used.

See:
[`htsjdk.samtools.SamPairUtil.getPairOrientation()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L71-L102)
"""

if r2 is None:
r2_is_unmapped = r1.mate_is_unmapped
r2_reference_id = r1.next_reference_id
else:
r2_is_unmapped = r2.is_unmapped
r2_reference_id = r2.reference_id

if r1.is_unmapped or r2_is_unmapped or r1.reference_id != r2_reference_id:
return None

if r2 is None:
if not r1.has_tag("MC"):
raise ValueError('Cannot determine proper pair status without R2\'s cigar ("MC")!')
r2_cigar = Cigar.from_cigarstring(str(r1.get_tag("MC")))
r2_is_forward = r1.mate_is_forward
r2_reference_start = r1.next_reference_start
r2_reference_end = r1.next_reference_start + r2_cigar.length_on_target()
else:
r2_is_forward = r2.is_forward
r2_reference_start = r2.reference_start
r2_reference_end = r2.reference_end

if r1.is_forward is r2_is_forward:
return PairOrientation.TANDEM
elif r1.is_forward and r1.reference_start < r2_reference_end:
return PairOrientation.FR
elif r1.is_reverse and r2_reference_start < r1.reference_end:
return PairOrientation.FR
else:
return PairOrientation.RF


DefaultProperlyPairedOrientations = {PairOrientation.FR}
"""The default orientations for properly paired reads."""


def is_proper_pair(
r1: AlignedSegment,
r2: Optional[AlignedSegment] = None,
max_insert_size: int = 1000,
orientations: set[PairOrientation] = DefaultProperlyPairedOrientations,
) -> bool:
"""Determines if a read pair is properly paired or not.

Criteria for reads in a proper pair are:
- Both reads are aligned
- Both reads are aligned to the same reference sequence
- The pair orientation of the reads is a part of the valid pair orientations (default "FR")
- The inferred insert size is not more than a maximum length (default 1000)

Args:
r1: The first read in the template.
r2: The second read in the template. If undefined, mate data set upon R1 will be used.
max_insert_size: The maximum insert size to consider a read pair "proper".
orientations: The valid set of orientations to consider a read pair "proper".

See:
[`htsjdk.samtools.SamPairUtil.isProperPair()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L106-L125)
"""
if r2 is None:
r2_is_mapped = r1.mate_is_mapped
r2_reference_id = r1.next_reference_id
else:
r2_is_mapped = r2.is_mapped
r2_reference_id = r2.reference_id

return (
r1.is_mapped
and r2_is_mapped
and r1.reference_id == r2_reference_id
and PairOrientation.build(r1, r2) in orientations
# TODO: consider replacing with `abs(isize(r1, r2)) <= max_insert_size`
# which can only be done if isize() is modified to allow for an optional R2.
and 0 < abs(r1.template_length) <= max_insert_size
)


@attr.s(frozen=True, auto_attribs=True)
class SupplementaryAlignment:
"""Stores a supplementary alignment record produced by BWA and stored in the SA SAM tag.
@@ -697,38 +804,82 @@
return r2_pos - r1_pos


def sum_of_base_qualities(rec: AlignedSegment, min_quality_score: int = 15) -> int:
"""Calculate the sum of base qualities score for an alignment record.

This function is useful for calculating the "mate score" as implemented in samtools fixmate.

Args:
rec: The alignment record to calculate the sum of base qualities from.
min_quality_score: The minimum base quality score to use for summation.

See:
[`calc_sum_of_base_qualities()`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L227-L238)
[`MD_MIN_QUALITY`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L42)
"""
score: int = sum(qual for qual in rec.query_qualities if qual >= min_quality_score)
return score


def set_mate_info(
r1: AlignedSegment,
r2: AlignedSegment,
is_proper_pair: Callable[[AlignedSegment, AlignedSegment], bool] = is_proper_pair,
) -> None:
"""Resets mate pair information between reads in a pair.

Args:
r1: Read 1 (first read in the template).
r2: Read 2 with the same query name as r1 (second read in the template).
is_proper_pair: A function that takes the two alignments and determines proper pair status.
"""
if r1.query_name != r2.query_name:
raise ValueError("Cannot set mate info on alignments with different query names!")

for src, dest in [(r1, r2), (r2, r1)]:
dest.next_reference_id = src.reference_id
dest.next_reference_name = src.reference_name
dest.next_reference_start = src.reference_start
dest.mate_is_forward = src.is_forward
dest.mate_is_mapped = src.is_mapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)

r1.set_tag("ms", sum_of_base_qualities(r2))
r2.set_tag("ms", sum_of_base_qualities(r1))

template_length = isize(r1, r2)
r1.template_length = template_length
r2.template_length = -template_length

proper_pair = is_proper_pair(r1, r2)
r1.is_proper_pair = proper_pair
r2.is_proper_pair = proper_pair


@deprecated("Use `set_mate_info()` instead. Deprecated as of fgpyo 0.8.0.")
def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = True) -> None:
"""Resets mate pair information between reads in a pair. Requires that both r1
and r2 are mapped. Can be handed reads that already have pairing flags setup or
independent R1 and R2 records that are currently flagged as SE reads.
"""Resets mate pair information between reads in a pair.

Requires that both r1 and r2 are mapped. Can be handed reads that already have pairing flags
setup or independent R1 and R2 records that are currently flagged as SE reads.

Args:
r1: read 1
r2: read 2 with the same queryname as r1
r1: Read 1 (first read in the template).
r2: Read 2 with the same query name as r1 (second read in the template).
proper_pair: whether the pair is proper or not.
"""
assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
assert r1.query_name == r2.query_name, "Attempting to pair reads with different qnames."
if r1.query_name != r2.query_name:
raise ValueError("Cannot pair reads with different query names!")

Check warning on line 873 in fgpyo/sam/__init__.py

Codecov / codecov/patch

fgpyo/sam/__init__.py#L873

Added line #L873 was not covered by tests

for r in [r1, r2]:
r.is_paired = True
r.is_proper_pair = proper_pair

r1.is_read1 = True
r1.is_read2 = False
r2.is_read2 = True
r2.is_read1 = False

for src, dest in [(r1, r2), (r2, r1)]:
dest.next_reference_id = src.reference_id
dest.next_reference_start = src.reference_start
dest.mate_is_reverse = src.is_reverse
dest.mate_is_unmapped = False
dest.set_tag("MC", src.cigarstring)

insert_size = isize(r1, r2)
r1.template_length = insert_size
r2.template_length = -insert_size
set_mate_info(r1=r1, r2=r2, is_proper_pair=is_proper_pair)

Check warning on line 882 in fgpyo/sam/__init__.py

Codecov / codecov/patch

fgpyo/sam/__init__.py#L882

Added line #L882 was not covered by tests


@attr.s(frozen=True, auto_attribs=True)
78 changes: 11 additions & 67 deletions fgpyo/sam/builder.py
Original file line number Diff line number Diff line change
@@ -264,70 +264,6 @@ def _set_length_dependent_fields(
if not rec.is_unmapped:
rec.cigarstring = cigar if cigar else f"{length}M"

def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.

Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.

Args:
r1: the first read in the pair
r2: the sceond read in the pair
"""
for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = sam.NO_REF_INDEX
rec.next_reference_id = sam.NO_REF_INDEX
rec.reference_start = sam.NO_REF_POS
rec.next_reference_start = sam.NO_REF_POS
rec.is_unmapped = True
rec.mate_is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True

def rg(self) -> Dict[str, Any]:
"""Returns the single read group that is defined in the header."""
# The `RG` field contains a list of read group mappings
@@ -444,8 +380,16 @@ def add_pair(
raise ValueError("Cannot use chrom in combination with chrom1 or chrom2")

chrom = sam.NO_REF_NAME if chrom is None else chrom
chrom1 = next(c for c in (chrom1, chrom) if c is not None)
chrom2 = next(c for c in (chrom2, chrom) if c is not None)

if start1 != sam.NO_REF_POS:
chrom1 = next(c for c in (chrom1, chrom) if c is not None)
else:
chrom1 = sam.NO_REF_NAME

if start2 != sam.NO_REF_POS:
chrom2 = next(c for c in (chrom2, chrom) if c is not None)
else:
chrom2 = sam.NO_REF_NAME

if chrom1 == sam.NO_REF_NAME and start1 != sam.NO_REF_POS:
raise ValueError("start1 cannot be used on its own - specify chrom or chrom1")
@@ -468,7 +412,7 @@ def add_pair(
)

# Sync up mate info and we're done!
self._set_mate_info(r1, r2)
sam.set_mate_info(r1, r2)
self._records.append(r1)
self._records.append(r2)
return r1, r2
Loading