Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a function for determining read pair orientation #201

Merged
merged 13 commits into from
Dec 27, 2024
121 changes: 117 additions & 4 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@
import enum
import io
import sys
from collections.abc import Collection
from itertools import chain
from pathlib import Path
from typing import IO
Expand Down Expand Up @@ -607,6 +608,118 @@
return self.elements[index]


@enum.unique
class PairOrientation(enum.Enum):
"""Enumerations of read pair orientations."""

FR = "FR"
"""A pair orientation for forward-reverse reads ("innie")."""

RF = "RF"
"""A pair orientation for reverse-forward reads ("outie")."""

TANDEM = "TANDEM"
"""A pair orientation for tandem (forward-forward or reverse-reverse) reads."""

@classmethod
def from_recs( # noqa: C901 # `from_recs` is too complex (11 > 10)
clintval marked this conversation as resolved.
Show resolved Hide resolved
cls, rec1: AlignedSegment, rec2: Optional[AlignedSegment] = None
) -> Optional["PairOrientation"]:
"""Returns the pair orientation if both reads are mapped to the same reference sequence.

Args:
rec1: The first record in the pair.
rec2: The second record in the pair. If None, then mate info on `rec1` will be used.

See:
[`htsjdk.samtools.SamPairUtil.getPairOrientation()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L71-L102)
"""

if rec2 is None:
rec2_is_unmapped = rec1.mate_is_unmapped
rec2_reference_id = rec1.next_reference_id
else:
rec2_is_unmapped = rec2.is_unmapped
rec2_reference_id = rec2.reference_id

if rec1.is_unmapped or rec2_is_unmapped or rec1.reference_id != rec2_reference_id:
return None

if rec2 is None:
rec2_is_forward = rec1.mate_is_forward
rec2_reference_start = rec1.next_reference_start
else:
rec2_is_forward = rec2.is_forward
rec2_reference_start = rec2.reference_start

if rec1.is_forward is rec2_is_forward:
return PairOrientation.TANDEM
if rec1.is_forward and rec1.reference_start <= rec2_reference_start:
return PairOrientation.FR
if rec1.is_reverse and rec2_reference_start < rec1.reference_end:
return PairOrientation.FR
if rec1.is_reverse and rec2_reference_start >= rec1.reference_end:
return PairOrientation.RF

if rec2 is None:
if not rec1.has_tag("MC"):
raise ValueError('Cannot determine pair orientation without a mate cigar ("MC")!')
rec2_cigar = Cigar.from_cigarstring(str(rec1.get_tag("MC")))
rec2_reference_end = rec1.next_reference_start + rec2_cigar.length_on_target()
else:
rec2_reference_end = rec2.reference_end

if rec1.reference_start < rec2_reference_end:
return PairOrientation.FR
else:
return PairOrientation.RF


DefaultProperlyPairedOrientations: set[PairOrientation] = {PairOrientation.FR}
"""The default orientations for properly paired reads."""


def is_proper_pair(
rec1: AlignedSegment,
rec2: Optional[AlignedSegment] = None,
max_insert_size: int = 1000,
orientations: Collection[PairOrientation] = DefaultProperlyPairedOrientations,
) -> bool:
"""Determines if a pair of records are properly paired or not.

Criteria for records in a proper pair are:
- Both records are aligned
- Both records are aligned to the same reference sequence
- The pair orientation of the records is one of the valid pair orientations (default "FR")
- The inferred insert size is not more than a maximum length (default 1000)

Args:
rec1: The first record in the pair.
rec2: The second record in the pair. If None, then mate info on `rec1` will be used.
max_insert_size: The maximum insert size to consider a pair "proper".
orientations: The valid set of orientations to consider a pair "proper".

See:
[`htsjdk.samtools.SamPairUtil.isProperPair()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L106-L125)
"""
if rec2 is None:
rec2_is_mapped = rec1.mate_is_mapped
rec2_reference_id = rec1.next_reference_id
else:
rec2_is_mapped = rec2.is_mapped
rec2_reference_id = rec2.reference_id

return (
rec1.is_mapped
and rec2_is_mapped
and rec1.reference_id == rec2_reference_id
and PairOrientation.from_recs(rec1=rec1, rec2=rec2) in orientations
# TODO: consider replacing with `abs(isize(r1, r2)) <= max_insert_size`
# which can only be done if isize() is modified to allow for an optional R2.
and 0 < abs(rec1.template_length) <= max_insert_size
)


@attr.s(frozen=True, auto_attribs=True)
class SupplementaryAlignment:
"""Stores a supplementary alignment record produced by BWA and stored in the SA SAM tag.
Expand Down Expand Up @@ -707,9 +820,8 @@
r1: read 1
r2: read 2 with the same queryname as r1
"""
assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
assert r1.query_name == r2.query_name, "Attempting to pair reads with different qnames."
if r1.query_name != r2.query_name:
raise ValueError("Cannot set pair info on reads with different query names!")

Check warning on line 824 in fgpyo/sam/__init__.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sam/__init__.py#L824

Added line #L824 was not covered by tests
clintval marked this conversation as resolved.
Show resolved Hide resolved

for r in [r1, r2]:
r.is_paired = True
Expand All @@ -724,8 +836,9 @@
dest.next_reference_id = src.reference_id
dest.next_reference_start = src.reference_start
dest.mate_is_reverse = src.is_reverse
dest.mate_is_unmapped = False
dest.mate_is_unmapped = src.mate_is_unmapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)

insert_size = isize(r1, r2)
r1.template_length = insert_size
Expand Down
223 changes: 223 additions & 0 deletions tests/fgpyo/sam/test_sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from fgpyo.sam import CigarElement
from fgpyo.sam import CigarOp
from fgpyo.sam import CigarParsingException
from fgpyo.sam import PairOrientation
from fgpyo.sam import SamFileType
from fgpyo.sam import is_proper_pair
from fgpyo.sam.builder import SamBuilder


Expand Down Expand Up @@ -300,6 +302,227 @@ def test_is_clipping() -> None:
assert clips == [CigarOp.S, CigarOp.H]


def test_pair_orientation_build_with_r2() -> None:
"""Test that we can build all pair orientations with R1 and R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.RF
assert PairOrientation.from_recs(r1) is PairOrientation.RF
assert PairOrientation.from_recs(r2) is PairOrientation.RF

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r2) is PairOrientation.TANDEM

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM
assert PairOrientation.from_recs(r2) is PairOrientation.TANDEM


def test_pair_orientation_is_fr_if_opposite_directions_and_overlapping() -> None:
"""Test the pair orientation is always FR if the reads overlap and are oriented opposite."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR

builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR


def test_a_single_bp_alignment_at_end_of_rec_one_is_still_fr_orientations() -> None:
"""Test a single bp alignment at the end of a mate's alignment is still FR based on rec1."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=5, cigar1="5M", start2=5, cigar2="1M")
assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR
assert PairOrientation.from_recs(r1) is PairOrientation.FR
assert PairOrientation.from_recs(r2) is PairOrientation.FR


def test_pair_orientation_build_with_either_unmapped() -> None:
"""Test that we can return None with either R1 and R2 unmapped (or both)."""
builder = SamBuilder()
r1, r2 = builder.add_pair()
assert r1.is_unmapped
assert r2.is_unmapped
assert PairOrientation.from_recs(r1, r2) is None
assert PairOrientation.from_recs(r1) is None
assert PairOrientation.from_recs(r2) is None

r1, r2 = builder.add_pair(chrom="chr1", start1=100)
assert r1.is_mapped
assert r2.is_unmapped
assert PairOrientation.from_recs(r1, r2) is None
assert PairOrientation.from_recs(r1) is None
assert PairOrientation.from_recs(r2) is None

r1, r2 = builder.add_pair(chrom="chr1", start2=100)
assert r1.is_unmapped
assert r2.is_mapped
assert PairOrientation.from_recs(r1, r2) is None
assert PairOrientation.from_recs(r1) is None
assert PairOrientation.from_recs(r2) is None


def test_pair_orientation_build_with_no_r2_but_r2_mapped() -> None:
"""Test that we can build all pair orientations with R1 and no R2, but R2 is mapped."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert PairOrientation.from_recs(r1) is PairOrientation.FR

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1) is PairOrientation.RF

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert PairOrientation.from_recs(r1) is PairOrientation.TANDEM


def test_pair_orientation_build_raises_if_it_cant_find_mate_cigar_tag_positive_fr() -> None:
"""Test that an exception is raised if we cannot find the mate cigar tag."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=16, cigar1="10M", start2=15, cigar2="10M")
sam.set_pair_info(r1, r2)
r1.set_tag("MC", None) # Clear out the MC tag.
r2.set_tag("MC", None) # Clear out the MC tag.

assert PairOrientation.from_recs(r1, r2) is PairOrientation.FR

with pytest.raises(ValueError):
PairOrientation.from_recs(r1)

assert PairOrientation.from_recs(r2) is PairOrientation.FR


def test_pair_orientation_build_raises_if_it_cant_find_mate_cigar_tag_positive_rf() -> None:
"""Test that an exception is raised if we cannot find the mate cigar tag."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=16, cigar1="1M", start2=15, cigar2="1M")
sam.set_pair_info(r1, r2)

assert PairOrientation.from_recs(r1, r2) is PairOrientation.RF

r1.set_tag("MC", None) # Clear out the MC tag.
r2.set_tag("MC", None) # Clear out the MC tag.

with pytest.raises(ValueError):
PairOrientation.from_recs(r1)

assert PairOrientation.from_recs(r2) is PairOrientation.RF


def test_is_proper_pair_when_actually_proper() -> None:
"""Test that is_proper_pair returns True when reads are properly paired."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert is_proper_pair(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert is_proper_pair(r1, r2)


def test_is_proper_pair_when_actually_proper_and_no_r2() -> None:
"""Test that is_proper_pair returns True when reads are properly paired, but no R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert is_proper_pair(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert is_proper_pair(r1)


def test_not_is_proper_pair_if_wrong_orientation() -> None:
"""Test that reads are not properly paired if they are not in the right orientation."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)


def test_not_is_proper_pair_if_wrong_orientation_and_no_r2() -> None:
"""Test reads are not properly paired if they are not in the right orientation, but no R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1)


def test_not_is_proper_pair_if_too_far_apart() -> None:
"""Test that reads are not properly paired if they are too far apart."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=100 + 1000)
sam.set_pair_info(r1, r2)
assert not is_proper_pair(r1, r2)


def test_isize() -> None:
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
Expand Down
Loading