Skip to content

Commit

Permalink
fix: don't make assumptions about contig names
Browse files Browse the repository at this point in the history
  • Loading branch information
msto committed Apr 4, 2024
1 parent c5e80e2 commit 7bb0999
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 24 deletions.
44 changes: 29 additions & 15 deletions pybedlite/overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,24 @@
from pybedlite.bed_source import BedSource
from pybedlite.bed_record import BedRecord

UCSC_STRAND_REGEX = re.compile(r".*\((\+|-)\)$")
"""
Match a parenthetically enclosed strand at the end of a position-formatted interval.

Only the three-character suffixes "(+)" and "(-)" are recognised.

Groups:
    1: the strand ("+" or "-")
"""

UCSC_INTERVAL_REGEX = re.compile(r"^(.+):(\d+)-(\d+)$")
"""
Match a position-formatted interval (with any strand suffix already removed).

No assumptions are made about what a refname looks like beyond it being non-empty. The
refname group is greedy, so a refname that itself contains colons (e.g.
"HLA-DRB1*15:01:01:02") is split from the coordinates at the *last* colon.

Groups:
    1: the refname (or chromosome); any non-empty string
    2: the 1-based start position
    3: the 1-based closed end position
"""


@attr.s(frozen=True, auto_attribs=True)
class Interval:
Expand Down Expand Up @@ -127,10 +145,10 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval":
)

@classmethod
def from_ucsc_position(
def from_ucsc(
cls: Type["Interval"],
position: str,
name: str | None = None,
ucsc: str,
name: Optional[str] = None,
) -> "Interval":
"""
Construct an `Interval` from a UCSC "position"-formatted string.
Expand Down Expand Up @@ -164,25 +182,21 @@ def from_ucsc_position(
"""

# First, check to see if the strand is specified, and remove it from the string.
strand_re = re.compile(r".*\((\+|-)\)$")
strand_match = strand_re.match(position)

strand_match = UCSC_STRAND_REGEX.match(ucsc)
if strand_match is not None:
negative = strand_match.group(1) == "-"
position = position[:-3]
ucsc = ucsc[:-3]
else:
negative = False

# Then parse the location
position_re = re.compile(r"^(chr(\d+|X|Y|M|MT)(?:_[A-Za-z0-9]+_alt)?):(\d+)-(\d+)$")

match = position_re.match(position)
if match is None:
raise ValueError(f"Not a valid UCSC position-formatted string: {position}")
interval_match = UCSC_INTERVAL_REGEX.match(ucsc)
if interval_match is None:
raise ValueError(f"Not a valid UCSC position-formatted string: {ucsc}")

Check warning on line 195 in pybedlite/overlap_detector.py

View check run for this annotation

Codecov / codecov/patch

pybedlite/overlap_detector.py#L195

Added line #L195 was not covered by tests

refname = match.group(1)
start = int(match.group(3)) - 1
end = int(match.group(4))
refname = interval_match.group(1)
start = int(interval_match.group(2)) - 1
end = int(interval_match.group(3))

return cls(refname=refname, start=start, end=end, negative=negative, name=name)

Expand Down
24 changes: 15 additions & 9 deletions pybedlite/tests/test_overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pybedlite.overlap_detector import OverlapDetector
from pybedlite.bed_record import BedStrand
from pybedlite.bed_record import BedRecord
import pytest


def run_test(targets: List[Interval], query: Interval, results: List[Interval]) -> None:
Expand Down Expand Up @@ -190,20 +191,25 @@ def test_construction_from_interval(bed_records: List[BedRecord]) -> None:
assert new_record.strand is record.strand


def test_construction_from_ucsc_position() -> None:
def test_construction_from_ucsc() -> None:
"""
Test that we can convert a UCSC position-formatted string to an Interval.
"""

assert Interval.from_ucsc_position("chr1:101-200") == Interval("chr1", 100, 200)
assert Interval.from_ucsc_position("chr10_GL383545v1_alt:101-200") == Interval(
assert Interval.from_ucsc("chr1:101-200") == Interval("chr1", 100, 200)
assert Interval.from_ucsc("chr10_GL383545v1_alt:101-200") == Interval(
"chr10_GL383545v1_alt", 100, 200
)

# Check strand
assert Interval.from_ucsc_position("chr1:101-200(+)") == Interval(
"chr1", 100, 200, negative=False
)
assert Interval.from_ucsc_position("chr1:101-200(-)") == Interval(
"chr1", 100, 200, negative=True
)
assert Interval.from_ucsc("chr1:101-200(+)") == Interval("chr1", 100, 200, negative=False)
assert Interval.from_ucsc("chr1:101-200(-)") == Interval("chr1", 100, 200, negative=True)


@pytest.mark.parametrize("contig", ["chrUn_JTFH01001499v1_decoy", "HLA-DRB1*15:01:01:02"])
def test_construction_from_ucsc_other_contigs(contig: str) -> None:
    """
    Contig names outside the canonical human set (decoys, HLA alleles containing colons,
    custom names, etc.) must be accepted when building an Interval from a UCSC string.
    """

    # UCSC positions are 1-based closed, so "101-200" maps to the 0-based half-open [100, 200).
    expected = Interval(contig, 100, 200)
    parsed = Interval.from_ucsc(f"{contig}:101-200")

    assert parsed == expected

0 comments on commit 7bb0999

Please sign in to comment.