Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adopt cutesv error handling #320

Merged
merged 19 commits into from
Apr 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/mavis/breakpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,11 +310,12 @@ def column(self, colname: str):
return self.data.get(COLUMNS[colname])

def __str__(self):
return 'BPP({}, {}{}{})'.format(
return 'BPP({}, {}{}{}{})'.format(
str(self.break1),
str(self.break2),
', opposing={}'.format(self.opposing_strands) if not self.stranded else '',
', seq=' + repr(self.untemplated_seq) if self.untemplated_seq is not None else '',
', tracking_id=' + self.tracking_id if self.tracking_id else '',
)

def flatten(self):
Expand Down
128 changes: 106 additions & 22 deletions src/mavis/convert/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from ..interval import Interval
import pandas as pd

try:
Expand Down Expand Up @@ -37,6 +37,7 @@ class VcfInfoType(TypedDict, total=False):
CHR2: str
CIPOS: Tuple[int, int]
CIEND: Tuple[int, int]
CILEN: Tuple[int, int]
CT: str
END: Optional[int]
PRECISE: bool
Expand Down Expand Up @@ -122,6 +123,98 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]:
raise NotImplementedError('alt specification in unexpected format', alt)


def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int):
"""
Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields.

bp1_s = breakpoint1 start
bp1_e = breakpoint1 end
bp2_s = breakpoint2 start
bp2_e = breakpoint2 end

Insertion and deletion edge case - in which bp1_e > bp2_s
E.g bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900.
break1 ------------------------=======================--------------
break2 ------------------------==========---------------------------

Insertion edge case - in which bp1_e > bp1_s
E.g bp1_s = 1890, bp1_e = 1800, bp2_s = 1800, bp2_e = 1800.
break1 ------------------------==-----------------------------------
break2 ------------------------=------------------------------------

Insertion edge case - in which bp1_s > bp2_s
E.g bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000.
break1 ------------------------==-----------------------------------
break2 -----------------------========------------------------------
"""
is_intrachromosomal = std_row["break1_chromosome"] == std_row["break2_chromosome"]

if record.info.get(
'CILEN'
): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2
std_row.update(
{
COLUMNS.break1_position_start: max(
1, record.pos + record.info.get('CIPOS', (0, 0))[0]
),
COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1],
COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CILEN', (0, 0))[0]),
COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1],
}
)
else:
std_row.update(
{
COLUMNS.break1_position_start: max(
1, record.pos + record.info.get('CIPOS', (0, 0))[0]
),
COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1],
COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CIEND', (0, 0))[0]),
COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1],
}
)

if is_intrachromosomal and (
Interval.overlaps(
(std_row['break1_position_start'], std_row['break1_position_end']),
(std_row['break2_position_start'], std_row['break2_position_end']),
)
):
if std_row.get('event_type') != 'BND':
std_row['break2_position_start'] = max(
std_row['break1_position_start'], std_row['break2_position_start']
)
std_row['break1_position_end'] = min(
std_row['break1_position_end'], std_row['break2_position_end']
)
std_row['break1_position_start'] = min(
std_row['break1_position_start'], std_row['break2_position_end']
)
if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1:
# addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294
std_row.update({'break1_position_end': 1})
if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1:
std_row.update({'break2_position_end': 1})

if is_intrachromosomal and (
std_row['break2_position_start'] > std_row['break2_position_end']
or std_row['break1_position_start'] > std_row['break1_position_end']
):
if 'event_type' in std_row and std_row['event_type'] != 'BND':
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: FYI you could just do std_row.get('event_type') != 'BND' here if you wanted

raise ValueError(
f'Improper entry. One of the following breakpoints start is greater than breakpoint end: Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]} Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]} This call has been dropped.'
)

if (
None not in (record.pos, record.info.get('END'))
and record.pos > record.info.get('END')
and is_intrachromosomal
):
raise ValueError(
f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}). This call has been dropped.'
)


def convert_record(record: VcfRecordType) -> List[Dict]:
"""
converts a vcf record
Expand Down Expand Up @@ -181,6 +274,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]:
elif size < 0:
std_row[COLUMNS.event_type] = SVTYPE.DEL
std_row.update({COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2})

if 'SVTYPE' in info:
std_row[COLUMNS.event_type] = info['SVTYPE']

if info.get(
'PRECISE', False
): # DELLY CI only apply when split reads were not used to refine the breakpoint which is then flagged
Expand All @@ -193,24 +290,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]:
}
)
else:
std_row.update(
{
COLUMNS.break1_position_start: max(
1, record.pos + info.get('CIPOS', (0, 0))[0]
),
COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1],
COLUMNS.break2_position_start: max(1, end + info.get('CIEND', (0, 0))[0]),
COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1],
}
)
if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1:
# addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294
std_row.update({'break1_position_end': 1})
if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1:
std_row.update({'break2_position_end': 1})

if 'SVTYPE' in info:
std_row[COLUMNS.event_type] = info['SVTYPE']
convert_imprecise_breakend(std_row, record, end)

try:
orient1, orient2 = info['CT'].split('to')
Expand All @@ -220,7 +300,11 @@ def convert_record(record: VcfRecordType) -> List[Dict]:
except KeyError:
pass
std_row.update(
{k: v for k, v in info.items() if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CT'}}
{
k: v
for k, v in info.items()
if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CILEN', 'CT'}
}
)
records.append(std_row)
return records
Expand All @@ -238,7 +322,7 @@ def parse_info(info_field):

# convert info types
for key in info:
if key in {'CIPOS', 'CIEND'}:
if key in {'CIPOS', 'CIEND', 'CILEN'}:
ci_start, ci_end = info[key].split(',')
info[key] = (int(ci_start), int(ci_end))
elif key == 'END':
Expand Down Expand Up @@ -327,6 +411,6 @@ def convert_file(input_file: str) -> List[Dict]:
for variant_record in convert_pandas_rows_to_variants(data):
try:
rows.extend(convert_record(variant_record))
except NotImplementedError as err:
except (NotImplementedError, ValueError) as err:
logging.warning(str(err))
return rows
4 changes: 3 additions & 1 deletion src/mavis/summary/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,9 @@ def group_events(events):
bpp.break2.strand != new_bpp.break2.strand,
]
):
raise AssertionError('cannot group events differing on key elements', bpp, new_bpp)
raise AssertionError(
'cannot group events differing on key elements', str(bpp), str(new_bpp)
)

# Note: There are some attributes that shouldn't be lost if different, currently appending the information
# The evidence could be better off as a max instead of a join
Expand Down
115 changes: 114 additions & 1 deletion tests/test_mavis/convert/test_tools_vcf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from mavis.convert import SUPPORTED_TOOL, _convert_tool_row
from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf

Expand All @@ -10,7 +11,7 @@ def test_read_vcf():
assert df.shape[0] == 106


def test_convert_record():
def test_convert_telomeric_region():
variant_imprecise = VcfRecordType(
id='mock-BND-imprecise',
pos=0,
Expand Down Expand Up @@ -59,3 +60,115 @@ def test_convert_record():

assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'
assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'


TEST_POS = 1853407


@pytest.mark.parametrize(
'pos,break1_ci,break2_ci,break1,break2,ids',
[
[
TEST_POS,
(-30, 30),
(-65, 65),
(TEST_POS - 30, TEST_POS + 30),
(TEST_POS - 30, TEST_POS + 65),
'vcf-cuteSV.INS.breakpoint_2_start < breakpoint_1_start',
],
[
TEST_POS,
(-30, 99999),
(-10, 65),
(TEST_POS - 30, TEST_POS + 65),
(TEST_POS - 10, TEST_POS + 65),
'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end',
],
],
ids=[
'breakpoint_2_start < breakpoint_1_start',
'breakpoint_1_end > breakpoint_2_end',
],
)
def test_convert_intrachromosomal_imprecise_breakend(
pos, break1_ci, break2_ci, break1, break2, ids
):
variant_vcf = VcfRecordType(
id=ids,
pos=pos,
chrom='chr5',
alts=['AGG'],
ref='A',
info=VcfInfoType(
CHR2="chr5",
IMPRECISE=True,
SVMETHOD="cuteSV-1.0.12",
SVTYPE="INS",
CIPOS=break1_ci,
CILEN=break2_ci,
),
)
result = convert_record(variant_vcf)
assert len(result) == 1
variant = result[0]
assert variant.get('break1_position_start') == break1[0]
assert variant.get('break1_position_end') == break1[1]
assert variant.get('break2_position_start') == break2[0]
assert variant.get('break2_position_end') == break2[1]


@pytest.mark.parametrize(
'pos,break1_ci,break2_ci,break1,break2,ids',
[
[
TEST_POS,
(-30, 99999),
(70, 65),
(TEST_POS - 30, TEST_POS + 65),
(TEST_POS + 65, TEST_POS + 65),
'vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end',
],
],
ids=[
'breakpoint_2_start > breakpoint_2_end',
],
)
def test_error_on_convert_intrachromosomal_imprecise_breakend(
pos, break1_ci, break2_ci, break1, break2, ids
):
variant_vcf = VcfRecordType(
id=ids,
pos=pos,
chrom='chr5',
alts=['AGG'],
ref='A',
info=VcfInfoType(
CHR2="chr5",
IMPRECISE=True,
SVMETHOD="cuteSV-1.0.12",
SVTYPE="INS",
CIPOS=break1_ci,
CILEN=break2_ci,
),
)
with pytest.raises(ValueError):
convert_record(variant_vcf)


def test_convert_intrachromosomal_imprecise_breakend_no_ci():
# breakpoint_1_start > breakpoint_1_end
variant_cilen4 = VcfRecordType(
id='Sniffle.INS',
pos=11184,
chrom='chr2',
alts=['AGG'],
ref='N',
info=VcfInfoType(
CHR2="chr2",
IMPRECISE=True,
SVTYPE="INS",
END=11183,
),
)
with pytest.raises(ValueError):
convert_record(variant_cilen4)